scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18656B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Large Language Models in Software Documentation and Modeling: A Literature Review and Findings",
      6     "authors": [
      7       "Lukás Radoský",
      8       "Ivan Polasek"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2602.04938",
     13     "doi": null
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "Abstract claims that LLMs are 'great for processing various software documentation artifacts' and 'excel at understanding structured languages' but provides no independent evidence—only reviews claims from other papers without validation.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "As a review of existing work, the paper reports causal claims (e.g., 'outperform previous state-of-the-art') from cited papers but doesn't make novel causal claims of its own requiring study design justification.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Scope is bounded to 4 venues, 2024–2025, but the conclusion claims broad patterns ('evolution rather than revolution,' 'long-awaited revolution lagging behind hype') without acknowledging these findings apply only to a narrow 2-year window in 4 major venues.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper reports why papers use zero-shot prompting (cost, fairness) but doesn't discuss alternative explanations for observed trends—e.g., publication bias toward positive results, venue selection bias, or temporal confounding.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "Section 5 lists metrics (BLEU, ROUGE, accuracy, F1) but doesn't discuss whether these metrics measure what they claim—e.g., whether BLEU/ROUGE reflect actual summary quality or just surface similarity.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No dedicated limitations or threats-to-validity section. A single sentence in the conclusion mentions the 'revolution lagging behind hype' but does not address methodological limitations of the review itself.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No specific threats discussed: no mention of publication bias (major venues skew positive), selection bias (4-venue restriction), temporal bias (only 2024–2025 coverage), or screening inconsistency (subjective 'documentation and modeling' categorization).",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Methodology describes what was searched (4 venues, 2024–2025, keywords) but doesn't state scope boundaries—e.g., no discussion that findings apply only to these venues, exclude arXiv/workshops, or represent early-stage LLM4SE research.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Acknowledgment explicitly states 100% EU funding through the NextGenerationEU Recovery and Resilience Plan, project 'InnovAIte Slovakia' No. 09I02-03-V01-00029.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors affiliated with Department of Applied Informatics, Comenius University Bratislava. No disclosed affiliation with evaluated software/products.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "EU funding program for AI-driven breakthroughs is independent of the specific papers reviewed, though it may create incentive to promote AI enthusiasm broadly.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement or declaration of patents, equity, consulting relationships provided.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Key terms 'large language models,' 'software documentation,' and 'software modeling' are used without formal definition. Paper broadly includes code summarization as 'documentation' without justifying this classification.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Contribution is stated: analyze 57 papers from 4 major venues on LLM usage for documentation and modeling tasks, organized by task and summarizing prompt techniques, metrics, evaluation approaches, and datasets.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "Section 2 lists 20+ prior SLRs on LLM4SE but doesn't explain how this review differs—e.g., prior work already covers broad scope (all SDLC phases) or narrow topics (requirements, testing). Positioning is unclear.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "survey": {
    117       "search_and_selection": {
    118         "search_strategy_reproducible": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Search strategy is described: manually searched 4 venues (TSE, TOSEM, EMSE, ICSE) for 2024–2025 publications using keywords 'LLM, language model, GPT, BERT, and variations.' Could be reproduced.",
    122           "source": "haiku"
    123         },
    124         "inclusion_exclusion_explicit": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "Inclusion criteria (4 venues, 2024–2025, LLM-related, documentation/modeling) are described informally but not stated as formal inclusion/exclusion criteria. 'Documentation and modeling' is subjectively applied—code summarization included but decision rationale unclear.",
    128           "source": "haiku"
    129         },
    130         "prisma_or_structured_protocol": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No mention of PRISMA, AMSTAR, or structured review protocol. Methodology is informal: 'brief inspection,' 'manual reading,' 'full-text search when in doubt'—no systematic screening process documented.",
    134           "source": "haiku"
    135         },
    136         "search_terms_provided": {
    137           "applies": true,
    138           "answer": true,
    139           "justification": "Keywords provided: 'LLM, language model, GPT, BERT, and some variations.' Explicitly states heuristic to avoid omitting papers without 'LLM' in title.",
    140           "source": "haiku"
    141         },
    142         "databases_listed": {
    143           "applies": true,
    144           "answer": true,
    145           "justification": "Four venues explicitly listed: IEEE Transactions on Software Engineering (TSE), ACM Transactions on Software Engineering and Methodology (TOSEM), Springer Empirical Software Engineering (EMSE), International Conference on Software Engineering (ICSE).",
    146           "source": "haiku"
    147         },
    148         "screening_process_documented": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Three-phase process described (venue identification, keyword search, manual reading) but no screening counts provided—how many papers were initially retrieved? How many excluded at each stage? Flow chart absent.",
    152           "source": "haiku"
    153         },
    154         "review_scope_justified": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "Scope is described (4 major venues, 2024–2025 'due to rapid advances') but not justified: why only 2 years? Why exclude other venues, workshops, or arXiv? Why these 4 venues specifically? No justification provided.",
    158           "source": "haiku"
    159         }
    160       },
    161       "synthesis_quality": {
    162         "conflicting_findings_acknowledged": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "Papers are organized by task (commit messages, code summarization, etc.) but conflicting findings are not discussed—e.g., different prompt techniques or model comparisons within the same task are not synthesized or reconciled.",
    166           "source": "haiku"
    167         },
    168         "quality_assessment_of_sources": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No quality rubric, risk-of-bias assessment, or methodological evaluation applied to the 57 reviewed papers. All papers treated equally regardless of study design, rigor, or sample size.",
    172           "source": "haiku"
    173         },
    174         "publication_bias_discussed": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "No acknowledgment of publication bias. Review focuses on 'major venues' (IEEE, ACM, Springer) which publish primarily positive/successful results. Negative findings and null results are not discussed as missing.",
    178           "source": "haiku"
    179         },
    180         "quantitative_synthesis_present": {
    181           "applies": true,
    182           "answer": false,
    183           "justification": "Figure 1 provides paper counts per task category (descriptive only). Section 5 lists metric types and evaluator counts but no meta-analysis, aggregated statistics, or quantitative synthesis across papers.",
    184           "source": "haiku"
    185         },
    186         "recommendations_supported_by_evidence": {
    187           "applies": true,
    188           "answer": false,
    189           "justification": "Conclusion claims LLMs bring 'evolution rather than revolution' and 'might be lagging behind hype' but these recommendations are author opinion, not strongly supported by quantitative evidence or systematic analysis of the reviewed papers.",
    190           "source": "haiku"
    191         }
    192       }
    193     }
    194   },
    195   "claims": [
    196     {
    197       "claim": "Large language models are great for processing various software documentation artifacts and excel at understanding structured languages.",
    198       "evidence": "Abstract assertion, not independently verified by the paper.",
    199       "supported": "unsupported"
    200     },
    201     {
    202       "claim": "Most papers use zero-shot prompting, either to mitigate prompt influence on model comparison or to mimic practical usage.",
    203       "evidence": "Sections 4.1–4.10 consistently show zero-shot prompting as dominant; few-shot used as baseline in comparisons.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "LLM-based approaches outperform previous state-of-the-art approaches even with simple zero-shot prompts.",
    208       "evidence": "Cited from [1, 2, 3] but not independently validated in this review.",
    209       "supported": "weak"
    210     },
    211     {
    212       "claim": "The MCMD dataset is the most popular for commit message generation.",
    213       "evidence": "Section 4.1 cites [42, 1, 39, 41] using MCMD; claim applies only to reviewed papers.",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "Novel approaches are often used to improve existing tasks rather than redefine software engineering workflows.",
    218       "evidence": "Conclusion: 'LLMs are being used to improve the speed and quality of existing workflow...instead of redefining them.' Not quantified.",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "Human-based evaluation typically involves 2–42 evaluators, either authors, experts, or students.",
    223       "evidence": "Section 5 documents evaluator counts across papers [12, 1, 40, 41, 2, 8, 46, 74, 75], [66, 58], [63, 76].",
    224       "supported": "strong"
    225     }
    226   ],
    227   "methodology_tags": [
    228     "meta-analysis",
    229     "qualitative"
    230   ],
    231   "key_findings": "The review analyzes 57 papers from four major venues (2024–2025) on LLM usage for software documentation and modeling tasks. Most papers employ zero-shot prompting and standard metrics (BLEU, ROUGE, accuracy, F1). Code summarization, commit message generation, and StackOverflow tag generation are prevalent tasks. Human evaluation commonly uses 2–42 evaluators. The authors conclude that LLMs are improving existing software engineering workflows through evolution rather than fundamental transformation.",
    232   "red_flags": [
    233     {
    234       "flag": "No systematic review protocol",
    235       "detail": "No PRISMA checklist, no documented screening process with exclusion counts, no risk-of-bias tool applied to reviewed papers."
    236     },
    237     {
    238       "flag": "Narrow and unjustified scope",
    239       "detail": "Limited to 4 journals/conferences, only 2024–2025 (2-year window), with 57 papers total. No justification for venue or temporal selection; excludes workshops, arXiv, other fields."
    240     },
    241     {
    242       "flag": "No quality assessment of reviewed papers",
    243       "detail": "All 57 papers treated equally. No methodological quality rubric, no study design classification, no exclusion of low-rigor papers."
    244     },
    245     {
    246       "flag": "Publication bias not acknowledged",
    247       "detail": "Focus on major venues (IEEE, ACM, Springer) inherently biases toward positive/successful results. No discussion of missing negative findings or null results."
    248     },
    249     {
    250       "flag": "Subjective inclusion criteria",
    251       "detail": "'Documentation and modeling' applied subjectively—paper includes code summarization as documentation without clear rationale; inclusion/exclusion decisions not transparent."
    252     },
    253     {
    254       "flag": "No limitations section",
    255       "detail": "Critical for SLR rigor. Paper does not discuss scope limitations, temporal boundaries, venue bias, or threats to the validity of its conclusions."
    256     },
    257     {
    258       "flag": "Unsupported abstract claims",
    259       "detail": "Abstract asserts LLMs are 'great' and 'excel' without providing evidence; these are broad claims not validated by the review's own analysis."
    260     },
    261     {
    262       "flag": "Weak engagement with prior work",
    263       "detail": "Section 2 lists 20+ existing SLRs but doesn't explain how this review differs, provides novel insights, or builds on prior work. Positioning is unclear."
    264     },
    265     {
    266       "flag": "Weak quantitative synthesis",
    267       "detail": "Only descriptive organization by task; no meta-analysis, statistical aggregation, or quantitative comparison of techniques/metrics across papers."
    268     },
    269     {
    270       "flag": "No supplementary materials",
    271       "detail": "No list of 57 reviewed papers, no search results spreadsheet, no detailed screening data—limits reproducibility and transparency."
    272     }
    273   ],
    274   "cited_papers": [
    275     {
    276       "title": "Large language models for software engineering: A systematic literature review",
    277       "authors": "Xinyi Hou et al.",
    278       "year": 2024,
    279       "venue": "ACM Transactions on Software Engineering and Methodology",
    280       "relevance": "Foundational prior SLR covering all SDLC phases; direct comparison point for this narrower review."
    281     },
    282     {
    283       "title": "Analysing the role of generative AI in software engineering - results from an MLR",
    284       "authors": "Tuomas Bazzan et al.",
    285       "year": 2024,
    286       "venue": "Systems, Software and Services Process Improvement",
    287       "relevance": "Prior MLR on GenAI in SE; establishes context for LLM4SE reviews."
    288     },
    289     {
    290       "title": "A survey on large language models for software engineering",
    291       "authors": "Quanjun Zhang et al.",
    292       "year": 2024,
    293       "venue": "arXiv",
    294       "relevance": "Concurrent SLR on LLM4SE; likely has broader scope or different venue selection."
    295     },
    296     {
    297       "title": "LLM-based multi-agent systems for software engineering: Literature review, vision, and the road ahead",
    298       "authors": "Junda He, Christoph Treude, and David Lo",
    299       "year": 2025,
    300       "venue": "ACM Transactions on Software Engineering and Methodology",
    301       "relevance": "Focused SLR on LLM-based multi-agent systems; complements this review."
    302     },
    303     {
    304       "title": "Software development life cycle perspective: A survey of benchmarks for code large language models and agents",
    305       "authors": "Kaixing Wang et al.",
    306       "year": 2025,
    307       "venue": "arXiv",
    308       "relevance": "SLR on benchmarks and SDLC perspective; relevant for metrics and dataset analysis."
    309     },
    310     {
    311       "title": "A survey on large language models for code generation",
    312       "authors": "Juyong Jiang, Fan Wang, Jiasi Shen, Sungju Kim, and Sunghun Kim",
    313       "year": 2025,
    314       "venue": "ACM Transactions on Software Engineering and Methodology",
    315       "relevance": "Focused SLR on code generation; overlaps with implementation tasks in broader LLM4SE space."
    316     }
    317   ],
    318   "engagement_factors": {
    319     "practical_relevance": {
    320       "score": 2,
    321       "justification": "Review identifies practical tasks (code summarization, commit messages) and tools, but provides limited guidance on which approaches work best or how to select methods; practitioners must read original papers."
    322     },
    323     "surprise_contrarian": {
    324       "score": 0,
    325       "justification": "Conclusion that LLMs bring 'evolution not revolution' is somewhat contrarian to hype, but weakly supported; mostly aligns with incremental improvement narrative."
    326     },
    327     "fear_safety": {
    328       "score": 0,
    329       "justification": "No safety, alignment, or AI risk concerns raised; purely focused on technical task performance."
    330     },
    331     "drama_conflict": {
    332       "score": 0,
    333       "justification": "No controversy, debate, or conflict narrative; straightforward technical review."
    334     },
    335     "demo_ability": {
    336       "score": 1,
    337       "justification": "Review summarizes tools and approaches that could be demoed (code summarization, commit generators), but the paper itself provides no executable demo or artifact."
    338     },
    339     "brand_recognition": {
    340       "score": 2,
    341       "justification": "Reviews major venues (IEEE, ACM, Springer) and well-known models (GPT-4, BERT, CodeT5); authors from Comenius University Bratislava (regional, not top-tier brand)."
    342     }
    343   },
    344   "hn_data": {
    345     "threads": [],
    346     "top_points": 0,
    347     "total_points": 0,
    348     "total_comments": 0
    349   }
    350 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs