scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20902B)
      1 {
      2   "paper": {
      3     "title": "Large Language Models in Software Documentation and Modeling: A Literature Review and Findings",
      4     "authors": ["Lukáš Radoský", "Ivan Polášek"],
      5     "year": 2026,
      6     "venue": "SAMI 2026 (IEEE 24th World Symposium on Applied Machine Intelligence and Informatics)",
      7     "arxiv_id": "2602.04938"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["survey_methodology"],
     11   "methodology_tags": ["meta-analysis"],
     12   "key_findings": "Literature review of 57 papers from TSE, TOSEM, EMSE, and ICSE (2024-2025) on LLMs for software documentation and modeling. Papers span 11 task categories, with code summarization (12 papers) and commit message generation (7 papers) most studied. Zero-shot prompting dominates; advanced techniques like chain-of-thought and multi-agent systems receive little attention. The authors conclude LLMs have led to evolutionary improvements rather than revolutionary changes in SE documentation and modeling workflows.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code, analysis scripts, or repository link provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The list of 57 analyzed papers is not released as structured data. Only references are provided in standard bibliography format."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment or tool specifications provided. A survey could release analysis tools with environment details, but none are given."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No reproduction instructions. Section 3 describes the methodology at a high level but does not provide step-by-step reproducible instructions (e.g., exact search queries, screening forms)."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "Survey paper with no experiments or statistical analysis of its own."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "Survey paper makes no comparative statistical claims requiring tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No experiments conducted; survey summarizes existing literature descriptively."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No experiments; the 'sample' is the set of papers found via literature search, not an experimental sample."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No experimental runs to report variance across."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Section 2 lists related SLRs but does not formally compare this survey's coverage, methodology, or findings against them. The comparison is descriptive (what topics others cover) without structured comparison of results."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No formal baseline comparison is conducted against prior surveys, so contemporariness cannot be assessed."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "Survey has no components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "Survey does not evaluate a system using metrics."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is irrelevant to a literature review's claims."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No test set applicable to a survey paper."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Figure 1 shows paper counts per task category (11 categories). Sections 4.1-4.11 provide per-category analysis of approaches, datasets, and prompting techniques."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "The paper does not discuss failure cases or where the reviewed approaches break down. It mostly summarizes what each paper proposes without critical analysis of limitations."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 6 notes that 'the long-awaited revolution stemming from the introduction of LLMs might be lagging behind its hype' and that LLMs lead to 'evolution rather than revolution.' Section 4.4 notes that zero-shot LLMs were outperformed by BERT-based models for emotion detection [51]."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims a literature review analyzing articles from four major venues, organized by task, with an overview of prompt techniques, metrics, and datasets. Sections 3-5 deliver on each of these claims."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper makes no causal claims. All findings are descriptive summaries of the literature."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 3 explicitly bounds scope to four venues (TSE, TOSEM, EMSE, ICSE), publications from 2024-2025, and tasks related to software documentation and modeling. The title could be read more broadly but the methodology section clearly states the boundaries."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper presents observations (e.g., zero-shot dominance, evolution over revolution) without discussing alternative explanations. For instance, the dominance of zero-shot prompting could be due to API costs, simplicity bias, or publication norms — none of which are explored."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper claims LLMs lead to 'evolution rather than revolution' based on counting and categorizing 57 papers, but does not discuss what would constitute 'revolution' vs 'evolution' or whether paper counts are an adequate proxy for assessing the transformative impact of LLMs on SE."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "Survey does not use any models."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "Survey does not use prompting."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "Survey does not use models with hyperparameters."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding used."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "Section 3 describes three phases (venue identification, keyword search, manual reading) with filtering criteria (keywords: LLM, GPT, BERT, etc.) but provides no paper counts at each filtering stage. The reader cannot determine how many papers were screened at each phase to arrive at 57."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No dedicated limitations or threats-to-validity section. The conclusion (Section 6) briefly notes future directions but does not discuss limitations of the review itself."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity discussed. Key unaddressed threats include: venue selection bias (only 4 venues), temporal limitation (2024-2025 only), keyword-based filtering potentially missing relevant papers, and manual screening subjectivity."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 3 explicitly states scope: four venues (TSE, TOSEM, EMSE, ICSE), 2024-2025 publications, and tasks related to software documentation and modeling (broadly defined to include code summarization)."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No structured dataset of the 57 analyzed papers with their categorizations, extracted attributes, or coding scheme is provided."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3 describes the three-phase collection process: venue identification, keyword-based search (LLM, language model, GPT, BERT), and manual abstract/full-text review for relevance."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data source is published papers from standard SE venues."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "While Section 3 describes three filtering phases with criteria, no counts are provided at each stage (e.g., total papers in venues → after keyword filtering → after manual review → 57 final). The pipeline lacks quantitative traceability."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Acknowledgment section states: 'This research and paper was 100% funded by the EU NextGenerationEU through the Recovery and Resilience Plan for Slovakia under the project \"InnovAIte Slovakia\"' with project number."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Both authors are clearly listed as affiliated with the Department of Applied Informatics, Faculty of Mathematics, Physics and Informatics, Comenius University Bratislava, with ORCID IDs."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "EU NextGenerationEU funding through Slovakia's Recovery and Resilience Plan. The funder has no commercial stake in the survey's findings about LLM usage in SE."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement found in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Survey paper — does not evaluate any pre-trained model on a benchmark."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Survey paper — no model evaluation."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey paper — no model evaluation on benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this survey."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Survey paper — no method with inference cost."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Survey paper — no computational experiments."
    289       }
    290     },
    291     "survey_methodology": {
    292       "prisma_or_structured_protocol": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No PRISMA flow diagram, no registered protocol, no reproducible search queries. Section 3 describes a 3-phase process (venue selection, keyword search, manual reading) but lacks the rigor of a structured review protocol — no search strings, no database queries, no screening counts."
    296       },
    297       "quality_assessment_of_sources": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No quality scoring, risk-of-bias assessment, or structured evaluation of included studies. All 57 papers are treated equally regardless of their methodological quality."
    301       },
    302       "publication_bias_discussed": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No discussion of publication bias. The paper notes that 'most papers report improvement over state-of-the-art approaches, often significant' (Section 6) but does not consider whether this reflects publication bias toward positive results."
    306       }
    307     }
    308   },
    309   "claims": [
    310     {
    311       "claim": "Code summarization is the most studied documentation/modeling task for LLMs, with 12 papers in the analyzed corpus.",
    312       "evidence": "Figure 1 shows paper counts per category. Code summarization leads with 12 papers, followed by commit message generation and code commenting/reviewing/logging at 7 each.",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "Zero-shot prompting is the dominant approach in LLM4SE documentation and modeling research, with few-shot and chain-of-thought receiving less attention.",
    317       "evidence": "Sections 4.1-4.11 repeatedly note zero-shot as the default choice across task categories. Section 6 states: 'The popular approaches involve promptless models and zero-shot prompting; less attention is given to more advanced techniques such as few-shot prompting and chain-of-thought.'",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "LLMs have led to evolution rather than revolution in SE documentation and modeling tasks.",
    322       "evidence": "Section 6 states: 'the LLMs are being used to improve the speed and quality of existing workflow and task structure in software engineering instead of redefining them. At this point in time, their incorporation seems to lead to evolution rather than revolution.'",
    323       "supported": "weak"
    324     },
    325     {
    326       "claim": "LLM-driven multi-agent systems have received little attention in the documentation and modeling domain.",
    327       "evidence": "Section 6 notes: 'the emerging area of LLM-driven multi-agent systems has been given little attention, several papers mention them as a possible future research direction.' Only one paper [84] uses a MAS approach among the 57 analyzed.",
    328       "supported": "moderate"
    329     }
    330   ],
    331   "red_flags": [
    332     {
    333       "flag": "No quality assessment of sources",
    334       "detail": "The survey treats all 57 papers equally without assessing their methodological quality, risk of bias, or strength of evidence. This risks laundering weak or poorly designed studies alongside rigorous ones."
    335     },
    336     {
    337       "flag": "No PRISMA or structured protocol",
    338       "detail": "The paper selection is described informally in Section 3 with no PRISMA flow diagram, no registered protocol, and no paper counts at filtering stages. The screening process is not reproducible."
    339     },
    340     {
    341       "flag": "Very shallow synthesis",
    342       "detail": "The paper reads primarily as a categorized bibliography — listing what each paper does — rather than providing critical synthesis, cross-paper comparison, or identification of systematic gaps. Most section content is descriptive summation of individual papers."
    343     },
    344     {
    345       "flag": "Sweeping conclusion unsupported by evidence",
    346       "detail": "The 'evolution rather than revolution' conclusion in Section 6 is presented as a finding but is really an editorial opinion. No framework is provided for what would constitute 'revolution' vs 'evolution,' making the claim unfalsifiable."
    347     },
    348     {
    349       "flag": "No limitations section",
    350       "detail": "A survey of this type should discuss venue selection bias (only 4 venues), temporal limits, keyword-based filtering gaps, and the subjectivity of manual screening. None of these threats to validity are addressed."
    351     }
    352   ],
    353   "cited_papers": [
    354     {
    355       "title": "Large language models for software engineering: A systematic literature review",
    356       "authors": ["Xinyi Hou"],
    357       "year": 2024,
    358       "relevance": "Comprehensive SLR on LLM4SE across all SDLC phases — a key prior survey to position against."
    359     },
    360     {
    361       "title": "A survey on large language models for code generation",
    362       "authors": ["Juyong Jiang", "Fan Wang", "Jiasi Shen", "Sungju Kim", "Sunghun Kim"],
    363       "year": 2025,
    364       "relevance": "Survey focused on LLM code generation, directly relevant to the code-related tasks reviewed."
    365     },
    366     {
    367       "title": "A survey on large language models for software engineering",
    368       "authors": ["Quanjun Zhang"],
    369       "year": 2024,
    370       "relevance": "Another broad LLM4SE survey providing complementary coverage."
    371     },
    372     {
    373       "title": "The impact of LLM-assistants on software developer productivity: A systematic literature review",
    374       "authors": ["Amr Mohamed", "Maram Assi", "Mariam Guizani"],
    375       "year": 2025,
    376       "relevance": "SLR on LLM impact on developer productivity — relevant to the productivity claims in the AI programming literature."
    377     },
    378     {
    379       "title": "LLM-based multi-agent systems for software engineering: Literature review, vision, and the road ahead",
    380       "authors": ["Junda He", "Christoph Treude", "David Lo"],
    381       "year": 2025,
    382       "relevance": "Survey on LLM multi-agent systems for SE — directly relevant to the agentic workflow dimension."
    383     },
    384     {
    385       "title": "Systematic literature review of prompt engineering patterns in software engineering",
    386       "authors": ["Yuya Sasaki", "Hironori Washizaki", "Jialong Li", "Dominik Sander", "Nobukazu Yoshioka", "Yoshiaki Fukazawa"],
    387       "year": 2024,
    388       "relevance": "SLR on prompt engineering patterns in SE — directly relevant to the prompting technique analysis."
    389     },
    390     {
    391       "title": "How much space do metrics have in GenAI assisted software development?",
    392       "authors": ["Samarth Sikand"],
    393       "year": 2024,
    394       "relevance": "Study on metrics in GenAI-assisted SE, relevant to evaluating how the field measures LLM impact."
    395     },
    396     {
    397       "title": "Software development life cycle perspective: A survey of benchmarks for code large language models and agents",
    398       "authors": ["Kaixing Wang"],
    399       "year": 2025,
    400       "relevance": "Survey of benchmarks for code LLMs and agents across the SDLC."
    401     },
    402     {
    403       "title": "Novice developers' perspectives on adopting LLMs for software development: A systematic literature review",
    404       "authors": ["Samuel Ferino", "Rashina Hoda", "John Grundy", "Christoph Treude"],
    405       "year": 2025,
    406       "relevance": "SLR on novice developer adoption of LLMs — relevant to the human factors dimension of LLM4SE."
    407     },
    408     {
    409       "title": "From LLMs to LLM-based agents for software engineering: A survey of current, challenges and future",
    410       "authors": ["Haolin Jin"],
    411       "year": 2025,
    412       "relevance": "Survey comparing standalone LLMs vs LLM-based agents for SE tasks."
    413     }
    414   ]
    415 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs