scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23035B)
      1 {
      2   "paper": {
      3     "title": "Large Language Models for Requirements Engineering: A Systematic Literature Review",
      4     "authors": ["Mohammad Amin Zadenoori", "Jacek Dąbrowski", "Waad Alhoshan", "Liping Zhao", "Alessio Ferrari"],
      5     "year": 2025,
      6     "venue": "arXiv (preprint submitted to Elsevier)",
      7     "arxiv_id": "2509.11446"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["survey_methodology"],
     11   "methodology_tags": ["meta-analysis"],
     12   "key_findings": "This SLR of 74 papers (2023-2024) on LLMs for Requirements Engineering finds a 136% growth from 2023 to 2024, with research shifting from traditional NLP4RE tasks (defect detection, classification) toward cognitively demanding tasks like elicitation and validation (20% each). GPT-family models dominate (77%), and Zero-shot (38%) and Few-shot (26%) are the most common prompting strategies, while RAG (7%) and interactive prompting (4%) remain underexplored. Evaluation relies heavily on laboratory experiments (76%) with limited real-world validation.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "The authors state 'The spreadsheets resulting from our data extraction and grouping, along with all synthesized data, including generated statistics and classifications, are publicly available in the supplementary material of this survey [13]' with a Zenodo link (doi:10.5281/zenodo.17068810)."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Data extraction forms, classification spreadsheets, and synthesized data are shared via Zenodo [13]."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment or dependency specifications are provided. Although this is a survey paper, its analysis scripts (if any) have no documented environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided for replicating the search, screening, or data extraction process beyond what is described in the methodology section."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "This is a survey paper that reports descriptive statistics (counts, percentages) of the literature. No experimental results requiring confidence intervals."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "Survey paper reporting descriptive statistics only; no comparative claims requiring significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "Survey paper with no experimental comparisons requiring effect sizes."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "Survey paper; sample size is determined by the systematic search process, not statistical requirements."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "Survey paper with no experimental runs to report variance across."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The survey explicitly compares its findings against the prior NLP4RE survey by Zhao et al. (2021) covering 404 studies from 1983-2019, noting shifts in task focus, artifact types, and phase distribution."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Zhao et al. (2021) is the most relevant prior survey in NLP4RE. The paper also discusses related SLRs by Hou et al. (2024), Wang et al. (2024), and others in Section 3."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "Survey paper with no system components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "Survey paper; no system performance metrics to evaluate."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Survey paper; no system outputs to evaluate with human judges."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Survey paper; no train/test split concept applies."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down extensively by RE task (Table 5), RE phase (Table 6), input artifact (Table 7), output artifact (Table 8), prompting strategy (Table 10), LLM type (Table 14), evaluation method (Tables 16-17), and more."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper identifies underexplored areas (Requirements Retrieval, Terminology Extraction at 1% each), gaps in reproducibility (only 16% share datasets), lack of field studies (0% field experiments), and over-reliance on GPT models and laboratory experiments."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports negative findings: 39% of studies provide no supplementary materials, 32% don't specify prompt selection strategy, 0% use field experiments or sample studies, and only 16% share datasets."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract's claims about 74 studies and 136% growth, the dominance of Zero-shot and Few-shot prompting, and the underexploration of RAG and Interactive prompting are all supported by tables in Section 5. However, the percentages in the abstract differ slightly from those in the body: Zero-shot 44% vs 38%, Few-shot 29% vs 26%, RAG 6% vs 7%, and Interactive 5% vs 4% (abstract vs body). These discrepancies appear to be due to multi-label counting differences."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes causal-style claims like 'LLMs are not merely automating traditional NLP4RE tasks but are enabling research to tackle tasks that were previously considered difficult for machines' and 'RE researchers, like the general public, engaged with LLMs once easy-to-use interfaces lowered technical barriers.' These causal attributions are not tested — the paper only observes correlations in publication patterns."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper explicitly bounds to 74 studies from 2023-2024, acknowledges the time boundary of its search (September/December 2024), and limits claims to the reviewed corpus. Section 7 discusses threats to validity."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper does not discuss alternative explanations for its findings. For example, the shift from NLP4RE tasks to LLM4RE tasks could be due to publication venue selection bias, researcher incentives for novelty, or the specific search string used — none of these are explored."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper uses publication counts and categorization frequencies as proxies for research trends and community interest but does not discuss limitations of this proxy (e.g., publication counts don't reflect research quality or actual adoption)."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "Survey paper that does not use any LLM or model for its own analysis."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "Survey paper that does not use prompting."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "Survey paper with no model-based experiments."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding used."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Figure 2 shows the full search pipeline with counts at each stage: 244 from primary search → 136 after screening → 62 after criteria; 940 from secondary → 887 after dedup → 120 after screening → 12 after criteria. Table 2 provides specific inclusion/exclusion criteria. The search string is provided verbatim."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 7 'Threats to Validity' provides a dedicated discussion of study selection validity, data validity, and research validity."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 7 discusses specific threats: single database (Scopus), data extraction performed by first author only without systematic cross-checking, subjectivity of classification schemes, and plans for future cross-validation. The paper also includes a disclaimer that 'No systematic cross-checking has been performed.'"
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper explicitly scopes to generative pre-trained transformers (not encoder-only like BERT), 2023-2024 time period, peer-reviewed studies in CORE A*/A/B conferences and Q1/Q2 journals. States it is 'a preliminary study' in the disclaimer."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The data extraction spreadsheets are available via Zenodo [13] (doi:10.5281/zenodo.17068810), allowing independent verification of classifications."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 4.2 describes the search strategy in detail: search string provided verbatim, Scopus database, execution date (September 11, 2024), secondary venue-based search (December 18, 2024), specific venues listed in Table 1."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; data source is a literature search of published papers."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Figure 2 documents each stage with counts: primary search (244 → 136 → 62), secondary search (940 → 887 → 120 → 12), yielding 74 total. Table 2 provides inclusion/exclusion criteria. Section 4.3 describes the data extraction form and classification scheme construction."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding acknowledgment section found in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: University of Padova, Lero/University of Limerick, IMSIU, University of Manchester, University College Dublin."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Funding is not disclosed, so independence cannot be assessed. One author is at Lero (Research Ireland Centre for Software) which may have funding, but this is not stated."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement found in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Survey paper that does not evaluate any pre-trained model's capability on a benchmark."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Survey paper that does not evaluate any pre-trained model on a benchmark."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey paper that does not evaluate any pre-trained model on a benchmark."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this survey."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this survey."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this survey."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this survey."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this survey."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this survey."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this survey."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Survey paper; no inference or computational method to report costs for."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Survey paper; no computational experiments."
    289       }
    290     },
    291     "survey_methodology": {
    292       "prisma_or_structured_protocol": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "The paper follows Kitchenham's SLR guidelines (Section 4), includes a structured search process with a flow diagram (Figure 2), defined search strings, inclusion/exclusion criteria (Table 2), and a two-stage search strategy (primary digital library + secondary venue-based search)."
    296       },
    297       "quality_assessment_of_sources": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The survey does not assess the methodological quality of its 74 primary studies. It categorizes them by task, phase, prompting strategy, etc., but treats all papers equally regardless of quality. Table 2 uses venue quality (CORE ranking, SJR quartile) as an inclusion filter, but this is a venue-level screening criterion, not a quality assessment of the studies themselves."
    301       },
    302       "publication_bias_discussed": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper does not discuss publication bias. It does not consider whether the 74 studies skew toward positive results about LLMs for RE, nor does it discuss the absence of negative-result papers in the corpus."
    306       }
    307     }
    308   },
    309   "claims": [
    310     {
    311       "claim": "74 primary studies on LLMs for RE were published between 2023-2024, with a 136% increase from 2023 to 2024.",
    312       "evidence": "Section 5.1: 22 studies in 2023, 52 in 2024, shown in Figure 3.",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "LLM4RE shifts focus from traditional NLP4RE tasks (defect detection, classification) to cognitively demanding tasks (elicitation, validation at 20% each).",
    317       "evidence": "Section 5.2.1, Table 5 shows distribution. Comparison with Zhao et al. (2021) discussed in text, noting defect detection and classification were dominant in NLP4RE but rank 8th and 5th in LLM4RE.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "GPT-family models dominate (77% of studies), with limited exploration of open-source alternatives.",
    322       "evidence": "Section 5.4.3, Table 14: GPT in 57/74 studies (77%), LLaMA in 9 (12%), Mixtral/Mistral in 3 (4%).",
    323       "supported": "strong"
    324     },
    325     {
    326       "claim": "Zero-shot (38%) and Few-shot (26%) prompting dominate, while RAG (7%) and Interactive (4%) remain underexplored.",
    327       "evidence": "Section 5.3.1, Table 10 shows the distribution of prompting strategies across studies.",
    328       "supported": "strong"
    329     },
    330     {
    331       "claim": "Laboratory experiments dominate evaluation (76%), with limited real-world validation (Field Studies 7%, Field Experiments 0%).",
    332       "evidence": "Section 5.6.2, Table 17 shows 56/74 studies use laboratory experiments, 5 use field studies, 0 use field experiments.",
    333       "supported": "strong"
    334     },
    335     {
    336       "claim": "Only 16% of studies made use of publicly available datasets, creating replicability problems.",
    337       "evidence": "Section 5.4.1: 'only 16% of the studies made use of public datasets' with 10 datasets identified in Table 13.",
    338       "supported": "strong"
    339     }
    340   ],
    341   "red_flags": [
    342     {
    343       "flag": "No quality assessment of source papers",
    344       "detail": "The survey categorizes and counts 74 papers but does not assess their methodological quality. All papers are treated equally regardless of rigor, potentially laundering weak results alongside strong ones."
    345     },
    346     {
    347       "flag": "Single-author data extraction without systematic cross-checking",
    348       "detail": "The paper explicitly acknowledges that data extraction was performed by the first author only, with the last author consulted 'in case of doubts.' The disclaimer states 'No systematic cross-checking has been performed.' This introduces potential bias in classification decisions."
    349     },
    350     {
    351       "flag": "Percentage discrepancies between abstract and body",
    352       "detail": "The abstract reports Zero-shot at 44% and Few-shot at 29%, while Section 5.3.1 reports 38% and 26%. This appears to be due to different denominators (per-study vs per-technique), but the paper does not explain the difference, which creates confusion."
    353     },
    354     {
    355       "flag": "Self-labeled as preliminary",
    356       "detail": "The paper includes a disclaimer: 'This paper is a preliminary study. No systematic cross-checking has been performed on the retrieved and analysed studies.' This is unusually candid but raises questions about whether the findings are reliable enough for citation."
    357     },
    358     {
    359       "flag": "No publication bias consideration",
    360       "detail": "The survey does not discuss whether its corpus of 74 papers is biased toward positive results about LLMs for RE. Given that negative results are underrepresented in CS venues generally, the optimistic framing of LLM4RE could reflect publication bias."
    361     }
    362   ],
    363   "cited_papers": [
    364     {
    365       "title": "Large language models for software engineering: Survey and open problems",
    366       "authors": ["A. Fan", "B. Gokkaya", "M. Harman", "M. Lyubarskiy", "S. Sengupta", "S. Yoo", "J. M. Zhang"],
    367       "year": 2023,
    368       "doi": "10.1109/ICSE-FoSE59343.2023.00008",
    369       "relevance": "Foundational survey on LLMs for SE, highlighting emergent properties and hybrid approaches."
    370     },
    371     {
    372       "title": "Large language models for software engineering: A systematic literature review",
    373       "authors": ["X. Hou", "Y. Zhao", "Y. Liu", "Z. Yang", "K. Wang", "L. Li", "X. Luo", "D. Lo", "J. Grundy", "H. Wang"],
    374       "year": 2024,
    375       "doi": "10.1145/3695988",
    376       "relevance": "SLR of 395 publications on LLMs in SE (2017-2024), covering data collection, optimization, and evaluation."
    377     },
    378     {
    379       "title": "LLMs and LLM-based agents in software engineering: A survey",
    380       "authors": ["M. Jin", "S. Kumar", "W. Zhang"],
    381       "year": 2024,
    382       "doi": "10.1016/j.jss.2023.111589",
    383       "relevance": "Survey of LLM-based agents with tools, memory, and decision-making for SE tasks."
    384     },
    385     {
    386       "title": "LLM-based multi-agent systems for software engineering: Literature review, vision, and the road ahead",
    387       "authors": ["J. He", "C. Treude", "D. Lo"],
    388       "year": 2025,
    389       "doi": "10.1145/3712003",
    390       "relevance": "Review of multi-agent LLM systems for SE with framework capabilities and limitations."
    391     },
    392     {
    393       "title": "Software testing with large language models: Survey, landscape, and vision",
    394       "authors": ["J. Wang", "Y. Huang", "C. Chen", "Z. Liu", "S. Wang", "Q. Wang"],
    395       "year": 2024,
    396       "doi": "10.1109/TSE.2024.3368208",
    397       "relevance": "Survey of 102 studies on LLMs for software testing including prompt engineering and scalability challenges."
    398     },
    399     {
    400       "title": "Natural language processing for requirements engineering: A systematic mapping study",
    401       "authors": ["L. Zhao", "W. Alhoshan", "A. Ferrari", "K. J. Letsholo", "M. A. Ajagbe", "E.-V. Chioasca", "R. T. Batista-Navarro"],
    402       "year": 2021,
    403       "doi": "10.1145/3444689",
    404       "relevance": "Predecessor survey of 404 NLP4RE studies (1983-2019), used as the primary comparison baseline."
    405     },
    406     {
    407       "title": "The prompt report: a systematic survey of prompt engineering techniques",
    408       "authors": ["S. Schulhoff"],
    409       "year": 2024,
    410       "arxiv_id": "2406.06608",
    411       "relevance": "Comprehensive survey of prompting techniques, cited as a map for exploration in RE prompt engineering."
    412     },
    413     {
    414       "title": "Evaluation guidelines for empirical studies in software engineering involving LLMs",
    415       "authors": ["S. Baltes"],
    416       "year": 2025,
    417       "arxiv_id": "2508.15503",
    418       "relevance": "Framework for empirical LLM studies in SE with focus on replicability and verifiability."
    419     },
    420     {
    421       "title": "ClarifyGPT: A framework for enhancing LLM-based code generation via requirements clarification",
    422       "authors": ["F. Mu", "L. Shi", "S. Wang", "Z. Yu", "B. Zhang", "C. Wang", "S. Liu", "Q. Wang"],
    423       "year": 2024,
    424       "relevance": "LLM-based tool for requirements clarification in code generation, bridging RE and SE."
    425     },
    426     {
    427       "title": "Software architecture meets LLMs: A systematic literature review",
    428       "authors": ["L. Schmid", "T. Hey", "M. Armbruster", "S. Corallo", "D. Fuchß", "J. Keim", "H. Liu", "A. Koziolek"],
    429       "year": 2025,
    430       "arxiv_id": "2505.16697",
    431       "relevance": "SLR on LLMs for software architecture tasks, complementary domain-specific survey."
    432     }
    433   ]
    434 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs