scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (20463B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Large Language Models for Requirements Engineering: A Systematic Literature Review",
      6     "authors": [
      7       "Mohammad Amin Zadenoori",
      8       "Jacek Dąbrowski",
      9       "Waad Alhoshan",
     10       "Liping Zhao",
     11       "Alessio Ferrari"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2509.11446",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The abstract and Section 1 state 'Zero-shot (44%) and Few-shot (29%)' but Table 10 shows 38% and 26% respectively; Section 5.4.2 reports 48% share code/prompts while Section 6.5 claims 61% — these internal inconsistencies undermine the abstract's accuracy.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper claims LLM4RE growth was 'catalyzed by the availability of accessible interfaces' (ChatGPT caused the surge) but this is a correlation-based observation from a literature count; no causal design supports it.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims are generally scoped to 'the reviewed 74 studies' and the 2023–2024 window; broad statements about the field are framed as observations from the corpus, not universal claims.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not consider that the dominance of elicitation/validation tasks may reflect venue bias (74% of papers appear at the RE conference and its workshops), nor does it address selection effects from using only Scopus.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper measures paper counts, task frequencies, and strategy distributions as direct descriptions of the literature corpus rather than claiming they proxy external outcomes.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 ('Threats to Validity') is a dedicated section addressing study selection validity, data validity, and research validity.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are named: single-database limitation (Scopus only), single-author data extraction without systematic cross-checking, and reliance on venue ranking as a quality proxy — all named explicitly.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Scope is bounded to 2023–2024 peer-reviewed studies in CORE A*/A/B conferences and Q1/Q2 journals found via Scopus plus manual venue search; the paper also includes an explicit preliminary-study disclaimer.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All five authors list institutional affiliations (Padova, Limerick, IMSIU, Manchester, UCD) on the title page.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funder is disclosed, making independence assessment inapplicable; however, first author Zadenoori's own work is cited as [117] and co-author Ferrari's work is cited as [66, 88], without any disclosure of these self-citations.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 2 formally defines Requirements Engineering (phases, tasks, artifacts) and LLMs (architecture, capabilities), and Figure 1 provides an explicit conceptual scheme.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly states it provides a systematic mapping of 74 LLM4RE studies across demographics, RE tasks, prompting strategies, resources, and evaluation methods, structured around five research questions.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 3 discusses multiple prior surveys (Zhao et al. NLP4RE, Hou et al. LLMs in SE, Jin et al., He et al.) and the paper consistently compares its findings against Zhao et al. [27] throughout the results.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "survey": {
    120       "search_and_selection": {
    121         "search_strategy_reproducible": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The exact Scopus search string is provided verbatim in Section 4.2, and the secondary-search venues are listed in Table 1 with execution dates given.",
    125           "source": "haiku"
    126         },
    127         "inclusion_exclusion_explicit": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Table 2 lists three inclusion criteria (I1–I3) and four exclusion criteria (E1–E4) with explicit definitions, including venue-quality thresholds via CORE and SJR rankings.",
    131           "source": "haiku"
    132         },
    133         "prisma_or_structured_protocol": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The paper follows Kitchenham's SLR guidelines [26] and provides a PRISMA-style flow diagram (Figure 2) with paper counts at each filtering stage.",
    137           "source": "haiku"
    138         },
    139         "search_terms_provided": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "The full Boolean search string is quoted in Section 4.2, combining LLM-representative terms with RE-domain terms using explicit AND/OR operators.",
    143           "source": "haiku"
    144         },
    145         "databases_listed": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Scopus is named as the primary database; Table 1 lists eight specific journals/conferences for secondary manual search.",
    149           "source": "haiku"
    150         },
    151         "screening_process_documented": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "Figure 2 documents the screening funnel with counts: 244 initial → 136 after title/abstract screening → 62 after inclusion/exclusion + 12 from secondary search = 74 final studies.",
    155           "source": "haiku"
    156         },
    157         "review_scope_justified": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The 2023–2024 scope is justified by the observation that no LLM4RE papers appeared before ChatGPT's release; Scopus is justified for its coverage of IEEE/ACM/Elsevier venues; venue selection for secondary search is explained.",
    161           "source": "haiku"
    162         }
    163       },
    164       "synthesis_quality": {
    165         "conflicting_findings_acknowledged": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The paper categorizes and counts findings but does not explicitly discuss cases where primary studies report conflicting results on effectiveness or applicability of LLMs to the same RE task.",
    169           "source": "haiku"
    170         },
    171         "quality_assessment_of_sources": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Venue ranking (CORE A*/A/B, Q1/Q2) is used as an inclusion filter, not a quality rubric applied to included studies; no risk-of-bias assessment or quality scoring of the 74 papers is performed.",
    175           "source": "haiku"
    176         },
    177         "publication_bias_discussed": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "The paper does not discuss publication bias or acknowledge that the reviewed studies likely overrepresent positive findings about LLM effectiveness in RE tasks.",
    181           "source": "haiku"
    182         },
    183         "quantitative_synthesis_present": {
    184           "applies": true,
    185           "answer": false,
    186           "justification": "The synthesis is limited to descriptive frequency counts and percentages; there is no meta-analysis, effect size aggregation, or statistical comparison of results across studies.",
    187           "source": "haiku"
    188         },
    189         "recommendations_supported_by_evidence": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Future-work recommendations in Section 6 are consistently tied to identified gaps — e.g., recommending more RAG-based approaches because only 7% of studies use them, or more Field Studies because 76% rely on lab experiments.",
    193           "source": "haiku"
    194         }
    195       }
    196     }
    197   },
    198   "claims": [
    199     {
    200       "claim": "Publications on LLM4RE grew 136% from 2023 to 2024, with 22 studies in 2023 and 52 in 2024.",
    201       "evidence": "Figure 3 and Section 5.1 report exact counts with year-over-year comparison.",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "Requirements Elicitation and Validation each account for 20% of studied tasks, dominating the LLM4RE landscape.",
    206       "evidence": "Table 5 documents 15 studies each (20%) for these tasks across the 74 primary studies.",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "Zero-shot prompting is used in 38% of studies and K-shot in 26%, dominating prompt engineering strategies.",
    211       "evidence": "Table 10 reports counts of 28 and 19 studies respectively; however, the abstract and introduction incorrectly state 44% and 29%, creating an internal inconsistency.",
    212       "supported": "moderate"
    213     },
    214     {
    215       "claim": "GPT-family models dominate LLM adoption, used in 77% of reviewed studies.",
    216       "evidence": "Table 14 lists 57/74 studies using GPT models.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Only 16% of studies use publicly available datasets, creating significant reproducibility barriers.",
    221       "evidence": "Section 5.4.1 identifies 10 public datasets used in 12 of 74 studies (16%).",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "Laboratory experiments dominate evaluation (76%), with zero Field Experiments or Sample Studies.",
    226       "evidence": "Table 17 shows 56/74 studies use laboratory experiments and 0 use field experiments or sample studies.",
    227       "supported": "strong"
    228     },
    229     {
    230       "claim": "RAG-based and interactive prompting remain severely underexplored at 7% and 4% of studies respectively.",
    231       "evidence": "Table 10 reports 5 studies using RAG-driven prompting and 3 using Interactive prompting out of 74.",
    232       "supported": "strong"
    233     },
    234     {
    235       "claim": "48% of studies provide source code or prompts, indicating growing open science awareness.",
    236       "evidence": "Section 5.4.2 reports 36/74 studies (48%), though Section 6.5 contradictorily states '45 studies (61%)' — a direct internal inconsistency.",
    237       "supported": "weak"
    238     }
    239   ],
    240   "methodology_tags": [
    241     "meta-analysis",
    242     "qualitative"
    243   ],
    244   "key_findings": "This SLR of 74 LLM4RE studies (2023–2024) finds the field exploded after ChatGPT's release, growing 136% in one year and shifting focus from traditional NLP4RE tasks (defect detection, classification) to cognitively demanding work like elicitation and validation (20% each). Zero-shot and few-shot prompting dominate (38%/26%) while advanced techniques like RAG (7%) remain underexplored. GPT-family models are used in 77% of the studies. Reproducibility is poor: only 16% of studies use publicly available datasets. Evaluation is narrow: 76% rely on laboratory experiments, with no field experiments or sample studies at all. The paper also self-identifies as a preliminary study with no systematic inter-rater reliability checks on data extraction.",
    245   "red_flags": [
    246     {
    247       "flag": "Abstract/intro statistics contradict results tables",
    248       "detail": "Abstract and Section 1 state Zero-shot=44%, Few-shot=29%, but Table 10 shows 38% and 26% respectively. Section 5.4.2 reports 48% code sharing but Section 6.5 says 61% — the same paper contradicts itself on basic counts."
    249     },
    250     {
    251       "flag": "Single-author data extraction, no inter-rater reliability",
    252       "detail": "The paper explicitly states data extraction was performed by the first author alone with only selective cross-checking by the last author — acknowledged in Section 4.3 and the preliminary study disclaimer."
    253     },
    254     {
    255       "flag": "Undisclosed self-citation",
    256       "detail": "First author Zadenoori's own paper [117] and co-author Ferrari's papers [66, 88] are included among the reviewed studies without disclosure of potential bias in classification or interpretation."
    257     },
    258     {
    259       "flag": "Single database primary search",
    260       "detail": "Primary search uses only Scopus; ACM Digital Library, IEEE Xplore, and Google Scholar are excluded from the primary search, with coverage gaps only partially addressed by the secondary venue-based search."
    261     },
    262     {
    263       "flag": "No quality assessment of included studies",
    264       "detail": "Venue ranking is used as an inclusion filter but no study-level quality rubric is applied; a workshop paper at a B-ranked conference is treated the same as a journal article with rigorous methodology."
    265     },
    266     {
    267       "flag": "No publication bias discussion",
    268       "detail": "The paper does not acknowledge that published studies disproportionately report positive results, which could inflate the apparent capability of LLMs for RE tasks."
    269     }
    270   ],
    271   "cited_papers": [
    272     {
    273       "title": "Natural Language Processing for Requirements Engineering: A Systematic Mapping Study",
    274       "relevance": "Primary baseline comparison — Zhao et al.'s NLP4RE survey covering 1983–2019 that this paper explicitly continues and compares against throughout."
    275     },
    276     {
    277       "title": "Large Language Models for Software Engineering: A Systematic Literature Review (Hou et al., 2024)",
    278       "relevance": "Directly comparable SLR of 395 LLM4SE publications; examines overlapping territory across the broader SE field."
    279     },
    280     {
    281       "title": "The ABC of Software Engineering Research (Stol & Fitzgerald, 2018)",
    282       "relevance": "Classification framework for empirical strategies used throughout the paper to categorize evaluation approaches in primary studies."
    283     },
    284     {
    285       "title": "Guidelines for Performing Systematic Literature Reviews in Software Engineering (Kitchenham, 2011)",
    286       "relevance": "Methodological protocol this SLR claims to follow; foundational reference for SLR methodology in SE."
    287     },
    288     {
    289       "title": "Evaluation Guidelines for Empirical Studies in Software Engineering Involving LLMs (Baltes et al., 2025)",
    290       "relevance": "Proposed evaluation framework for LLM studies in SE; directly relevant to methodological quality assessment of the primary studies reviewed."
    291     },
    292     {
    293       "title": "Replication in Requirements Engineering: The NLP for RE Case (Abualhaija et al., 2024)",
    294       "relevance": "Addresses reproducibility challenges in NLP4RE, directly informing the paper's analysis of dataset sharing and replicability in LLM4RE."
    295     },
    296     {
    297       "title": "LLM-Based Multi-Agent Systems for Software Engineering (He et al., 2025)",
    298       "relevance": "Related SLR covering multi-agent LLM systems for SE; relevant to agentic AI methodology assessment context."
    299     }
    300   ],
    301   "engagement_factors": {
    302     "practical_relevance": {
    303       "score": 3,
    304       "justification": "Provides a comprehensive taxonomy of tools, datasets, and prompting strategies that practitioners can immediately apply to requirements engineering workflows."
    305     },
    306     "surprise_contrarian": {
    307       "score": 1,
    308       "justification": "The finding that RAG and interactive prompting are severely underexplored (7%/4%) despite being key LLM strengths is mildly contrarian; the poor reproducibility rate is notable."
    309     },
    310     "fear_safety": {
    311       "score": 0,
    312       "justification": "Briefly mentions safety-critical domains (healthcare, autonomous systems) as motivation but does not raise AI risk concerns as a central theme."
    313     },
    314     "drama_conflict": {
    315       "score": 0,
    316       "justification": "No controversy or adversarial framing; purely a mapping study presenting descriptive findings."
    317     },
    318     "demo_ability": {
    319       "score": 0,
    320       "justification": "This is a literature survey; there is nothing to demo or try directly."
    321     },
    322     "brand_recognition": {
    323       "score": 0,
    324       "justification": "Authors are from European and Saudi universities without a famous lab affiliation; no industry partner involvement."
    325     }
    326   },
    327   "hn_data": {
    328     "threads": [
    329       {
    330         "hn_id": "44332699",
    331         "title": "AbsenceBench: Language models can't tell what's missing",
    332         "points": 324,
    333         "comments": 84,
    334         "url": "https://news.ycombinator.com/item?id=44332699",
    335         "created_at": "2025-06-20T22:26:52Z"
    336       },
    337       {
    338         "hn_id": "44116412",
    339         "title": "FlowTSE: Target Speaker Extraction with Flow Matching",
    340         "points": 25,
    341         "comments": 2,
    342         "url": "https://news.ycombinator.com/item?id=44116412",
    343         "created_at": "2025-05-28T14:30:33Z"
    344       },
    345       {
    346         "hn_id": "43451552",
    347         "title": "Blockchain with Proof of Quantum Work",
    348         "points": 5,
    349         "comments": 1,
    350         "url": "https://news.ycombinator.com/item?id=43451552",
    351         "created_at": "2025-03-23T08:24:58Z"
    352       },
    353       {
    354         "hn_id": "44445700",
    355         "title": "A nanosecond-duration radio pulse originating from the defunct Relay 2 satellite",
    356         "points": 2,
    357         "comments": 0,
    358         "url": "https://news.ycombinator.com/item?id=44445700",
    359         "created_at": "2025-07-02T16:31:26Z"
    360       },
    361       {
    362         "hn_id": "43424742",
    363         "title": "Blockchain with Proof of Quantum Work",
    364         "points": 2,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=43424742",
    367         "created_at": "2025-03-20T15:35:28Z"
    368       },
    369       {
    370         "hn_id": "41630923",
    371         "title": "Marca: Mamba Accelerator with ReConfigurable Architecture",
    372         "points": 2,
    373         "comments": 0,
    374         "url": "https://news.ycombinator.com/item?id=41630923",
    375         "created_at": "2024-09-23T21:43:44Z"
    376       },
    377       {
    378         "hn_id": "45250763",
    379         "title": "Advancing Deep Search Agents with Knowledge Graphs and Multi-Turn RL",
    380         "points": 1,
    381         "comments": 0,
    382         "url": "https://news.ycombinator.com/item?id=45250763",
    383         "created_at": "2025-09-15T15:22:49Z"
    384       },
    385       {
    386         "hn_id": "44980047",
    387         "title": "DuPO: Enabling Reliable LLM Self-Verification via Dual Preference Optimization",
    388         "points": 1,
    389         "comments": 0,
    390         "url": "https://news.ycombinator.com/item?id=44980047",
    391         "created_at": "2025-08-22T01:07:28Z"
    392       },
    393       {
    394         "hn_id": "44032309",
    395         "title": "How AI Generates Creativity from Inauthenticity",
    396         "points": 1,
    397         "comments": 0,
    398         "url": "https://news.ycombinator.com/item?id=44032309",
    399         "created_at": "2025-05-19T17:29:24Z"
    400       },
    401       {
    402         "hn_id": "41598858",
    403         "title": "An Imperative Language for Verified Exact Real-Number Computation",
    404         "points": 1,
    405         "comments": 0,
    406         "url": "https://news.ycombinator.com/item?id=41598858",
    407         "created_at": "2024-09-20T04:41:52Z"
    408       }
    409     ],
    410     "top_points": 324,
    411     "total_points": 364,
    412     "total_comments": 87
    413   }
    414 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs