scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (21278B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
      6     "authors": [
      7       "Xinyi Hou",
      8       "Yanjie Zhao",
      9       "Yue Liu",
     10       "Zhou Yang",
     11       "Kailong Wang",
     12       "Li Li",
     13       "Xiapu Luo",
     14       "David Lo",
     15       "John Grundy",
     16       "Haoyu Wang"
     17     ],
     18     "year": 2024,
     19     "venue": "ACM Transactions on Software Engineering and Methodology",
     20     "arxiv_id": "2308.10620",
     21     "doi": "XXXXXXX.XXXXXXX"
     22   },
     23   "checklist": {
     24     "claims_and_evidence": {
     25       "abstract_claims_supported": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The abstract claims about selecting and analyzing 395 papers (2017–2024), answering four RQs, and publicly available artifacts are all verified in the body of the paper with detailed methodology sections and appendices.",
     29         "source": "haiku"
     30       },
     31       "causal_claims_justified": {
     32         "applies": false,
     33         "answer": false,
     34         "justification": "This is a descriptive systematic review; no causal claims requiring experimental justification are made. Trend observations (e.g., decoder-only dominance in 2023) are presented as correlational observations, not causal findings.",
     35         "source": "haiku"
     36       },
     37       "generalization_bounded": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Findings are explicitly scoped to the 395 papers in their corpus from January 2017 to January 2024, with clear acknowledgment in the threats-to-validity section that keyword incompleteness may cause omissions.",
     41         "source": "haiku"
     42       },
     43       "alternative_explanations_discussed": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "When the authors attribute trends (e.g., 'This rapid transition can be linked to the generative capability of decoder-only LLMs'), they do not consider alternative explanations such as API availability, commercial incentives, or sampling bias from including 241 arXiv preprints.",
     47         "source": "haiku"
     48       },
     49       "proxy_outcome_distinction": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "The paper measures paper counts, venue distributions, and reported techniques; claims stay at this bibliometric level without conflating frequency of study with effectiveness or quality of the studied methods.",
     53         "source": "haiku"
     54       }
     55     },
     56     "limitations_and_scope": {
     57       "limitations_section_present": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Section 7 'Threats to Validity' is a dedicated section covering three distinct threats: paper search omission, study selection bias, and empirical knowledge bias.",
     61         "source": "haiku"
     62       },
     63       "threats_to_validity_specific": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Specific threats are named with concrete mitigations: keyword incompleteness addressed by QGS + snowballing, automated mislabeling addressed by retaining borderline papers for manual review, subjective bias addressed by inviting two external reviewers for secondary review.",
     67         "source": "haiku"
     68       },
     69       "scope_boundaries_stated": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Time frame (2017–2024) is explicitly justified by the Transformer inception date; the review explicitly excludes papers leveraging SE methods to enhance LLMs (as opposed to LLMs for SE) and lists exclusion criteria in Table 3.",
     73         "source": "haiku"
     74       }
     75     },
     76     "conflicts_of_interest": {
     77       "funding_disclosed": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No funding acknowledgment section is present in the provided paper text; the paper lists institutional affiliations but makes no disclosure of grants, industry funding, or other financial support.",
     81         "source": "haiku"
     82       },
     83       "affiliations_disclosed": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "All ten authors have their institutional affiliations explicitly listed on the title page (HUST, Monash, SMU, Beihang, PolyU).",
     87         "source": "haiku"
     88       },
     89       "funder_independent_of_outcome": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Funding is not disclosed, so independence of funder from outcome cannot be assessed.",
     93         "source": "haiku"
     94       },
     95       "financial_interests_declared": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No competing interests or financial interests statement appears anywhere in the paper text.",
     99         "source": "haiku"
    100       }
    101     },
    102     "scope_and_framing": {
    103       "key_terms_defined": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "LLMs are defined and distinguished from general PLMs by parameter scale; architectures (encoder-only, encoder-decoder, decoder-only) are taxonomized and described in Section 3.1; SE tasks are organized into six SDLC phases.",
    107         "source": "haiku"
    108       },
    109       "intended_contribution_clear": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The introduction states five specific contributions explicitly as bullet points, including being the first comprehensive SLR on 395 LLM4SE papers covering LLM taxonomy, datasets, optimization, evaluation, and 85 SE tasks.",
    113         "source": "haiku"
    114       },
    115       "engagement_with_prior_work": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Table 1 explicitly compares this work against eight prior surveys (Zhang et al., Zheng et al., Fan et al., Wang et al., etc.) on dimensions of model scope, SE scope, SLR status, time frame, and paper count, clearly situating the contribution.",
    119         "source": "haiku"
    120       }
    121     }
    122   },
    123   "type_checklist": {
    124     "survey": {
    125       "search_and_selection": {
    126         "search_strategy_reproducible": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "The Quasi-Gold Standard (QGS) approach is fully described with the complete SE and LLM keyword lists explicitly provided, enabling reproduction of the automated search.",
    130           "source": "haiku"
    131         },
    132         "inclusion_exclusion_explicit": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "Table 3 lists three inclusion criteria and nine exclusion criteria with specific, concrete conditions (e.g., 'short papers with fewer than 8 pages', 'non-English literature').",
    136           "source": "haiku"
    137         },
    138         "prisma_or_structured_protocol": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "The paper explicitly follows Kitchenham et al.'s SLR methodology (cited as [197, 198]), a well-established SE systematic review protocol, with planning, conducting, and reporting phases.",
    142           "source": "haiku"
    143         },
    144         "search_terms_provided": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Both keyword sets are fully listed in Section 2.2.1: SE task keywords (30+ terms) and LLM keywords (30+ terms) including wildcards, with explanation of why broad terms like 'ML' and 'DL' were included.",
    148           "source": "haiku"
    149         },
    150         "databases_listed": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "Seven databases are explicitly named with paper counts: IEEE Xplore (1,192), ACM Digital Library (10,445), ScienceDirect (62,290), Web of Science (42,166), Springer (85,671), arXiv (9,966), DBLP (4,035).",
    154           "source": "haiku"
    155         },
    156         "screening_process_documented": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Figure 1 shows a detailed flowchart with counts at every stage: 218,765 → 80,611 → 5,078 → 1,172 → 810 → 594 → 382 after quality assessment, then snowballing adds 13 for a final 395.",
    160           "source": "haiku"
    161         },
    162         "review_scope_justified": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "The 2017 start date is justified by the publication of the Transformer architecture paper; arXiv inclusion is justified by the field's rapid evolution; top-venue selection is justified by targeting authoritative and comprehensive SE coverage.",
    166           "source": "haiku"
    167         }
    168       },
    169       "synthesis_quality": {
    170         "conflicting_findings_acknowledged": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "The paper synthesizes findings descriptively by counting and categorizing, but does not systematically identify or discuss conflicting results across reviewed papers (e.g., papers reporting different conclusions about the same LLM on similar tasks).",
    174           "source": "haiku"
    175         },
    176         "quality_assessment_of_sources": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "Table 4 defines ten Quality Assessment Criteria (QAC1–QAC10) with a four-level scoring rubric (0/1/2/3), and papers below a threshold score (80% of maximum) are excluded from the final corpus.",
    180           "source": "haiku"
    181         },
    182         "publication_bias_discussed": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "Publication bias—the tendency for positive results to be over-represented in published literature—is not discussed anywhere in the threats-to-validity section or elsewhere, despite being a standard concern in systematic reviews of empirical work.",
    186           "source": "haiku"
    187         },
    188         "quantitative_synthesis_present": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "The paper provides substantial quantitative synthesis through vote counting: frequency tables for LLMs per SE task, paper counts per venue and year, input form distributions, evaluation metric frequencies, and trend analysis across architectures over time.",
    192           "source": "haiku"
    193         },
    194         "recommendations_supported_by_evidence": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Section 8's challenges and opportunities are each supported by specific citations to reviewed papers; for example, the recommendation to address industrial dataset scarcity is backed by the finding that only 6 of 395 studies used industrial datasets.",
    198           "source": "haiku"
    199         }
    200       }
    201     }
    202   },
    203   "claims": [
    204     {
    205       "claim": "Decoder-only LLM architectures dominated LLM4SE research in 2023, comprising 70.7% of the model-usage instances reported that year.",
    206       "evidence": "Figure 5 shows 432 decoder-only instances across 195 unique papers in 2023 vs. 85 encoder-decoder and 94 encoder-only instances.",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "Code generation is the most studied SE task, appearing in 118 of 395 papers.",
    211       "evidence": "Table 10 lists code generation under software development with 118 papers, far exceeding the next task (program repair at 35).",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "61% of selected papers (241 of 395) are arXiv preprints rather than peer-reviewed venue publications.",
    216       "evidence": "Figure 2(a) shows 241 arXiv papers vs. 154 peer-reviewed venue papers; the text explicitly notes this.",
    217       "supported": "strong"
    218     },
    219     {
    220       "claim": "Only 6 of the 395 studies used industrial datasets, indicating a gap between academic research and real-world deployment contexts.",
    221       "evidence": "Figure 6 shows industrial datasets used in only 6 papers; the text notes this 'suggests a potential misalignment between academic and industrial research.'",
    222       "supported": "strong"
    223     },
    224     {
    225       "claim": "Token-based input forms account for approximately 97.75% of LLM4SE studies that specify input format.",
    226       "evidence": "Table 8 shows 347 token-based papers out of 355 that specify input form (150 text tokens + 118 code tokens + 78 combined + rounding).",
    227       "supported": "strong"
    228     },
    229     {
    230       "claim": "LLM4SE research grew explosively in 2023, with 273 relevant papers published in that year alone.",
    231       "evidence": "Figure 2(b) shows paper counts: 7 (2020), 13 (2021), 56 (2022), 273 (2023), 46 (Jan 2024 only).",
    232       "supported": "strong"
    233     },
    234     {
    235       "claim": "Software development tasks (primarily code generation) account for 56.65% of LLM4SE applications, while software management tasks account for only 0.69%.",
    236       "evidence": "Figure 10(a) shows these percentage breakdowns across six SDLC phases.",
    237       "supported": "strong"
    238     }
    239   ],
    240   "methodology_tags": [
    241     "meta-analysis",
    242     "qualitative"
    243   ],
    244   "key_findings": "This SLR of 395 LLM4SE papers (2017–2024) finds that decoder-only architectures (especially GPT and ChatGPT variants) have come to dominate SE research since 2022, with code generation being the single most studied task (118 papers). The field is characterized by heavy reliance on open-source datasets (62.83% of studies), near-total dependence on token-based inputs (97.75%), and strikingly little use of industrial datasets (only 6 studies), suggesting a significant gap between academic benchmarks and real-world software practice. Research volume exploded in 2023 with 273 papers, driven largely by arXiv preprints (61% of the corpus) that bypass peer review—a structural feature of the corpus not critically examined by the authors.",
    245   "red_flags": [
    246     {
    247       "flag": "Majority arXiv preprints",
    248       "detail": "61% of included papers (241/395) are unreviewed arXiv preprints. While the authors apply a quality rubric, including this many preprints introduces unknown variation in reliability and likely over-represents the optimistic performance claims common in preprints."
    249     },
    250     {
    251       "flag": "Publication bias unaddressed",
    252       "detail": "The survey never discusses publication bias—the well-known tendency for positive LLM benchmark results to be published while failures or negative comparisons are not. Given the fast-moving commercial stakes of LLM4SE, this omission is significant."
    253     },
    254     {
    255       "flag": "No conflict synthesis",
    256       "detail": "Conflicting results across reviewed papers are not identified or reconciled. The survey reports what papers claim about LLM capabilities without noting when studies reach opposite conclusions about the same model or task."
    257     },
    258     {
    259       "flag": "Funding not disclosed",
    260       "detail": "No funding acknowledgment appears in the paper, making it impossible to assess potential industry influence on framing or paper inclusion decisions."
    261     },
    262     {
    263       "flag": "Quality rubric assesses reporting, not methodology",
    264       "detail": "The 10-criterion QAC in Table 4 assesses whether papers describe their setups clearly, not whether evaluations are methodologically sound (e.g., appropriate baselines, proper test sets, contamination controls). Low-quality evaluations that are well-described pass the filter."
    265     }
    266   ],
    267   "cited_papers": [
    268     {
    269       "title": "Guidelines for performing systematic literature reviews in software engineering",
    270       "relevance": "The methodological foundation for the entire review; the authors explicitly follow Kitchenham et al.'s three-phase SLR process."
    271     },
    272     {
    273       "title": "Machine/deep learning for software engineering: A systematic literature review",
    274       "relevance": "Primary precursor survey (Wang et al. 2022) covering ML/DL for SE with 1,209 papers; this paper positions itself as the LLM-specific successor."
    275     },
    276     {
    277       "title": "A survey of deep learning for software engineering",
    278       "relevance": "Yang et al. 2022 DL4SE survey; authors consulted it when formulating RQs and use its task taxonomy."
    279     },
    280     {
    281       "title": "Software Testing with Large Language Model: Survey, Landscape, and Vision",
    282       "relevance": "Directly related narrow SLR (Wang et al. 2023) on LLMs for testing with 52 papers; compared in Table 1."
    283     },
    284     {
    285       "title": "A Survey of Learning-based Automated Program Repair",
    286       "relevance": "Related narrow SLR (Zhang et al. 2023) on APR; used for comparison and as source of QGS papers."
    287     },
    288     {
    289       "title": "Evaluating large language models trained on code",
    290       "relevance": "Chen et al. 2021 (Codex/HumanEval); one of the most-cited primary studies (referenced 62 times in the corpus) and the source of the HumanEval benchmark."
    291     },
    292     {
    293       "title": "A survey of large language models",
    294       "relevance": "Zhao et al. 2023 background survey on LLMs generally; used to frame the LLM definition and scope."
    295     },
    296     {
    297       "title": "Large language models for software engineering: Survey and open problems",
    298       "relevance": "Fan et al. 2023 concurrent survey compared in Table 1; does not follow systematic review protocol, which this paper addresses."
    299     }
    300   ],
    301   "engagement_factors": {
    302     "practical_relevance": {
    303       "score": 3,
    304       "justification": "Directly helps practitioners and researchers navigate which LLMs and techniques to use for which SE tasks, with 85 tasks catalogued and trend data."
    305     },
    306     "surprise_contrarian": {
    307       "score": 1,
    308       "justification": "Confirms expected trends (GPT dominance, code generation focus) rather than challenging conventional wisdom; the industrial dataset scarcity finding is the most noteworthy surprise."
    309     },
    310     "fear_safety": {
    311       "score": 1,
    312       "justification": "Touches on security concerns (vulnerability detection, jailbreaks, PII leakage) but only in passing as future research directions, not as primary findings."
    313     },
    314     "drama_conflict": {
    315       "score": 0,
    316       "justification": "A standard academic survey with no controversy, competing claims, or dramatic findings; even the criticism of prior surveys is mild."
    317     },
    318     "demo_ability": {
    319       "score": 1,
    320       "justification": "A public GitHub repository with replication artifacts is provided, though the survey itself is not a demonstrable tool."
    321     },
    322     "brand_recognition": {
    323       "score": 2,
    324       "justification": "Extensively covers GPT-4, ChatGPT, Codex, GitHub Copilot, and other high-profile models; authors are from well-known SE research groups (HUST, SMU, Monash)."
    325     }
    326   },
    327   "hn_data": {
    328     "threads": [
    329       {
    330         "hn_id": "36012320",
    331         "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    332         "points": 34,
    333         "comments": 2,
    334         "url": "https://news.ycombinator.com/item?id=36012320",
    335         "created_at": "2023-05-20T14:43:30Z"
    336       },
    337       {
    338         "hn_id": "41391420",
    339         "title": "Kan 2.0: Kolmogorov-Arnold Networks Meet Science",
    340         "points": 8,
    341         "comments": 0,
    342         "url": "https://news.ycombinator.com/item?id=41391420",
    343         "created_at": "2024-08-29T14:54:10Z"
    344       },
    345       {
    346         "hn_id": "36002796",
    347         "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    348         "points": 6,
    349         "comments": 0,
    350         "url": "https://news.ycombinator.com/item?id=36002796",
    351         "created_at": "2023-05-19T15:02:59Z"
    352       },
    353       {
    354         "hn_id": "36585056",
    355         "title": "Natural Selection Favors AIs over Humans",
    356         "points": 4,
    357         "comments": 4,
    358         "url": "https://news.ycombinator.com/item?id=36585056",
    359         "created_at": "2023-07-04T11:08:02Z"
    360       },
    361       {
    362         "hn_id": "32632312",
    363         "title": "Exploring the Role of the Cybercrime Underground in the Russia-Ukraine Conflict",
    364         "points": 4,
    365         "comments": 0,
    366         "url": "https://news.ycombinator.com/item?id=32632312",
    367         "created_at": "2022-08-28T21:36:55Z"
    368       },
    369       {
    370         "hn_id": "37866902",
    371         "title": "Getting Bored of Cyberwar",
    372         "points": 3,
    373         "comments": 1,
    374         "url": "https://news.ycombinator.com/item?id=37866902",
    375         "created_at": "2023-10-13T05:03:06Z"
    376       },
    377       {
    378         "hn_id": "36008023",
    379         "title": "Tree-of-Thought (ToT), complex and general problem solving with LLMs",
    380         "points": 3,
    381         "comments": 1,
    382         "url": "https://news.ycombinator.com/item?id=36008023",
    383         "created_at": "2023-05-19T23:38:37Z"
    384       },
    385       {
    386         "hn_id": "37990040",
    387         "title": "Exploring the Space of Key-Value-Query Models with Intention",
    388         "points": 3,
    389         "comments": 0,
    390         "url": "https://news.ycombinator.com/item?id=37990040",
    391         "created_at": "2023-10-23T18:58:38Z"
    392       },
    393       {
    394         "hn_id": "36381655",
    395         "title": "Natural Selection Favors AIs over Humans",
    396         "points": 3,
    397         "comments": 0,
    398         "url": "https://news.ycombinator.com/item?id=36381655",
    399         "created_at": "2023-06-18T16:23:55Z"
    400       },
    401       {
    402         "hn_id": "36149566",
    403         "title": "Tree of Thoughts: Official vs. Wrong Implementation",
    404         "points": 2,
    405         "comments": 0,
    406         "url": "https://news.ycombinator.com/item?id=36149566",
    407         "created_at": "2023-06-01T11:00:31Z"
    408       }
    409     ],
    410     "top_points": 34,
    411     "total_points": 70,
    412     "total_comments": 8
    413   }
    414 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs