scan.json (21609B)
1 { 2 "paper": { 3 "title": "Large Language Models for Software Engineering: A Systematic Literature Review", 4 "authors": [ 5 "Xinyi Hou", 6 "Yanjie Zhao", 7 "Yue Liu", 8 "Zhou Yang", 9 "Kailong Wang", 10 "Li Li", 11 "Xiapu Luo", 12 "David Lo", 13 "John Grundy", 14 "Haoyu Wang" 15 ], 16 "year": 2024, 17 "venue": "ACM Transactions on Software Engineering and Methodology", 18 "arxiv_id": "2308.10620" 19 }, 20 "scan_version": 2, 21 "active_modules": ["survey_methodology"], 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper provides a GitHub repository: https://github.com/xinyi-hou/LLM4SE_SLR, mentioned in the abstract and Section 3.1." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The replication package at the GitHub repository includes the list of LLMs and their parameter sizes, and the paper list. Referenced in Sections 3.1 and 7." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No environment or dependency specifications are provided. This is a survey, but the analysis scripts (if any) have no documented environment." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions are provided for replicating the search, filtering, or analysis procedures beyond the methodology description in Section 2." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": false, 48 "answer": false, 49 "justification": "This is a systematic literature review that does not run experiments or report quantitative results requiring confidence intervals." 50 }, 51 "significance_tests": { 52 "applies": false, 53 "answer": false, 54 "justification": "No statistical comparisons are made; the paper reports descriptive counts and distributions of papers across categories." 
55 }, 56 "effect_sizes_reported": { 57 "applies": false, 58 "answer": false, 59 "justification": "No experimental effects are measured; this is a survey paper reporting distributions and trends." 60 }, 61 "sample_size_justified": { 62 "applies": false, 63 "answer": false, 64 "justification": "No experimental sample sizes; the paper aims for comprehensive coverage of relevant literature via systematic search." 65 }, 66 "variance_reported": { 67 "applies": false, 68 "answer": false, 69 "justification": "No experiments are run; descriptive statistics of paper counts do not require variance reporting." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Table 1 compares this SLR against 8 prior surveys, positioning it relative to existing reviews by scope, time frame, number of papers, and whether they follow an SLR process." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "The compared surveys in Table 1 are from 2022-2023, which are contemporary to this 2024 work." 82 }, 83 "ablation_study": { 84 "applies": false, 85 "answer": false, 86 "justification": "This is a survey paper with no system components to ablate." 87 }, 88 "multiple_metrics": { 89 "applies": false, 90 "answer": false, 91 "justification": "No system evaluation is performed; this is a literature review." 92 }, 93 "human_evaluation": { 94 "applies": false, 95 "answer": false, 96 "justification": "No system outputs to evaluate; this is a survey." 97 }, 98 "held_out_test_set": { 99 "applies": false, 100 "answer": false, 101 "justification": "No experiments requiring train/test splits; this is a survey." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper provides extensive breakdowns: by SE activity (Fig. 10), by LLM architecture (Fig. 5), by data type (Table 7), by evaluation metric (Table 9), and by SE task (Table 10)." 
107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 8 discusses challenges and limitations of LLM4SE including ambiguity in code generation, generalizability issues, evaluation challenges, and interpretability/trustworthiness concerns." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper reports gaps: only 6 studies used industrial datasets (Section 4.1), software management has minimal coverage (0.69%, Section 6.1), and discusses limitations like models failing after semantic-preserving transformations (Section 8.1.2)." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims analysis of 395 papers across four RQs, which is substantiated throughout Sections 3-6 with detailed breakdowns and tables." 124 }, 125 "causal_claims_justified": { 126 "applies": false, 127 "answer": false, 128 "justification": "The paper makes no causal claims; it describes the landscape and trends of LLM4SE research without claiming causal relationships." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper states its scope as papers from January 2017 to January 2024, specifies the search databases used, and discusses limitations of the search process in Section 7 (paper search omission, study selection bias)." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": false, 137 "answer": false, 138 "justification": "This is a survey/taxonomy paper presenting no empirical results that would require alternative explanations." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": false, 142 "answer": false, 143 "justification": "This is a survey paper with no measurements; it reports counts and categorizations of existing literature." 
144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": false, 149 "answer": false, 150 "justification": "No models are used in the methodology; this is a literature survey." 151 }, 152 "prompts_provided": { 153 "applies": false, 154 "answer": false, 155 "justification": "No prompting is used; this is a literature survey." 156 }, 157 "hyperparameters_reported": { 158 "applies": false, 159 "answer": false, 160 "justification": "No model training or inference is performed; this is a literature survey." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used; this is a literature survey." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 2 and Figure 1 detail the full paper selection pipeline with counts at each stage: 218,765 initial papers → 80,611 after page filter → 5,078 after keyword screening → 1,172 after venue filtering → 810 after dedup → 382 after quality assessment → 395 final after snowballing added 13 papers. Inclusion/exclusion criteria are stated in Table 3, and quality assessment criteria in Table 4." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 7 'Threats to Validity' provides a dedicated discussion of three types of threats: paper search omission, study selection bias, and empirical knowledge bias." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 7 discusses specific threats: incomplete keyword summarization may miss papers, BibTeX record ambiguity causing mislabeling, subjective judgments in quality assessment. They describe specific mitigation steps like having two experienced reviewers conduct secondary review." 
183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "The paper clearly bounds scope to LLMs applied to SE tasks (not SE for LLMs), papers from 2017-2024, papers with 8+ pages, and excludes workshops, theses, and tool demos (Table 3). The scope exclusion of SE4LLM is explicitly noted in Section 8.3." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "The replication package at https://github.com/xinyi-hou/LLM4SE_SLR contains the paper list and extracted data, referenced in Sections 3.1 and 7." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 2.2 describes the search strategy in detail: manual search across 6 venues, automated search across 7 databases with specific search strings, and snowballing. The complete keyword sets are provided." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants; this is a literature survey. Data sources are standard academic databases described in Section 2.2." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "Figure 1 documents the full pipeline with counts at each stage. Section 2.3 describes inclusion/exclusion criteria, quality assessment scoring, and the snowballing process. Specific thresholds are stated (e.g., QAC score ≥ 16.8/21 for published papers)." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding acknowledgment section is visible in the paper text provided." 
217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "All author affiliations are clearly listed: Huazhong University of Science and Technology, Monash University, Singapore Management University, Beihang University, Hong Kong Polytechnic University." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "No funding information is disclosed, so independence cannot be assessed." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests statement is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": false, 237 "answer": false, 238 "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": false, 242 "answer": false, 243 "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": false, 247 "answer": false, 248 "justification": "This is a survey paper that does not evaluate any pre-trained model on benchmarks." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this literature survey." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this literature survey." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this literature survey." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this literature survey." 
271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this literature survey." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this literature survey." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this literature survey." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": false, 291 "answer": false, 292 "justification": "This is a survey paper with no method that incurs inference costs." 293 }, 294 "compute_budget_stated": { 295 "applies": false, 296 "answer": false, 297 "justification": "This is a survey paper with no computational experiments." 298 } 299 }, 300 "survey_methodology": { 301 "prisma_or_structured_protocol": { 302 "applies": true, 303 "answer": true, 304 "justification": "The paper follows Kitchenham et al.'s SLR methodology (cited in Section 2). It uses the Quasi-Gold Standard (QGS) approach with a detailed flow diagram (Figure 1), systematic search strings, and structured stages for identification, selection, and quality assessment." 305 }, 306 "quality_assessment_of_sources": { 307 "applies": true, 308 "answer": true, 309 "justification": "Table 4 presents 10 Quality Assessment Criteria (QAC1-QAC10); the graded criteria are scored on 0-3 scales, for a maximum total of 21 for published papers. Papers needed ≥80% of the maximum score to be included. Section 2.3.2 describes the full quality assessment process." 310 }, 311 "publication_bias_discussed": { 312 "applies": true, 313 "answer": false, 314 "justification": "The paper does not discuss publication bias. It acknowledges that 241/395 papers are from arXiv (not peer-reviewed) but does not discuss whether the included literature skews toward positive results or whether negative-result papers are underrepresented." 
315 } 316 } 317 }, 318 "claims": [ 319 { 320 "claim": "395 relevant papers on LLM4SE were identified from January 2017 to January 2024, with 273 published in 2023 alone, showing rapidly growing research interest.", 321 "evidence": "Section 2.5 and Figure 2(b) show the temporal distribution: 7 (2020), 13 (2021), 56 (2022), 273 (2023), 46 (January 2024).", 322 "supported": "strong" 323 }, 324 { 325 "claim": "Decoder-only LLMs dominate SE tasks, with 432 instances across 195 papers in 2023, constituting 70.7% of research.", 326 "evidence": "Section 3.2 and Figure 5 show the trend analysis with specific counts per architecture type per year.", 327 "supported": "strong" 328 }, 329 { 330 "claim": "Code generation is the most prevalent SE task addressed by LLMs, with 118 studies, followed by program repair with 35 studies.", 331 "evidence": "Table 10 provides the complete distribution of SE tasks, and Section 6.4 discusses code generation in detail.", 332 "supported": "strong" 333 }, 334 { 335 "claim": "Only 6 out of 395 studies used industrial datasets, suggesting misalignment between academic research and industrial contexts.", 336 "evidence": "Section 4.1 and Figure 6 report the dataset source distribution: 235 open-source, 49 collected, 84 constructed, 6 industrial.", 337 "supported": "strong" 338 }, 339 { 340 "claim": "Software development accounts for 56.65% of LLM4SE research, while software management represents only 0.69%.", 341 "evidence": "Section 6.1 and Figure 10(a) show the distribution across six SE activities.", 342 "supported": "strong" 343 } 344 ], 345 "methodology_tags": ["meta-analysis"], 346 "key_findings": "This SLR of 395 papers (2017-2024) maps the LLM4SE landscape across four dimensions: models, data, optimization techniques, and SE tasks. Decoder-only architectures dominate (70.7% in 2023), with code generation (118 papers) and program repair (35 papers) as the most-studied tasks. 
The review identifies significant gaps: only 6 studies use industrial datasets, software management and design are barely explored, and the field lacks comprehensive evaluation frameworks. The paper catalogues 85 distinct SE tasks, 70+ LLMs, 8 prompt engineering techniques, and 19+ evaluation metrics used across the surveyed literature.", 347 "red_flags": [ 348 { 349 "flag": "No quality assessment of surveyed papers' methodology", 350 "detail": "While the paper assesses source quality for inclusion (QAC criteria in Table 4), the assessment focuses on paper completeness and clarity rather than methodological rigor. The survey does not evaluate whether the 395 included papers use appropriate experimental designs, report statistics correctly, or address contamination. This risks laundering weak results by treating all included papers as equally valid evidence." 351 }, 352 { 353 "flag": "61% of included papers are non-peer-reviewed arXiv preprints", 354 "detail": "241 of 395 papers are arXiv preprints. While the authors acknowledge this and apply quality assessment, the high proportion means the survey's conclusions rest heavily on unvetted work. The quality assessment criteria (QAC) are relatively permissive — scoring ≥80% on clarity/motivation/contribution criteria does not ensure methodological soundness." 355 }, 356 { 357 "flag": "Publication bias not discussed", 358 "detail": "The survey does not consider whether the LLM4SE literature it reviews is biased toward positive results. Papers showing LLMs fail at SE tasks may be underrepresented, skewing the survey's characterization of LLM effectiveness." 359 } 360 ], 361 "cited_papers": [ 362 { 363 "title": "Evaluating large language models trained on code", 364 "authors": ["Mark Chen"], 365 "year": 2021, 366 "arxiv_id": "2107.03374", 367 "relevance": "Introduces Codex and HumanEval benchmark, foundational for LLM code generation evaluation." 
368 }, 369 { 370 "title": "A systematic evaluation of large language models of code", 371 "authors": ["Frank F Xu", "Uri Alon", "Graham Neubig", "Vincent Josua Hellendoorn"], 372 "year": 2022, 373 "relevance": "Systematic evaluation of multiple LLMs (Codex, GPT-J, GPT-Neo, PolyCoder) on SE tasks including code completion." 374 }, 375 { 376 "title": "Is your code generated by chatgpt really correct? Rigorous evaluation of large language models for code generation", 377 "authors": ["Jiawei Liu", "Chunqiu Steven Xia"], 378 "year": 2023, 379 "arxiv_id": "2305.01210", 380 "relevance": "Introduces EvalPlus for more rigorous evaluation of LLM-generated code, addressing test adequacy." 381 }, 382 { 383 "title": "SWE-bench: Can language models resolve real-world github issues?", 384 "authors": ["Carlos E Jimenez"], 385 "year": 2023, 386 "arxiv_id": "2310.06770", 387 "relevance": "Major benchmark for evaluating LLM capability on real-world software engineering tasks." 388 }, 389 { 390 "title": "Keep the Conversation Going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT", 391 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 392 "year": 2023, 393 "arxiv_id": "2304.00385", 394 "relevance": "Key study on conversational automated program repair with ChatGPT, demonstrating cost-effectiveness." 395 }, 396 { 397 "title": "Large language models for software engineering: Survey and open problems", 398 "authors": ["Angela Fan", "Beliz Gokkaya", "Mark Harman"], 399 "year": 2023, 400 "arxiv_id": "2310.03533", 401 "relevance": "Competing survey on LLMs for SE, useful for comparison of survey scope and findings." 402 }, 403 { 404 "title": "Software Testing with Large Language Model: Survey, Landscape, and Vision", 405 "authors": ["Junjie Wang"], 406 "year": 2023, 407 "arxiv_id": "2307.07221", 408 "relevance": "Focused survey on LLMs for software testing, complementary to this broader SLR." 
409 }, 410 { 411 "title": "A survey of large language models for code: Evolution, benchmarking, and future trends", 412 "authors": ["Zibin Zheng"], 413 "year": 2023, 414 "arxiv_id": "2311.10372", 415 "relevance": "Survey specifically on code LLMs with benchmarking focus, overlapping scope." 416 }, 417 { 418 "title": "Competition-level code generation with AlphaCode", 419 "authors": ["Yujia Li"], 420 "year": 2022, 421 "relevance": "Demonstrates LLM capability at competition-level programming, significant for code generation evaluation." 422 }, 423 { 424 "title": "Automated Program Repair in the Era of Large Pre-Trained Language Models", 425 "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"], 426 "year": 2023, 427 "relevance": "Comprehensive study of LLMs for automated program repair across multiple benchmarks." 428 }, 429 { 430 "title": "ChatDev: Communicative Agents for Software Development", 431 "authors": ["Chen Qian"], 432 "year": 2023, 433 "arxiv_id": "2307.07924", 434 "relevance": "Multi-agent LLM system for software development, relevant to agentic AI workflows." 435 }, 436 { 437 "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework", 438 "authors": ["Sirui Hong"], 439 "year": 2023, 440 "arxiv_id": "2308.00352", 441 "relevance": "Multi-agent framework for collaborative software development using LLMs." 442 } 443 ] 444 }