scan.json (23884B)
1 { 2 "paper": { 3 "title": "A Systematic Literature Review of Code Hallucinations in LLMs: Characterization, Mitigation Methods, Challenges, and Future Directions for Reliable AI", 4 "authors": [ 5 "Cuiyun Gao", 6 "Guodong Fan", 7 "Chun Yong Chong", 8 "Shizhan Chen", 9 "Chao Liu", 10 "David Lo", 11 "Zibin Zheng", 12 "Qing Liao" 13 ], 14 "year": 2025, 15 "venue": "ACM Transactions on Software Engineering and Methodology", 16 "arxiv_id": "2511.00776" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No code repository, analysis scripts, or data extraction tools are mentioned or released. A survey can release its analysis scripts and extracted data tables but this one does not." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "The extracted study data (Table 2) and search results are described but not released as a downloadable dataset. No supplementary data files or repositories are provided." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No environment or tooling specifications are provided. While this is a survey paper, any analysis scripts or tools used for data extraction and classification could have been documented." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step instructions for reproducing the search, screening, or analysis process are provided beyond the methodology description in Section 3. The search strings and databases are stated but there is no replication package." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": false, 44 "answer": false, 45 "justification": "This is a systematic literature review that does not run experiments or perform statistical aggregation. No confidence intervals are applicable." 
46 }, 47 "significance_tests": { 48 "applies": false, 49 "answer": false, 50 "justification": "No comparative statistical claims are made that would require significance testing. The paper is a narrative synthesis, not a meta-analysis." 51 }, 52 "effect_sizes_reported": { 53 "applies": false, 54 "answer": false, 55 "justification": "No experimental results are reported. The paper is a qualitative systematic review, not a meta-analysis with statistical aggregation." 56 }, 57 "sample_size_justified": { 58 "applies": false, 59 "answer": false, 60 "justification": "No experiments with sample sizes are conducted. The number of reviewed papers (60) is a result of the search and screening process, not a statistical sample." 61 }, 62 "variance_reported": { 63 "applies": false, 64 "answer": false, 65 "justification": "No experimental runs or quantitative aggregation is performed. This is a qualitative survey." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The paper compares itself against the prior survey by Lee et al. [67] in Section 1, listing four specific advantages. It also positions itself relative to NLP hallucination surveys." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "The comparison is against Lee et al. [67] from 2025, which is the most recent and closely related survey on hallucinations in code generation LLMs." 78 }, 79 "ablation_study": { 80 "applies": false, 81 "answer": false, 82 "justification": "This is a survey paper with no system or method to ablate." 83 }, 84 "multiple_metrics": { 85 "applies": false, 86 "answer": false, 87 "justification": "This is a survey paper that does not evaluate a system. It reviews metrics used by others (Section 7.4) but does not apply them." 88 }, 89 "human_evaluation": { 90 "applies": false, 91 "answer": false, 92 "justification": "This is a survey paper. 
No system outputs are produced that would require human evaluation." 93 }, 94 "held_out_test_set": { 95 "applies": false, 96 "answer": false, 97 "justification": "No experiments are run; there is no test set to hold out." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper provides per-category breakdowns: publication venue types (Fig. 6a), contribution types (Fig. 6b), publication trends per year (Fig. 7), hallucination cause categories (Fig. 8), and mitigation method categories (Fig. 9)." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 8 (Challenges and Opportunities) discusses limitations and failure modes of current approaches, including the difficulty of distinguishing hallucination from mistake (8.1.1), design pattern unawareness (8.1.4), and challenges with private data (8.1.6)." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports negative findings: 'no method excelling universally' for knowledge editing (Section 5.3.2), that 'most existing efforts...overlook the unique structural and semantic properties of code' (RQ3 summary), and that current benchmarks are limited to Python/Java (Section 7.2)." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims to (1) survey 60 papers, (2) define hallucination in code context, (3) review mitigation strategies, (4) review benchmarks, and (5) discuss challenges. All five are substantiated across Sections 3-8." 120 }, 121 "causal_claims_justified": { 122 "applies": false, 123 "answer": false, 124 "justification": "The paper is a survey that synthesizes findings from reviewed papers. It does not make causal claims of its own about interventions or effects." 
125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 9 (Threats to validity) explicitly bounds the scope: 'we limit the scope to publications from 2022 to 2025, which may result in the omission of some earlier foundational work' and acknowledges that the search covered only three databases." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": false, 133 "answer": false, 134 "justification": "As a pure survey/taxonomy paper with no empirical results of its own, there are no findings for which alternative explanations would apply. The paper presents no original empirical claims requiring alternative explanation." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "The paper does not use any LLMs in its methodology. It is a manual systematic literature review." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "No prompting is used. The paper is a manual systematic review." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "No models or experiments are used. This is a systematic literature review." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 3 documents the full search and filtering pipeline: search strings are provided (Section 3.2), three databases are named, inclusion/exclusion criteria are listed (Section 3.3), inter-rater reliability is measured via Cohen's Kappa (0.59 pilot, 0.73 full), and Table 1 shows counts at each stage (67+46 total → 60+24 after dedup → 36+24 after review). Filtering criteria at each stage are stated." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 9 is a dedicated 'Threats to validity' section with two subsections (9.1 Literature Selection Validity, 9.2 Data Analysis Validity) spanning approximately one page." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 9.1 discusses specific threats: the temporal limit to 2022-2025 may miss earlier work, the search covered only three databases (ACM DL, IEEE Xplore, Web of Science), and terminology is not standardized. Section 9.2 notes that two independent annotators extracted data with cross-validation by a third reviewer, and acknowledges the risk of mis-classification due to inconsistent terminology." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 9.1 states: 'we limit the scope to publications from 2022 to 2025' and the search was restricted to three databases. Section 3.3 lists explicit exclusion criteria (grey literature excluded, extended journal versions preferred). The scope is bounded to code hallucination specifically." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "The list of 60 reviewed papers is not released as a downloadable dataset. Individual references are cited in-text but no supplementary data file or spreadsheet with the extracted data (Table 2) is made available." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3.2 provides the exact search strings used, names the three databases (ACM Digital Library, IEEE Xplore, Web of Science), and states the search date (May 25, 2025). Section 3.3 lists inclusion/exclusion criteria." 
191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants are involved. The paper reviews published literature, not human subjects." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Table 1 shows the full pipeline: total papers per database, total before deduplication (67 code + 46 NLP), after removing duplicates (61 + 46), and after review (36 + 24 = 60). Section 3.3 describes the two-author screening with Cohen's Kappa measurement and third-author tie-breaking." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding sources or acknowledgments section is present in the paper. There is no mention of grants, sponsors, or funding agencies." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: Harbin Institute of Technology, Shandong Agriculture and Engineering University, Monash University Malaysia, Tianjin University, Chongqing University, Singapore Management University, and Sun Yat-sen University. None of the authors appear to be affiliated with companies whose products are being reviewed." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed at all, so independence cannot be assessed. The absence of any funding disclosure is itself a concern." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement or financial disclosure is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "This is a systematic literature review. It does not evaluate any pre-trained model's capability on a benchmark." 
230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "This is a systematic literature review. No model evaluation is performed." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "This is a systematic literature review. No model evaluation is performed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants are involved in this systematic literature review." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants are involved." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants are involved." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants are involved. Paper inclusion/exclusion criteria are assessed under data_preprocessing_documented." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants or experimental conditions." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants or experimental conditions." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "This is a survey paper. It does not propose or run any method that incurs inference costs." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "This is a survey paper with no computational experiments." 
289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "Code hallucination is a growing research area, with 20 publications in 2024 compared to 5 in 2023.", 295 "evidence": "Figure 7(a) shows publication counts per year. Section 4.3 states: 'a notable rise in 2024 with 20 publications, compared to 5 in 2023.'", 296 "supported": "moderate" 297 }, 298 { 299 "claim": "ArXiv preprints account for the largest portion (54%) of code hallucination publications, followed by conferences (30%) and journals (16%).", 300 "evidence": "Figure 6(a) and Section 4.3: 'Preprints on arXiv account for the largest portion (54%), followed by conferences (30%) and journals (16%).'", 301 "supported": "strong" 302 }, 303 { 304 "claim": "Most code hallucination mitigation approaches still adopt methods similar to NLP, overlooking code's unique structural and semantic properties.", 305 "evidence": "RQ3 summary in Section 6: 'most of these approaches often overlook the unique structural and semantic properties of code' and 'Solutions specifically tailored to code hallucination...are still relatively scarce.'", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "Only one prior survey paper (1%) exists specifically on code hallucination, highlighting the lack of comprehensive overviews.", 310 "evidence": "Figure 6(b) and Section 4.3: 'Only one survey paper (1%) exists, further highlighting the lack of comprehensive overviews.'", 311 "supported": "moderate" 312 }, 313 { 314 "claim": "Current code hallucination benchmarks focus primarily on Python and Java, leaving low-resource or domain-specific languages underrepresented.", 315 "evidence": "Section 7.2 and Table 7 show all listed benchmarks use Python or Python+Java. 
Section 7.2: 'these benchmarks focus primarily on a limited set of PLs, e.g., Python or Java, leaving low-resource or domain-specific languages underrepresented.'", 316 "supported": "strong" 317 }, 318 { 319 "claim": "Inter-rater agreement for study selection was 'moderate' (Cohen's Kappa 0.59) in pilot and 'substantial' (0.73) for full assessment.", 320 "evidence": "Section 3.3: 'The agreement rate in the pilot study is \"moderate\" (0.59)...The agreement rate in the full assessment is \"substantial\" (0.73).'", 321 "supported": "strong" 322 } 323 ], 324 "methodology_tags": [ 325 "meta-analysis", 326 "qualitative" 327 ], 328 "key_findings": "This systematic literature review of 60 papers identifies three categories of code hallucination (knowledge/factuality, functional misalignment/faithfulness, and environmental compatibility) and traces their causes to model-level, prompt/input, training data, and project/environment factors. The review finds that most mitigation approaches still borrow from NLP without adequately addressing code-specific structural properties. Current benchmarks are narrowly focused on Python and Java, and the paper identifies key challenges including distinguishing hallucination from ordinary errors, the relationship between model confidence and hallucination, and the tension between generalization and factual accuracy.", 329 "red_flags": [ 330 { 331 "flag": "No quality assessment of reviewed studies", 332 "detail": "The survey does not perform any structured quality assessment or risk-of-bias evaluation of the 60 reviewed papers. It summarizes and categorizes findings but does not evaluate whether the underlying studies are methodologically sound. This risks presenting methodologically weak and strong sources as equally reliable." 333 }, 334 { 335 "flag": "Moderate inter-rater agreement", 336 "detail": "The Cohen's Kappa for full paper selection was 0.73 ('substantial'), but the pilot was only 0.59 ('moderate'). 
While this is reported transparently, the initial moderate agreement suggests the inclusion criteria were not sufficiently precise, introducing potential selection bias." 337 }, 338 { 339 "flag": "No replication package", 340 "detail": "Despite following Kitchenham's SLR guidelines, the paper provides no downloadable dataset, extraction forms, or supplementary materials. The list of 60 papers must be reconstructed from in-text citations, making independent verification difficult." 341 }, 342 { 343 "flag": "No funding or competing interests disclosure", 344 "detail": "The paper contains no acknowledgments section, no funding disclosure, and no competing interests statement. While the authors are all academic, the absence of any disclosure is a gap." 345 } 346 ], 347 "cited_papers": [ 348 { 349 "title": "Hallucination by Code Generation LLMs: Taxonomy, Benchmarks, Mitigation, and Challenges", 350 "authors": ["Yunseo Lee", "John Youngeun Song", "Dongsun Kim", "Jindae Kim", "Mijung Kim", "Jaechang Nam"], 351 "year": 2025, 352 "arxiv_id": "2504.20799", 353 "relevance": "Directly related prior survey on hallucinations in code generation LLMs, covering types, benchmarks, and mitigation strategies." 354 }, 355 { 356 "title": "Exploring and evaluating hallucinations in llm-powered code generation", 357 "authors": ["Fang Liu", "Yang Liu", "Lin Shi", "Houkun Huang", "Ruifeng Wang", "Zhen Yang", "Li Zhang"], 358 "year": 2024, 359 "arxiv_id": "2404.00971", 360 "relevance": "Proposes the HALLUCODE benchmark for evaluating hallucination recognition in LLM-generated code." 361 }, 362 { 363 "title": "CodeHalu: Investigating code hallucinations in llms via execution-based verification", 364 "authors": ["Yuchen Tian", "Weixiang Yan", "Qian Yang"], 365 "year": 2025, 366 "relevance": "Introduces the CodeHaluEval benchmark with 8,883 samples for systematic quantitative evaluation of code hallucinations." 
367 }, 368 { 369 "title": "LLM hallucinations in practical code generation: Phenomena, mechanism, and mitigation", 370 "authors": ["Ziyao Zhang", "Chong Wang", "Yanlin Wang"], 371 "year": 2025, 372 "relevance": "Empirical study of hallucination mechanisms and mitigation in repository-level code generation scenarios." 373 }, 374 { 375 "title": "Codemirage: Hallucinations in code generated by large language models", 376 "authors": ["Vibhor Agarwal", "Yulong Pei", "Salwa Alamir", "Xiaomo Liu"], 377 "year": 2024, 378 "arxiv_id": "2408.08333", 379 "relevance": "Introduces a code hallucination taxonomy and the CodeMirage benchmark dataset for evaluation." 380 }, 381 { 382 "title": "We Have a Package for You! A Comprehensive Analysis of Package Hallucinations by Code Generating LLMs", 383 "authors": ["J. Spracklen", "R. Wijewickrama", "A. H. M. N. Sakib", "A. Maiti", "M. Jadliwala"], 384 "year": 2024, 385 "relevance": "Systematically evaluates package hallucinations in LLM-generated code across 576K samples and 16 LLMs, revealing supply chain threats." 386 }, 387 { 388 "title": "A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions", 389 "authors": ["Lei Huang", "Weijiang Yu", "Weitao Ma"], 390 "year": 2025, 391 "relevance": "Comprehensive taxonomy and analysis of hallucinations in LLMs broadly, relevant as foundational NLP hallucination survey." 392 }, 393 { 394 "title": "Survey of hallucination in natural language generation", 395 "authors": ["Ziwei Ji", "Nayeon Lee", "Rita Frieske"], 396 "year": 2023, 397 "relevance": "Foundational survey on hallucination in NLG covering definitions, evaluation metrics, and mitigation techniques." 
398 }, 399 { 400 "title": "De-Hallucinator: Mitigating LLM Hallucinations in Code Generation Tasks via Iterative Grounding", 401 "authors": ["Aryaz Eghbali", "Michael Pradel"], 402 "year": 2024, 403 "arxiv_id": "2401.01701", 404 "relevance": "Proposes iterative grounding with API references to mitigate hallucinations in LLM code generation." 405 }, 406 { 407 "title": "Towards Mitigating API Hallucination in Code Generated by LLMs with Hierarchical Dependency Aware", 408 "authors": ["Yujia Chen", "Mingyu Chen", "Cuiyun Gao"], 409 "year": 2025, 410 "relevance": "Proposes MARIN framework for mitigating API hallucination through dependency-aware constrained decoding." 411 }, 412 { 413 "title": "Constrained decoding for secure code generation", 414 "authors": ["Yanjun Fu", "Ethan Baker", "Yu Ding", "Yizheng Chen"], 415 "year": 2024, 416 "arxiv_id": "2405.00218", 417 "relevance": "Proposes CODEGUARD+ benchmark and explores constrained decoding for secure code generation." 418 }, 419 { 420 "title": "Gorilla: Large Language Model Connected with Massive APIs", 421 "authors": ["S. G. Patil", "T. Zhang", "X. Wang", "J. E. Gonzalez"], 422 "year": 2023, 423 "relevance": "Fine-tuned LLaMA model for API call generation that outperforms GPT-4 and significantly reduces hallucinations." 424 }, 425 { 426 "title": "Bugs in large language models generated code: An empirical study", 427 "authors": ["Florian Tambon", "Arghavan Moradi-Dakhel", "Amin Nikanjam", "Foutse Khomh"], 428 "year": 2025, 429 "relevance": "Empirical study identifying 10 distinctive bug patterns in LLM-generated code including Hallucinated Object pattern." 430 }, 431 { 432 "title": "Identifying and Mitigating API Misuse in Large Language Models", 433 "authors": ["T. Y. Zhuo", "J. He", "J. Sun", "Z. Xing", "D. Lo", "J. Grundy", "X. Du"], 434 "year": 2025, 435 "relevance": "First comprehensive study of API misuse patterns in LLM-generated code with taxonomy of four misuse types." 436 } 437 ] 438 }