scan.json (26494B)
1 { 2 "paper": { 3 "title": "CodeMirage: Hallucinations in Code Generated by Large Language Models", 4 "authors": ["Vibhor Agarwal", "Yulong Pei", "Salwa Alamir", "Xiaomo Liu"], 5 "year": 2024, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2408.08333" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "The paper states in a footnote on page 1: 'The dataset will be released upon acceptance.' This footnote refers to the dataset and does not mention code. No GitHub or Zenodo link is provided. A promise of future release counts as NO per schema rules." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "Same footnote: 'The dataset will be released upon acceptance.' The CodeMirage dataset is not currently available. The base datasets (HumanEval, MBPP) are public, but the novel artifact — the hallucinated code snippets — is not released." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup is provided. The paper mentions using CodeLLaMA-7B-Instruct and OpenAI's API but does not specify library versions or environment details." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The paper describes methodology at a high level but does not give enough detail to replicate without the dataset or code." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Table 5 reports only point estimates for accuracy, macro-precision, macro-recall, and macro-F1. No confidence intervals, error bars, or ± notation is provided."
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims GPT-4 'performs the best' and 'beats the fine-tuned CodeBERT model by 6.15 percentage macro-F1 score' but does not perform any significance tests to support these comparative claims." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "While the paper provides raw metric differences (e.g., '6.15 percentage macro-F1 score' improvement), no formal effect sizes (Cohen's d, odds ratios, etc.) are reported. The baseline context for the raw difference is marginal: the 'from X to Y' values can be recovered only implicitly from the table, and the 6.15pp difference is stated in isolation without systematic effect size reporting across conditions." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is given for the dataset size (1,137 snippets) or the 200-sample human annotation subset. No power analysis is discussed." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Results appear to be from single runs. No standard deviation, variance across seeds, or multiple-run results are reported for any of the LLM experiments." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares multiple models: CodeLLaMA, GPT-3.5, GPT-4, and fine-tuned CodeBERT as baselines for code hallucination detection (Section 5.1, Table 5)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The baselines include GPT-4 (2023), CodeLLaMA (2023), and fine-tuned CodeBERT. For a 2024 paper, these are reasonably contemporary models representing both open-source and proprietary state-of-the-art."
69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "The paper claims in contributions (Section 1) to 'conduct comprehensive experiments and ablation studies' but the results section (Section 5.3) only compares different models — it does not ablate components of the detection approach (e.g., removing the one-shot example, varying prompt structure). Comparing different models is not an ablation study." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 5 reports accuracy, macro-precision, macro-recall, and macro-F1 scores for all models." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 4.2 describes human annotation of 200 code snippets by 5 annotators to validate the dataset quality, with Cohen's kappa reported (0.76). This evaluates the quality of the LLM-generated hallucinated code." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "For CodeBERT, an 80:20 stratified train/test split is used (Section 5.1). The LLM models (GPT-3.5, GPT-4, CodeLLaMA) are evaluated with one-shot prompting on the full dataset without any tuning on it, so there is no train/test leakage for those." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": false, 93 "justification": "Table 5 provides results broken down by base dataset (HumanEval vs. MBPP) but not by hallucination type. The paper does not report per-category (dead code, syntactic error, logical error, etc.) detection performance, which would be important for understanding where models struggle." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "The paper does not discuss specific failure cases or provide qualitative error analysis. It notes CodeLLaMA 'does not perform well' and GPT-3.5 has lower performance, but does not analyze why or show example failures." 
99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that CodeLLaMA achieves very low macro-F1 scores (0.0424 and 0.0271), and that GPT-3.5 also performs poorly. These are genuine negative results showing that not all LLMs are effective at this task." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims are generally supported: it introduces a taxonomy, proposes the CodeMirage benchmark, and reports GPT-4 performs best on HumanEval and gives 'comparable results' to CodeBERT on MBPP. Table 5 confirms the HumanEval claim (macro-F1 0.5512 vs 0.4897). On MBPP, GPT-4 trails CodeBERT in macro-F1 (0.5195 vs 0.6344), so 'comparable' is a generous reading of Table 5, although the abstract does hedge rather than claim GPT-4 wins there." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": false, 115 "justification": "The paper makes implicit causal claims through ablation language in contributions ('ablation studies to demonstrate the capabilities') and says that the gap between GPT-3.5 and GPT-4 is 'surprising,' implying model capability causes better detection. However, model comparisons alone without controlling for confounds (prompt sensitivity, token limits, etc.) do not adequately support causal inference." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title claims 'Hallucinations in Code Generated by Large Language Models' broadly, but the study only covers Python code from HumanEval and MBPP, generated by GPT-3.5. The taxonomy claims to be 'comprehensive' but is validated only on Python. The paper does not bound its generalization to Python or to these specific benchmarks." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for the observed results. 
For example, GPT-4's better performance could be partly due to the detection prompt being better suited to GPT-4, or the hallucinated code being generated by GPT-3.5 giving GPT-4 an advantage in detecting its patterns. No such confounds are discussed." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper uses 'GPT-3.5', 'GPT-4', and 'CodeLLaMA-7B-Instruct' but does not specify exact model versions or snapshot dates for the OpenAI models (e.g., 'gpt-3.5-turbo-0613'). CodeLLaMA-7B-Instruct is specified by size and variant, which is more specific but still lacks a precise checkpoint." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Full prompt templates are provided in Appendix A (generation prompts for each hallucination type) and Appendix B (detection prompt), including the layout in Tables 1 and 4. The actual example code within prompts is shown. The placeholder structure is clear (programming question inserted)." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "Section 5.2 reports temperature of 0.7 and max tokens of 256 for the detection experiments. However, hyperparameters for the generation phase (GPT-3.5 generating hallucinated code) are not reported — temperature, top-p, etc. for dataset construction are missing. CodeBERT fine-tuning hyperparameters (learning rate, epochs, etc.) are also not reported." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. The approach is straightforward one-shot prompting without tool use, feedback loops, or multi-step workflows." 
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper describes how hallucinated code was generated (Section 4.1) but does not describe any preprocessing, filtering, or quality control on the generated outputs beyond the human annotation validation on a 200-sample subset. It is unclear whether any generated outputs were discarded for quality reasons before the final 1,137 total." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated limitations or threats-to-validity section. The paper goes directly from results to 'Conclusions and Future Work' (Section 6) without discussing limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No threats to validity are discussed anywhere in the paper. There is no mention of potential biases from using GPT-3.5 to generate the hallucinated code, single-language limitation, or annotation limitations." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what its results do NOT show. It does not bound its claims to Python, to HumanEval/MBPP, or to GPT-3.5-generated code. The broad title suggests general applicability without such boundaries." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "The CodeMirage dataset is not released ('will be released upon acceptance'). Raw annotations, LLM outputs, and individual predictions are not available for independent verification." 
177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4.1 describes the dataset generation process in detail: base datasets (HumanEval, MBPP) are used, hallucination types are randomly assigned per problem, type-specific prompts are fed to GPT-3.5, and the output is collected. Section 4.2 describes the human annotation procedure." 182 }, 183 "recruitment_methods_described": { 184 "applies": true, 185 "answer": false, 186 "justification": "Section 4.2 mentions '5 human annotators, well-versed in Python programming' but does not describe how they were recruited, their backgrounds, whether they were colleagues/students, or whether this introduces any bias." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "The generation pipeline is described at a high level, but intermediate steps are unclear. For instance, it is not stated how many GPT-3.5 generations failed or were discarded, whether all 1,137 problems yielded exactly one hallucinated snippet each, or what the conversion rate was. The jump from 164+973=1,137 total problems to exactly 1,137 hallucinated snippets suggests one-to-one correspondence, but this is not explicitly confirmed." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "There is no acknowledgments section listing funding sources. The disclaimer states it was prepared by JP Morgan AI Research but does not disclose specific funding or grants." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: University of Surrey and JP Morgan AI Research (London, UK and New York, USA). The first author's work was done during an internship at JP Morgan AI Research, as noted in the footnote." 
204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "The work was done at JP Morgan AI Research, which is a corporate lab. JP Morgan uses LLMs for code generation in practice and has a stake in understanding and mitigating hallucinations. No discussion of whether this creates a conflict of interest. The paper does not evaluate JP Morgan products specifically, but the funder is not independent of the general research area." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "There is no competing interests statement. The disclaimer section is a legal boilerplate about JP Morgan but does not address financial conflicts of interest related to the findings." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper evaluates GPT-3.5, GPT-4, and CodeLLaMA on code from HumanEval and MBPP benchmarks but does not state the training data cutoff dates for any of these models." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "HumanEval (2021) and MBPP (2021) are well-known public benchmarks that could be in the training data of GPT-3.5, GPT-4, and CodeLLaMA. The paper does not discuss potential train/test overlap at all." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "HumanEval and MBPP were published in 2021, before the training cutoff of all models used. The fact that GPT-3.5 may have seen these problems could affect the quality and nature of the hallucinated code it generates, and GPT-4/CodeLLaMA may have seen the correct solutions, affecting detection performance. This contamination risk is not addressed." 
231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "The human annotation task in Section 4.2 is dataset validation by expert annotators, not a human subjects study requiring pre-registration." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "The annotators are performing a professional coding task (labeling code snippets), not participating as research subjects. IRB approval is not typically required for expert annotation tasks." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "The annotators are described as 'well-versed in Python programming' but since this is expert annotation for dataset validation rather than a human subjects study, detailed demographics are not required." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "This is expert annotation, not a human subjects study. Annotator selection criteria are relevant but fall outside the human_studies category." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human subjects experiment was conducted. The annotation task does not involve treatment/control conditions." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human subjects experiment was conducted requiring blinding." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human subjects experiment was conducted. The annotation task does not have meaningful attrition concerns." 
268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The paper does not report inference costs, tokens consumed, or wall-clock time for any of the experiments (neither dataset generation via GPT-3.5 nor hallucination detection via multiple LLMs)." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget is stated. The paper does not report GPU hours for CodeLLaMA/CodeBERT, API costs for GPT-3.5/GPT-4, or total training time for CodeBERT fine-tuning." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "GPT-4 performs the best on HumanEval dataset for code hallucination detection, beating fine-tuned CodeBERT by 6.15 percentage points in macro-F1.", 286 "evidence": "Table 5 shows GPT-4 achieves macro-F1 of 0.5512 on HumanEval vs CodeBERT's 0.4897, a difference of 6.15pp (Section 5.3).", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "GPT-4 gives comparable results to fine-tuned CodeBERT on MBPP dataset.", 291 "evidence": "Table 5 shows GPT-4 macro-F1 of 0.5195 on MBPP vs CodeBERT's 0.6344. GPT-4 has higher macro-Precision (0.6644 vs 0.6455) but lower macro-F1 (Section 5.3).", 292 "supported": "weak" 293 }, 294 { 295 "claim": "The CodeMirage dataset's automatically assigned hallucination type labels are reliable, with 0.81 accuracy against human annotations.", 296 "evidence": "Section 4.2 reports accuracy of 0.81 between automatic gold labels and majority human annotations on 200 sampled snippets, with Cohen's kappa of 0.76 among annotators.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "This is the first work studying hallucinations in LLM-generated code.", 301 "evidence": "Section 1 states 'To the best of our knowledge, this is the first attempt at studying hallucinations in the code generated by LLMs.' 
Related work (Section 2) discusses adjacent but not identical problems.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "There is a big performance gap between GPT-3.5 and GPT-4 for code hallucination detection.", 306 "evidence": "Table 5 shows GPT-3.5 achieves macro-F1 of 0.2654 (HumanEval) and 0.2092 (MBPP) vs GPT-4's 0.5512 and 0.5195, roughly 2x difference (Section 5.3).", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "The paper introduces CodeMirage, a benchmark of 1,137 GPT-3.5-generated hallucinated Python code snippets across 5 hallucination types (dead code, syntactic errors, logical errors, robustness issues, security vulnerabilities) built on HumanEval and MBPP. GPT-4 with one-shot prompting outperforms fine-tuned CodeBERT on HumanEval (macro-F1: 0.5512 vs 0.4897) but falls behind on MBPP (0.5195 vs 0.6344). CodeLLaMA-7B-Instruct performs very poorly on this task (macro-F1 < 0.05). Human annotators achieve Cohen's kappa of 0.76, and automatic labels align with human majority at 0.81 accuracy.", 312 "red_flags": [ 313 { 314 "flag": "No limitations section", 315 "detail": "The paper has no dedicated limitations or threats-to-validity section. This is a significant omission given the many potential confounds (single language, single generation model, contamination risk, etc.)." 316 }, 317 { 318 "flag": "Benchmark contamination risk unaddressed", 319 "detail": "HumanEval and MBPP (both 2021) are almost certainly in the training data of GPT-3.5, GPT-4, and CodeLLaMA. This could affect both the quality of hallucinated code generation and the detection performance. The paper does not discuss this at all." 320 }, 321 { 322 "flag": "Broad title with narrow scope", 323 "detail": "The title claims 'Hallucinations in Code Generated by Large Language Models' generally, but the study only covers Python, only two benchmarks (HumanEval, MBPP), and only GPT-3.5-generated hallucinated code. 
No bounds on generalization are stated." 324 }, 325 { 326 "flag": "Dataset not released", 327 "detail": "The CodeMirage dataset is described but not available ('will be released upon acceptance'). This prevents independent verification or replication." 328 }, 329 { 330 "flag": "No error bars or significance tests", 331 "detail": "All results are single-run point estimates with no uncertainty quantification. Claims of one model outperforming another rest on raw number comparisons without statistical testing." 332 }, 333 { 334 "flag": "GPT-4 'comparable' to CodeBERT claim is generous", 335 "detail": "On MBPP, GPT-4 macro-F1 (0.5195) trails CodeBERT (0.6344) by 11.5 percentage points. Calling this 'comparable' is generous framing. The paper highlights GPT-4's higher precision but the overall metric gap is substantial." 336 }, 337 { 338 "flag": "Generation model used for detection benchmark", 339 "detail": "GPT-3.5 generates the hallucinated code and is then evaluated as a detector. This creates a potential confound: GPT-3.5's poor detection performance may partly reflect difficulty detecting its own idiosyncratic patterns, while GPT-4 may benefit from having a different architecture." 340 } 341 ], 342 "cited_papers": [ 343 { 344 "title": "Evaluating large language models trained on code", 345 "authors": ["Mark Chen", "Jerry Tworek"], 346 "year": 2021, 347 "arxiv_id": "2107.03374", 348 "relevance": "Introduces HumanEval benchmark used as a base dataset in this paper, central to code generation evaluation." 349 }, 350 { 351 "title": "Program synthesis with large language models", 352 "authors": ["Jacob Austin", "Augustus Odena"], 353 "year": 2021, 354 "arxiv_id": "2108.07732", 355 "relevance": "Introduces MBPP benchmark used as a base dataset in this paper for code generation evaluation." 
356 }, 357 { 358 "title": "Code llama: Open foundation models for code", 359 "authors": ["Baptiste Roziere", "Jonas Gehring"], 360 "year": 2023, 361 "arxiv_id": "2308.12950", 362 "relevance": "Open-source code LLM used as a baseline for hallucination detection experiments." 363 }, 364 { 365 "title": "CodeBERT: A pre-trained model for programming and natural languages", 366 "authors": ["Zhangyin Feng", "Daya Guo"], 367 "year": 2020, 368 "relevance": "Pre-trained code model fine-tuned as a baseline for code hallucination detection." 369 }, 370 { 371 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 372 "authors": ["Jiawei Liu", "Chunqiu Steven Xia"], 373 "year": 2023, 374 "arxiv_id": "2305.01210", 375 "relevance": "Evaluates functional correctness of LLM-generated code with automated test generation, closely related to code quality assessment." 376 }, 377 { 378 "title": "LLM is like a box of chocolates: the non-determinism of ChatGPT in code generation", 379 "authors": ["Shuyin Ouyang", "Jie M Zhang"], 380 "year": 2023, 381 "arxiv_id": "2308.02828", 382 "relevance": "Studies non-determinism in LLM code generation, directly relevant to reliability and hallucination concerns." 383 }, 384 { 385 "title": "Purple llama CyberSecEval: A secure coding benchmark for language models", 386 "authors": ["Manish Bhatt", "Sahana Chennabasappa"], 387 "year": 2023, 388 "arxiv_id": "2312.04724", 389 "relevance": "Benchmark for cybersecurity evaluation of LLM coding assistants, related to security vulnerability hallucination type." 390 }, 391 { 392 "title": "Large language models of code fail at completing code with potential bugs", 393 "authors": ["Tuan Dinh", "Jinman Zhao"], 394 "year": 2023, 395 "arxiv_id": "2306.03438", 396 "relevance": "Studies buggy-code completion problem, directly relevant to understanding LLM code generation failures." 
397 }, 398 { 399 "title": "Survey of hallucination in natural language generation", 400 "authors": ["Ziwei Ji", "Nayeon Lee"], 401 "year": 2023, 402 "relevance": "Foundational survey on LLM hallucinations that provides the text hallucination framework this paper extends to code." 403 }, 404 { 405 "title": "Siren's song in the AI ocean: A survey on hallucination in large language models", 406 "authors": ["Yue Zhang", "Yafu Li"], 407 "year": 2023, 408 "arxiv_id": "2309.01219", 409 "relevance": "Comprehensive survey on LLM hallucination taxonomy, directly relevant to this paper's extension to code hallucinations." 410 }, 411 { 412 "title": "Software vulnerability and functionality assessment using LLMs", 413 "authors": ["Rasmus Ingemann Tuffveson Jensen", "Vali Tawosi", "Salwa Alamir"], 414 "year": 2024, 415 "arxiv_id": "2403.08429", 416 "relevance": "Uses LLMs to assess software vulnerabilities, related to the security vulnerability hallucination detection task." 417 }, 418 { 419 "title": "AI-assisted coding: Experiments with GPT-4", 420 "authors": ["Russell A. Poldrack", "Thomas Lu"], 421 "year": 2023, 422 "arxiv_id": "2304.13187", 423 "relevance": "Empirical study of GPT-4 for coding tasks, directly relevant to understanding LLM code generation capabilities." 424 } 425 ] 426 }