scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29170B)
      1 {
      2   "paper": {
      3     "title": "Beyond Functional Correctness: Exploring Hallucinations in LLM-Generated Code",
      4     "authors": ["Fang Liu", "Yang Liu", "Lin Shi", "Zhen Yang", "Li Zhang", "Xiaoli Lian", "Zhongqi Li", "Yuchi Ma"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2404.00971"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper provides a GitHub link to the replication package: https://github.com/Lorien1128/code_hallucination. This is mentioned in the abstract and conclusion."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The replication package is stated to be publicly available at the GitHub link, which includes the annotated dataset and codebook. The benchmarks used (HumanEval, CoderEval) are also public."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. Model URLs are provided but no dependency or environment setup details are given."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper does not include step-by-step reproduction instructions. It describes the methodology at a high level (e.g., greedy decoding, prompt format) but does not provide runnable scripts or a README with commands to replicate results."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper uses 95% confidence level with 5% margin of error for sampling (Section IV-A2 and VI-A), and reports a 95% Bootstrap confidence interval of (0.0113, 0.0725) for the mediation analysis (Section V-B2)."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper states 'we apply statistical tests on the above results that exhibit obvious differences, and the results demonstrate that the observed differences are significant' (end of Section V-A2). A Bootstrap mediation effect significance test is also performed (Section V-B2). Cohen's Kappa is reported for inter-annotator agreement."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported throughout: e.g., hallucination proportions by category (39.60% Requirement Conflicting, 25.50% Code Inconsistency, 34.90% Knowledge), pass@1 changes from mitigation strategies (Self-Refine: 7.32 decrease, CoT: 9.76 decrease, RAG: 11.74 increase), and hallucination reduction quantities in Table IV."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper justifies sampling at '95% confidence level and 5% margin of error' when sampling subsets for manual evaluation (Section VI-A) and for cause verification (Section V-B1: 'we randomly sampled 144 tasks (with 95% confidence level and 5% margin of error) from the 230 Python tasks')."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper uses greedy decoding (single deterministic run per model per problem), so results are single-run numbers. No variance across multiple runs or seeds is reported because the experimental design explicitly eliminates randomness. However, this means there is no measure of result stability across different conditions."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "For the hallucination mitigation experiment (Section VI-A), an 'Origin' baseline (original prompt without enhancement) is compared against Self-Refine, CoT, and RAG strategies, as shown in Table IV."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The mitigation strategies compared (Self-Refine, CoT, RAG) are contemporary techniques. The LLMs evaluated include recent models like GPT-4 and DeepSeek-R1 (2025). The baseline is the standard prompt approach."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study is conducted. The three mitigation strategies (Self-Refine, CoT, RAG) are tested independently but no component-level ablation is performed on any of them. The taxonomy construction methodology is not ablated either."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics are reported: pass@1 scores, hallucination counts and proportions by category, Cohen's Kappa for inter-annotator agreement, and the breakdown of hallucination types (Requirement Conflicting, Code Inconsistency, Knowledge) with their subcategories."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The core methodology is a manual thematic analysis by human annotators. 3,120 code samples were manually analyzed by 2 experts (pilot study) and 10 annotators (main study), totaling approximately 300 person-hours. Cohen's Kappa agreement scores ranged from 0.76 to 0.96."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a qualitative/manual analysis study, not a machine learning training study. There is no model being trained on a training set that would require a held-out test set."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Extensive per-category breakdowns are provided: hallucination distribution across 12 categories (Figures 6-9), per-model breakdowns, per-benchmark breakdowns (HumanEval vs CoderEval-Python vs CoderEval-Java), and per-category pass@1 scores (Table III)."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Multiple failure cases are discussed with concrete code examples: Figures 3-5 show specific hallucination examples (Behavior Conflicting, Useless Statements, Computer Theory knowledge conflicting). Figures 12-13 show cases where mitigation strategies partially failed."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Negative results are reported: Self-Refine and CoT strategies decreased pass@1 performance (7.32 and 9.76 points respectively, Table IV). The paper attributes this to smaller LLMs' constrained reasoning capacity and error propagation. CoT 'did not have a significant impact on the number of Code Inconsistency hallucinations.'"
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims a comprehensive taxonomy (3 primary, 12 specific categories — supported by Figure 2), analysis of distribution across LLMs and benchmarks (supported by Figures 6-9), cause and impact analysis (supported by Figure 10 and Sections V-B, V-C), and prompt enhancement exploration (supported by Table IV). All claims are substantiated in the results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims about hallucination causes (ambiguous requirements, lack of domain knowledge, model-related). These are justified through: (1) manual expert annotation of causes, (2) verification experiments where enriching prompts with domain knowledge reduced Library/Project hallucinations from 7 to 2, and (3) Bootstrap mediation analysis showing prompt length's effect is mediated by complexity. The causal claims from the mitigation experiments are supported by controlled comparisons."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly bounds its findings to Python and Java, two benchmarks (HumanEval, CoderEval), and the specific LLMs tested. The Threats to Validity section states: 'Our empirical study specifically targets Python and Java programming tasks' and 'it would be interesting to explore the hallucinations in other programming languages and code generation scenarios as well.'"
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations: the distinction between hallucinations and logical reasoning errors (Section II-B), the role of compiler optimizations in mitigating efficiency impacts (Section V-C), and that the pass@1 decrease from prompt enhancement 'mainly originates from samples that were originally free of hallucinations' rather than introducing new problems."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model versions are provided: CodeLlama-7B (with HuggingFace URL), GPT-4 'gpt-4-0125-preview version', DeepSeek-Coder-1.3B and 7B (with HuggingFace URLs), and DeepSeek-R1 671B (with API URL). Section IV-A1 provides footnotes with exact model links."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Prompt formats are shown in Figures 12-13 for the mitigation experiments. The paper states prompts consist of 'a brief role description and the default problem description from the dataset, with specific details available in the Appendix.' Full prompt templates with actual examples are provided."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section IV-A1 'LLM Decoding Parameter Settings' specifies greedy decoding (do_sample=False), which makes temperature and top_p irrelevant. 'All other parameters of these model use the recommended values in their official documentations.' For RAG: sliding window size 20 lines, step size 2 lines, max token length 1,000, OpenAI p50k_base tokenizer."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The experiments involve direct prompting of LLMs for code generation, with prompting strategies (Self-Refine, CoT, RAG) described in detail but none involve multi-step agentic workflows."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The data collection and preprocessing pipeline is documented in detail: benchmark selection (HumanEval, CoderEval), LLM selection and configuration, code generation process yielding 3,120 samples (Table I: 820 + 1,150 + 1,150), pilot analysis of 500 samples (~20%), then remaining 80% labeled by 10 annotators, with disagreement resolution procedures described."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VI-C 'Threats to Validity' provides a substantive discussion covering external validity (generalizability), internal validity (subjectivity of manual analysis), and construct validity (decoding strategies and prompts)."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats are discussed: the study is limited to Python and Java only, subjectivity in manual hallucination categorization despite using a codebook, greedy decoding may not represent all generation scenarios, prompt design choices may affect results. The paper also notes 'semantics are inherently continuous and lack clear-cut discrete divisions' as a fundamental challenge."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states scope boundaries: limited to Python and Java, two benchmarks (standalone and repository-level), specific LLMs tested, and NL2Code generation task only. It states 'hallucination occurrences and distribution may vary across different code-related tasks, such as code translation, unit test generation, program repair, code review' as areas not covered."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The replication package at GitHub includes the annotated dataset, codebook, and appendix. The paper states 'The replication package is available at https://github.com/Lorien1128/code_hallucination' which would include the raw annotations."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section IV-A describes data collection in detail: benchmarks selected (HumanEval and CoderEval), LLMs used with specific versions, decoding parameters (greedy), prompt design, and total samples collected (3,120). Table I provides the breakdown."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The 10 annotators are described as having 'at least two years of experience in Java or Python programming' and being 'familiar with code LLMs,' but the paper does not describe how they were recruited (e.g., students, hired workers, colleagues). The two expert annotators are identified as co-authors. Recruitment methods for the 10 annotators are not specified."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The full pipeline is documented: (1) code generation from 5 LLMs on 2 benchmarks → 3,120 samples, (2) pilot analysis of 500 samples by 2 experts → initial codebook, (3) remaining 2,620 samples labeled by 10 annotators, (4) disagreement resolution through discussion, (5) final taxonomy of 3 primary and 12 specific categories with 1,212 hallucinatory snippets from 1,134 samples."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The Acknowledgement section lists funding: 'National Natural Science Foundation of China Grants Nos. 62302021, 62577007, 62332001, 62502283, and the State Key Laboratory of Complex & Critical Software Environment (Grant Nos. CCSE-2025ZX-09 and CCSE-2024ZX-14).'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly listed: Beihang University, Shandong University, and Huawei Cloud Computing Technologies Co., Ltd. Two authors (Zhongqi Li and Yuchi Ma) are affiliated with Huawei."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The funders are the National Natural Science Foundation of China and a university state key laboratory — government and academic funding sources that do not have a direct financial stake in the specific hallucination findings of any particular LLM."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included in the paper. Two authors are from Huawei, which develops and deploys LLMs, but no disclosure of financial interests related to the findings is provided."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates LLMs on benchmarks (HumanEval, CoderEval) but does not state the training data cutoff dates for any of the models used. This is relevant because HumanEval was published in 2021 and could be in training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether HumanEval or CoderEval problems appeared in the training data of the LLMs. HumanEval (published 2021) is widely known to have contamination issues with post-2021 models, but this is not addressed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HumanEval was published in 2021 and is widely available online. All models evaluated (CodeLlama, GPT-4, DeepSeek-Coder, DeepSeek-R1) were trained after 2021 and may have seen these benchmark problems. The paper does not discuss this contamination risk, though the focus is on hallucination taxonomy rather than capability evaluation."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "The study uses human annotators for code analysis, not human participants in an experiment. The annotators are labeling code samples, not being studied as subjects. This is a manual analysis methodology, not a human subjects study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "The annotators are researchers performing manual code analysis, not human participants being studied. No IRB approval is needed for having research assistants label data."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are being studied. The annotators' qualifications are reported (years of programming experience) but they are research team members, not study subjects."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are being studied. Annotator selection criteria are described (programming experience) as part of methodology quality control, not as a human subjects protocol."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human subjects experiment. The study is a manual analysis of LLM-generated code, not an experiment with human participants assigned to conditions."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human subjects experiment. The study is a manual analysis of code samples, not an experiment requiring blinding."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are being studied. Annotator completion is implicit (all 3,120 samples were analyzed) but this is a workforce management issue, not participant attrition."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper generates 3,120 code samples across 5 LLMs (including GPT-4 API calls) and runs RAG retrieval, but does not report any API costs, tokens consumed, or computational costs for the code generation or mitigation experiments."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Person-hours for manual analysis are reported (~60 for pilot, ~240 for main study), but no computational budget (GPU hours, API costs, hardware used) is stated for running the LLMs or the mitigation experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "LLM-generated code exhibits three primary categories of hallucinations: Requirement Conflicting (39.60%), Code Inconsistency (25.50%), and Knowledge Hallucinations (34.90%), further divided into 12 specific categories.",
    286       "evidence": "Manual thematic analysis of 3,120 code samples by expert annotators, identifying 1,212 hallucinatory code snippets from 1,134 samples. Distribution presented in Figures 6-7 and Section IV-B.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Behavior Conflicting is the most prevalent hallucination type across all studied LLMs and benchmarks, except for DeepSeek-R1 where Library/Project hallucinations dominate (80.3%).",
    291       "evidence": "Per-model distribution in Figure 7 shows Behavior Conflicting ranging from 34.7% (GPT-4) to 46.1% (DeepSeek-Coder-7B) for non-reasoning models, while DeepSeek-R1 shows only 3.2% Behavior Conflicting but 80.3% Library/Project.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The frequency of code hallucinations is negatively correlated with model parameter size, as demonstrated by the DeepSeek models (DeepSeek-Coder-1.3B: 276, DeepSeek-Coder-7B: 232, DeepSeek-R1-671B: 188 total hallucinations).",
    296       "evidence": "Figure 6 shows decreasing hallucination counts from DeepSeek-Coder-1.3B (276) to 7B (232) to R1-671B (188). However, CodeLlama-7B (314) has more hallucinations than DeepSeek-Coder-1.3B (276), suggesting parameter size is not the only factor.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Model-related causes account for 80.86% of hallucinations, while Lack of Domain-Specific Knowledge accounts for 26.24% and Ambiguous or Incomplete requirements for 1.90%.",
    301       "evidence": "Manual cause annotation of hallucinatory samples (Section V-B1). Categories sum to more than 100% because a single hallucination can have multiple causes.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "Prompt enhancement techniques (Self-Refine, CoT, RAG) can reduce hallucinations, particularly Requirement Conflicting, though Self-Refine and CoT decrease pass@1 performance on smaller models.",
    306       "evidence": "Table IV shows hallucination reduction: Self-Refine (30→16 hallucinatory samples), CoT (30→18), RAG (62→29). However, pass@1 decreased for Self-Refine (-7.32) and CoT (-9.76) while RAG increased pass@1 (+11.74).",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "The impact of prompt length on hallucinatory code is largely mediated by prompt complexity, as shown by a Bootstrap mediation effect significance test with 95% CI of (0.0113, 0.0725).",
    311       "evidence": "Bootstrap mediation analysis in Section V-B2, with both confidence interval bounds greater than 0, indicating significant mediation effect. Supported by the heatmap in Figure 11.",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval", "qualitative"],
    316   "key_findings": "The paper establishes a comprehensive taxonomy of code hallucinations in LLM-generated code, comprising 3 primary categories (Requirement Conflicting, Code Inconsistency, Knowledge) and 12 specific types, based on manual analysis of 3,120 code samples from 5 LLMs. Behavior Conflicting is the most prevalent hallucination type, and model-related causes dominate (80.86%). The paper finds that larger models and reasoning-enhanced models (DeepSeek-R1) produce fewer hallucinations overall but still struggle with Library/Project knowledge gaps. Prompt enhancement techniques (Self-Refine, CoT, RAG) can reduce hallucinations but may decrease functional correctness for smaller models.",
    317   "red_flags": [
    318     {
    319       "flag": "Benchmark contamination not addressed",
    320       "detail": "HumanEval was published in 2021 and is widely available online. All evaluated models were trained after 2021 and may have memorized benchmark problems. The paper does not discuss how contamination could affect the hallucination analysis — a model that has memorized a solution may exhibit different hallucination patterns than one generating from scratch."
    321     },
    322     {
    323       "flag": "Industry (Huawei) affiliation without a competing-interests statement",
    324       "detail": "Two authors are from Huawei Cloud Computing Technologies, and the study evaluates models including DeepSeek (a Chinese competitor). While this does not appear to bias the results, no competing interests statement is provided."
    325     },
    326     {
    327       "flag": "Residual 'Model-related Causes' category is a catch-all",
    328       "detail": "The 'Model-related Causes' category (80.86%) is assigned when 'the prompt shows no obvious flaws.' This is essentially a residual category that attributes hallucinations to the model by default, making it difficult to draw actionable conclusions about root causes."
    329     },
    330     {
    331       "flag": "Mitigation experiments only on smallest model",
    332       "detail": "The hallucination mitigation experiments (Self-Refine, CoT, RAG) were conducted only on DeepSeek-Coder-1.3B, the smallest and weakest model. The paper acknowledges that pass@1 decreased because of 'smaller LLMs' constrained reasoning capacity.' Results may not generalize to the larger models where they would be most practically useful."
    333     }
    334   ],
    335   "cited_papers": [
    336     {
    337       "title": "Evaluating large language models trained on code",
    338       "authors": ["Mark Chen", "Jerry Tworek"],
    339       "year": 2021,
    340       "arxiv_id": "2107.03374",
    341       "relevance": "Introduces HumanEval benchmark used in this study and the pass@k evaluation methodology for code generation."
    342     },
    343     {
    344       "title": "CodeMirage: Hallucinations in Code Generated by Large Language Models",
    345       "authors": ["Vibhor Agarwal", "Yulong Pei", "Salwa Alamir", "Xiaomo Liu"],
    346       "year": 2024,
    347       "arxiv_id": "2408.08333",
    348       "relevance": "Proposes a taxonomy of code defects as hallucinations, directly comparable to this paper's hallucination taxonomy."
    349     },
    350     {
    351       "title": "We have a package for you! A comprehensive analysis of package hallucinations by code generating LLMs",
    352       "authors": ["Joseph Spracklen"],
    353       "year": 2024,
    354       "arxiv_id": "2406.10279",
    355       "relevance": "Studies package hallucinations specifically, a subtype of the knowledge hallucinations explored in this paper."
    356     },
    357     {
    358       "title": "Collu-Bench: A benchmark for predicting language model hallucinations in code",
    359       "authors": ["Nan Jiang", "Qi Li", "Lin Tan", "Tianyi Zhang"],
    360       "year": 2024,
    361       "arxiv_id": "2410.09997",
    362       "relevance": "Proposes a benchmark for predicting code hallucinations, complementary to this paper's taxonomy and analysis."
    363     },
    364     {
    365       "title": "LLM Hallucinations in Practical Code Generation: Phenomena, Mechanism, and Mitigation",
    366       "authors": ["Ziyao Zhang", "Yanlin Wang", "Chong Wang", "Jiachi Chen", "Zibin Zheng"],
    367       "year": 2024,
    368       "arxiv_id": "2409.20550",
    369       "relevance": "Studies hallucinations in repository-level code generation, closely related work focusing on practical code generation scenarios."
    370     },
    371     {
    372       "title": "CodeHalu: Code Hallucinations in LLMs Driven by Execution-Based Verification",
    373       "authors": ["Yuchen Tian"],
    374       "year": 2024,
    375       "relevance": "Categorizes code hallucinations using execution-based verification, offering an alternative taxonomy to this paper."
    376     },
    377     {
    378       "title": "What's wrong with your code generated by large language models? An extensive study",
    379       "authors": ["Shihan Dou"],
    380       "year": 2024,
    381       "arxiv_id": "2407.06153",
    382       "relevance": "Analyzes bug types in LLM-generated code, providing a complementary perspective on code quality issues."
    383     },
    384     {
    385       "title": "Bugs in large language models generated code",
    386       "authors": ["Florian Tambon"],
    387       "year": 2024,
    388       "arxiv_id": "2403.08937",
    389       "relevance": "Analyzes bug patterns in LLM-generated code and their prevalence, relevant to understanding code quality."
    390     },
    391     {
    392       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    393       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    394       "year": 2023,
    395       "relevance": "Introduces EvalPlus for rigorous evaluation of LLM code generation, used in this paper for HumanEval evaluation."
    396     },
    397     {
    398       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    399       "authors": ["Daya Guo"],
    400       "year": 2025,
    401       "arxiv_id": "2501.12948",
    402       "relevance": "Describes DeepSeek-R1, one of the LLMs evaluated in this study, showing how reasoning training affects hallucination patterns."
    403     },
    404     {
    405       "title": "A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions",
    406       "authors": ["Lei Huang"],
    407       "year": 2023,
    408       "arxiv_id": "2311.05232",
    409       "relevance": "Major survey on LLM hallucinations in NLG that provides the theoretical foundation for this paper's code hallucination taxonomy."
    410     },
    411     {
    412       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    413       "authors": ["Hao Yu"],
    414       "year": 2024,
    415       "relevance": "Introduces the CoderEval benchmark used in this study for repository-level code generation evaluation."
    416     }
    417   ]
    418 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs