scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24035B)
      1 {
      2   "paper": {
      3     "title": "LLM Hallucinations in Practical Code Generation: Phenomena, Mechanism, and Mitigation",
      4     "authors": ["Ziyao Zhang", "Yanlin Wang", "Chong Wang", "Jiachi Chen", "Zibin Zheng"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2409.20550"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval", "qualitative"],
     12   "key_findings": "LLM code hallucinations fall into three categories: Task Requirement Conflicts (43.5%), Factual Knowledge Conflicts (31.9%), and Project Context Conflicts (24.6%), with eight subcategories. Task Requirement Conflicts are the most prevalent across all six studied models. Four root causes are identified: training data quality, intention understanding capacity, knowledge acquisition capacity, and repository-level context awareness. A lightweight RAG-based mitigation consistently improves Pass@1 across all models, though improvements are modest (0.87-3.05 percentage points).",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "A replication package URL is provided: https://github.com/DeepSoftwareAnalytics/LLMCodingHallucination, mentioned in the abstract and contributions section."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses the publicly available CoderEval benchmark and states the replication package includes 'code, data, and experimental results.'"
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications (requirements.txt, dependency versions, etc.) are described in the paper. Model sizes are listed but no runtime environment details."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided in the paper itself. The replication package is linked but no README or reproduction steps are described."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Table I reports only point estimates for Pass@1 with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims RAG-based mitigation improves all models but provides no significance tests for these differences."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Table I reports absolute percentage point improvements for each model (e.g., 'CodeLlama 2.17% → 5.22% (↑3.05%)'), providing baseline context for the effect size."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification for using 230 coding tasks from CoderEval. No power analysis or discussion of whether this is sufficient for the claims made."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance or standard deviation reported across the 10 generated code snippets per task. Only aggregate Pass@1 numbers shown."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The RAG-based mitigation is compared against a 'Raw Method' baseline (docstring + function signature only) in Table I."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The only baseline is the raw (no-RAG) method. No comparison against other hallucination mitigation approaches or more sophisticated RAG methods. RepoCoder is referenced but not compared against directly."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study on the RAG-based mitigation (e.g., varying number of retrieved snippets, window size, similarity method)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "Only Pass@1 is used to evaluate the mitigation method. No other metrics (Pass@k for k>1, hallucination rate reduction, etc.) are reported."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The hallucination taxonomy is constructed through manual open coding by authors and three volunteers with extensive Python experience (Section III-C). This constitutes human evaluation of LLM-generated code outputs."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The CoderEval benchmark provides test cases for evaluating generated code correctness. The evaluation uses the standard benchmark test set."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Figure 2 shows hallucination distribution across categories, and Figure 11 breaks down hallucination types per model. Detailed subcategory percentages are provided throughout Section IV-A."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Extensive failure case analysis with code examples in Figures 3-10, showing specific hallucination instances across all categories."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "All RAG results show improvement. No discussion of cases where RAG failed or made things worse, or configurations that didn't work."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims a hallucination taxonomy, distribution analysis, four root causes, and RAG mitigation effectiveness — all supported by Sections IV and V."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper identifies four 'root causes' of hallucinations (Section IV-C) but these are speculative analyses without controlled experiments. Claims like 'training data quality causes hallucinations' are causal but not experimentally validated."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'Practical Code Generation' broadly but results are limited to Python tasks from CoderEval (230 tasks). The paper discusses this in threats to validity but the title and framing overreach."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The root cause analysis in Section IV-C presents four factors as the explanation without considering alternatives. No discussion of whether the taxonomy itself or annotation process could explain the distribution patterns."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper uses Pass@1 to measure 'correctness' and manual hallucination counts to measure 'hallucination prevalence' but does not discuss whether these proxies capture the full scope of practical code generation quality they claim to study."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model versions are stated: GPT-3.5-Turbo, CodeGen-350M-Mono, PanGu-α-2.6B, DeepSeekCoder-6.7B, CodeLlama-7b-Python-hf, StarCoder2-7B (Section III-B)."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper describes that docstrings and function signatures are provided as input, but does not show the actual prompt templates or formatting used when querying the LLMs."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Temperature is set to 0.6 with nucleus sampling, following CoderEval settings (Section III-A). RAG uses window size 20 lines, sliding step 2 lines, top-10 retrieval."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The RAG method is a simple retrieval-then-generate pipeline, not an agentic system."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section III-C describes the open coding process: 10% initial sample → preliminary taxonomy → iterative labeling of remaining 90%. Section V-B describes the RAG corpus construction with sliding window parameters."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section VII 'Threats to Validity' discusses external, internal, and construct validity threats."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: Python-only language limitation, 230-task dataset scale, absence of formal inter-rater reliability measure, model bias during annotation (mitigated by mixing results). These are specific to this study."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section VII explicitly states the study focused on Python only and acknowledges 'constructing hallucination taxonomies for other programming languages and comparing them with our current taxonomy is a valuable future direction.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The replication package at the GitHub URL claims to include 'code, data, and experimental results,' which would include the annotation data."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section III describes the dataset (CoderEval, 230 Python tasks), the LLM generation process (10 snippets per task, temperature 0.6), and the annotation methodology (open coding with iterative refinement)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "Three 'newly invited volunteers with extensive Python programming experience' performed annotation, but how they were recruited is not described. Only their experience levels are mentioned (two with 10+ years, one with 4 years)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section III-C documents the full pipeline: initial 10% sample → preliminary taxonomy → full annotation by three volunteers → iterative taxonomy refinement. The RAG pipeline is also documented in Section V-B."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Sun Yat-sen University and Nanyang Technological University."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are provided for any of the six models. CoderEval tasks could have been in training data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether CoderEval tasks or their source repositories appeared in any model's training data."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "CoderEval is sourced from public GitHub repositories. No discussion of whether these repositories were in the training data of the evaluated models."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human-subjects study was conducted. The annotators performed open coding on LLM outputs; they were not study participants."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human subjects research. Annotators are researchers performing analysis."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. Annotators' experience is mentioned but this is not a human subjects study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference costs, API costs, or latency reported for any of the six models or the RAG mitigation approach."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No computational budget stated. Running 6 models × 230 tasks × 10 generations each is significant but unquantified."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No seed sensitivity analysis. Results are reported as single aggregate numbers with no cross-seed variation."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "The paper states 10 code snippets are generated per task using nucleus sampling with temperature 0.6 (Section III-A)."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "RAG parameters (window size 20, step 2, top-10) are taken from RepoCoder without stating whether alternatives were tried."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "RAG configuration adopted from RepoCoder without justification for why those specific parameters are optimal for this task."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors propose and evaluate their own RAG mitigation without acknowledging potential bias in the evaluation."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "RAG adds retrieval overhead to each generation but no compute comparison between raw and RAG methods is provided."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper explicitly discusses why CoderEval is more appropriate than HumanEval/MBPP for studying practical hallucinations, citing its repository-level context and non-standalone functions (Section I, II-A)."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding involved. Models are queried directly with prompts."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of temporal leakage. CoderEval tasks come from public repos that could have been in training data of all models."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "The RAG evaluation excludes ground-truth code from the retrieval corpus: 'code lines containing or following the ground-truth code are excluded from the scanning process' (Section V-B)."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether CoderEval's source repositories overlap with model training data."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No concrete leakage detection method applied (no canary strings, membership inference, or decontamination)."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "LLM hallucinations in code generation fall into three major categories with eight subcategories: Task Requirement Conflicts (43.53%), Factual Knowledge Conflicts (31.91%), and Project Context Conflicts (24.56%).",
    364       "evidence": "Manual open coding on 6 LLMs × 230 tasks × 10 snippets, iterative taxonomy construction described in Section III-C, distribution in Figure 2.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "Task Requirement Conflicts are the most prevalent hallucination type across all six models.",
    369       "evidence": "Figure 11 shows distribution across models. All models show Task Requirement Conflicts as the largest category.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "RAG-based mitigation consistently improves Pass@1 for all six studied LLMs.",
    374       "evidence": "Table I shows absolute Pass@1 improvements in percentage points: CodeGen +1.31, PanGu-α +1.70, DeepSeekCoder +0.87, CodeLlama +3.05, StarCoder2 +2.57, ChatGPT +2.21.",
    375       "supported": "weak"
    376     },
    377     {
    378       "claim": "Four root causes contribute to hallucinations: training data quality, intention understanding capacity, knowledge acquisition capacity, and repository-level context awareness.",
    379       "evidence": "Qualitative analysis in Section IV-C with illustrative examples but no controlled experiments isolating each factor.",
    380       "supported": "weak"
    381     }
    382   ],
    383   "red_flags": [
    384     {
    385       "flag": "No inter-rater reliability measure",
    386       "detail": "The paper acknowledges in Section VII that there is 'no formal inter-rater reliability measure for annotating hallucinations.' For a taxonomy paper, inter-rater agreement (e.g., Cohen's kappa) is essential to validate the classification scheme."
    387     },
    388     {
    389       "flag": "Very low absolute performance",
    390       "detail": "Pass@1 scores range from 0.04% to 12.61%. At these levels, small absolute improvements (e.g., +0.87%) may not be meaningful and could be within noise margins, but no statistical tests are provided."
    391     },
    392     {
    393       "flag": "No contamination analysis",
    394       "detail": "CoderEval is sourced from public GitHub repositories. All evaluated models may have trained on these exact repositories, which would fundamentally affect the hallucination analysis."
    395     },
    396     {
    397       "flag": "Root cause analysis is speculative",
    398       "detail": "The four identified 'root causes' are presented as findings but are qualitative interpretations without controlled experiments to isolate causal factors."
    399     }
    400   ],
    401   "cited_papers": [
    402     {
    403       "title": "Evaluating large language models trained on code",
    404       "authors": ["M. Chen", "J. Tworek"],
    405       "year": 2021,
    406       "arxiv_id": "2107.03374",
    407       "relevance": "Introduces HumanEval benchmark, foundational code generation evaluation."
    408     },
    409     {
    410       "title": "CoderEval: A benchmark of pragmatic code generation with generative pre-trained models",
    411       "authors": ["H. Yu", "B. Shen"],
    412       "year": 2024,
    413       "relevance": "The primary benchmark used in this study; evaluates repository-level code generation."
    414     },
    415     {
    416       "title": "ClassEval: A manually-crafted benchmark for evaluating LLMs on class-level code generation",
    417       "authors": ["X. Du", "M. Liu"],
    418       "year": 2023,
    419       "arxiv_id": "2308.01861",
    420       "relevance": "Related benchmark for class-level code generation evaluation."
    421     },
    422     {
    423       "title": "DeepSeek-Coder: When the large language model meets programming",
    424       "authors": ["D. Guo", "Q. Zhu"],
    425       "year": 2024,
    426       "arxiv_id": "2401.14196",
    427       "relevance": "One of the six LLMs evaluated; key open-source code model."
    428     },
    429     {
    430       "title": "Code Llama: Open foundation models for code",
    431       "authors": ["B. Rozière", "J. Gehring"],
    432       "year": 2023,
    433       "arxiv_id": "2308.12950",
    434       "relevance": "One of the six LLMs evaluated; Meta's code generation model family."
    435     },
    436     {
    437       "title": "StarCoder 2 and The Stack v2: The next generation",
    438       "authors": ["A. Lozhkov", "R. Li"],
    439       "year": 2024,
    440       "arxiv_id": "2402.19173",
    441       "relevance": "One of the evaluated models; open-source code LLM."
    442     },
    443     {
    444       "title": "RepoCoder: Repository-level code completion through iterative retrieval and generation",
    445       "authors": ["F. Zhang", "B. Chen"],
    446       "year": 2023,
    447       "arxiv_id": "2303.12570",
    448       "relevance": "Basis for the RAG-based mitigation approach used in this paper."
    449     },
    450     {
    451       "title": "Exploring and evaluating hallucinations in LLM-powered code generation",
    452       "authors": ["F. Liu", "Y. Liu"],
    453       "year": 2024,
    454       "arxiv_id": "2404.00971",
    455       "relevance": "Prior work on code hallucination taxonomy for standalone functions; this paper extends to repository-level."
    456     },
    457     {
    458       "title": "A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions",
    459       "authors": ["L. Huang", "W. Yu"],
    460       "year": 2023,
    461       "arxiv_id": "2311.05232",
    462       "relevance": "Foundational survey on LLM hallucinations in the general domain."
    463     },
    464     {
    465       "title": "Quality and trust in LLM-generated code",
    466       "authors": ["C. Spiess", "D. Gros"],
    467       "year": 2024,
    468       "arxiv_id": "2402.02047",
    469       "relevance": "Studies trust and quality issues in LLM-generated code, directly relevant to reliability concerns."
    470     },
    471     {
    472       "title": "ClarifyGPT: Empowering LLM-based code generation with intention clarification",
    473       "authors": ["F. Mu", "L. Shi"],
    474       "year": 2023,
    475       "arxiv_id": "2310.10996",
    476       "relevance": "Alternative approach to mitigating code generation errors through requirement clarification."
    477     },
    478     {
    479       "title": "EvoCodeBench: An evolving code generation benchmark aligned with real-world code repositories",
    480       "authors": ["J. Li", "G. Li"],
    481       "year": 2024,
    482       "arxiv_id": "2404.00599",
    483       "relevance": "Related repository-level code generation benchmark with temporal evolution."
    484     }
    485   ]
    486 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs