scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24886B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM Hallucinations in Practical Code Generation: Phenomena, Mechanism, and Mitigation",
      6     "authors": [
      7       "Ziyao Zhang",
      8       "Yanlin Wang",
      9       "Chong Wang",
     10       "Jiachi Chen",
     11       "Zibin Zheng"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv",
     15     "arxiv_id": "2409.20550",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All four abstract claims—taxonomy with 3 categories/8 subtypes, distribution analysis, root cause identification, and RAG mitigation—are supported by corresponding empirical sections in the paper.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The RAG-vs-Raw comparison is a controlled experiment where the only variable is retrieval augmentation, supporting the causal claim that RAG improves Pass@1; however, no statistical significance tests are run.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The study uses only 230 Python tasks from CoderEval, yet the paper title and conclusions speak broadly about 'practical code generation' without consistently bounding findings to this narrow scope.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The four root causes (training data quality, intention understanding, knowledge acquisition, context awareness) are presented as conclusions rather than hypotheses, with no competing explanations considered for the observed model differences.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Pass@1 is clearly defined as functional correctness measured by test case pass rates, and the hallucination taxonomy is derived from direct comparison to ground-truth code; the paper does not conflate these.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section VII 'Threats to Validity' is a dedicated section covering external, internal, and construct validity threats.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are identified: Python-only language focus, limited dataset scale (230 tasks), absence of a formal inter-rater reliability measure, and potential model bias in annotation.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly limits scope to Python, acknowledges that the RAG mitigation is 'preliminary' and a 'pilot study,' and notes 'modest' improvement levels.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly disclosed: Sun Yat-sen University and Nanyang Technological University.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funder is disclosed, so this criterion does not apply.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Hallucination is defined in Section II with reference to NLP literature (input-conflicting, context-conflicting, fact-conflicting categories), and repository-level code generation is distinguished from standalone function generation.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Five explicit contributions are bullet-pointed at the end of the introduction: taxonomy, distribution analysis, root cause analysis, RAG mitigation, and replication package.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II substantively compares this work to Liu et al.'s standalone-function hallucination study, explaining how the repository-level focus and holistic analysis approach differ from prior work.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The abstract explicitly links to a GitHub replication package at github.com/DeepSoftwareAnalytics/LLMCodingHallucination containing code, data, and experimental results.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The replication package includes annotated data; the base CoderEval dataset is also publicly available.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or dependency specification is mentioned; only model names and generation hyperparameters are provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper only provides a GitHub link; no step-by-step reproduction instructions are included in the paper itself.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Table I reports Pass@1 scores and absolute improvements with no confidence intervals or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to the Raw vs RAG-based mitigation comparisons despite quantitative claims of improvement.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Table I reports absolute Pass@1 improvements (e.g., ↑1.31%, ↑3.05%) against baseline values, providing interpretable effect sizes.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 230-task dataset is adopted from CoderEval without power analysis or justification for adequacy; the paper itself lists this as a threat to validity.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread is reported across the 10 generated code snippets per task.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "The 'Raw method' (docstrings + function signatures only) serves as the baseline against which RAG-based mitigation is compared.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The RAG approach is based on RepoCoder (2023), a contemporary and relevant method for repository-level code generation.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation of RAG components is performed; variables like window size, sliding step, number of retrieved snippets, or similarity metric are not varied systematically.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "Only Pass@1 is used to evaluate the RAG mitigation; no additional metrics such as BLEU, CodeBLEU, or hallucination rate reduction are reported.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "Three volunteer annotators with Python experience independently labeled all generated code snippets for hallucination taxonomy, constituting systematic human evaluation of model outputs.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "CoderEval's test cases are used to evaluate functional correctness, and the RAG construction explicitly excludes ground-truth code lines to prevent answer leakage.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Figure 2 shows hallucination distribution by all 8 subtypes, and Figure 11 breaks down hallucination counts per category for each of the 6 models.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Figures 3–10 provide 8 detailed code examples illustrating specific hallucination failures with ground-truth comparison.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper explicitly acknowledges that 'performance improvement in our experiments is modest' and characterizes the RAG mitigation as a 'pilot study' with significant limitations.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Exact model versions are specified: GPT-3.5-Turbo, CodeGen-350M-Mono, PanGu-α-2.6B, DeepSeekCoder-6.7B, CodeLlama-7b-Python-hf, StarCoder2-7B.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The paper states the Raw method uses 'basic docstrings and function signatures' but no actual prompt templates or system instructions are shown.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Temperature (0.6), sampling strategy (nucleus sampling), RAG window size (20 lines), and sliding step (2 lines) are all reported.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used; models are evaluated in direct generation mode.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "The three-stage annotation pipeline (initial open coding on 10%, preliminary taxonomy, full taxonomy with iterative refinement) is documented in Section III-C.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "The replication package at GitHub explicitly includes data and experimental results alongside code.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "CoderEval dataset selection is described, and the generation process (10 snippets per task via nucleus sampling) is documented.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "Three volunteer annotators are described as having 'extensive Python programming experience' with specific experience levels (two >10 years, one 4 years).",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline from initial 10% sampling through preliminary taxonomy to full annotation with iterative refinement is described in Section III-C.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs are not stated for any of the 6 evaluated models.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "Whether models were trained on CoderEval tasks or related repository code is not discussed; only RAG answer leakage (not pre-training contamination) is addressed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "CoderEval was released in 2024 and some models may have been trained on data from these repositories; this possibility is not discussed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human subjects study in the IRB sense; annotation is internal methodology work.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants study requiring ethics review.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or API cost figures are reported for running 6 LLMs × 230 tasks × 10 generations.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No total compute budget or hardware specifications are provided.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "LLM code hallucinations can be taxonomized into 3 major categories (Task Requirement Conflicts 43.53%, Factual Knowledge Conflicts 31.91%, Project Context Conflicts 24.56%) with 8 subtypes",
    375       "evidence": "Manual open coding of 1,380+ code snippets from 6 LLMs on 23 initial tasks, then full annotation of all 230 tasks; distribution shown in Figure 2",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Task Requirement Conflicts are the most prevalent hallucination type across all studied LLMs",
    380       "evidence": "Figure 11 shows Task Requirement Conflicts dominate for all 6 models; stated as RQ2 finding",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "RAG-based mitigation consistently improves Pass@1 for all 6 LLMs",
    385       "evidence": "Table I shows improvements ranging from +0.87% (DeepSeekCoder) to +3.05% (CodeLlama); the authors themselves call this 'modest'",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Four factors contribute to hallucinations: training data quality, intention understanding, knowledge acquisition capacity, and repository-level context awareness",
    390       "evidence": "Section IV-C presents these as inferred root causes based on the observed hallucination patterns, supported by illustrative examples rather than controlled experiments",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "Models trained on diverse text+code corpora (DeepSeekCoder, CodeLlama) produce fewer Task Requirement Conflict hallucinations than code-only trained models (CodeGen, StarCoder2)",
    395       "evidence": "Figure 11 shows lower Task Requirement Conflict counts for DeepSeekCoder and CodeLlama; authors attribute this to training corpora composition",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "API Knowledge Conflicts are the dominant subtype of Factual Knowledge Conflicts, accounting for 20.41% of all hallucinations",
    400       "evidence": "Percentage breakdown reported in Section IV-A2 based on manual annotation results",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "empirical",
    406     "benchmark-eval",
    407     "case-study",
    408     "qualitative"
    409   ],
    410   "key_findings": "The study establishes a hallucination taxonomy for repository-level LLM code generation with three categories: Task Requirement Conflicts (43.53%), Factual Knowledge Conflicts (31.91%), and Project Context Conflicts (24.56%). API Knowledge Conflicts are the most frequent single subtype (20.41%). A lightweight RAG mitigation using bag-of-words similarity retrieval from the code repository consistently improves Pass@1 across all 6 models, though improvements are modest (less than 3.5 percentage points absolute). The paper identifies four contributing factors to hallucinations but tests only one mitigation approach, leaving the root cause hypotheses largely unvalidated experimentally.",
    411   "red_flags": [
    412     {
    413       "flag": "No statistical significance testing",
    414       "detail": "All comparisons between Raw and RAG-based methods in Table I lack significance tests or confidence intervals; the improvements may not be statistically reliable."
    415     },
    416     {
    417       "flag": "No inter-rater reliability metric",
    418       "detail": "The internal validity threat section acknowledges 'the absence of formal inter-rater reliability measure' (e.g., Cohen's kappa) for the annotation, relying only on discussion and consensus."
    419     },
    420     {
    421       "flag": "Overgeneralized title and framing",
    422       "detail": "The paper studies only 230 Python tasks from one benchmark (CoderEval) but claims findings about 'practical code generation' broadly."
    423     },
    424     {
    425       "flag": "Very small absolute improvements",
    426       "detail": "RAG improvements range from +0.87% to +3.05% Pass@1, with some models (CodeGen, StarCoder2) starting at 1.30% and 0.04% respectively, making the practical significance of findings questionable."
    427     },
    428     {
    429       "flag": "Root causes untested",
    430       "detail": "The four identified root causes are plausible hypotheses derived from examples rather than experimentally validated causal factors."
    431     },
    432     {
    433       "flag": "Contamination not addressed",
    434       "detail": "Whether any of the 6 evaluated models were pre-trained on code from the CoderEval repositories is not discussed, leaving benchmark contamination as an unaddressed threat."
    435     }
    436   ],
    437   "cited_papers": [
    438     {
    439       "title": "CoderEval: A Benchmark of Pragmatic Code Generation with Generative Pre-trained Models",
    440       "relevance": "Primary evaluation dataset used in this study; establishes the repository-level code generation task"
    441     },
    442     {
    443       "title": "Exploring and Evaluating Hallucinations in LLM-Powered Code Generation",
    444       "relevance": "Directly compared prior work on hallucinations in standalone function generation that this paper extends to repository-level settings"
    445     },
    446     {
    447       "title": "RepoCoder: Repository-Level Code Completion through Iterative Retrieval and Generation",
    448       "relevance": "Basis for the RAG mitigation approach implemented in this paper"
    449     },
    450     {
    451       "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions",
    452       "relevance": "Foundational taxonomy of hallucination types (input-conflicting, context-conflicting, fact-conflicting) that the code hallucination taxonomy maps onto"
    453     },
    454     {
    455       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    456       "relevance": "Standard code generation benchmark referenced for context on standalone function generation performance"
    457     },
    458     {
    459       "title": "ClassEval: A Manually-Crafted Benchmark for Evaluating LLMs on Class-Level Code Generation",
    460       "relevance": "Related benchmark for more complex code generation scenarios cited alongside CoderEval"
    461     },
    462     {
    463       "title": "Survey of Hallucination in Natural Language Generation",
    464       "relevance": "Foundation for the hallucination definition and categorization framework used in this paper"
    465     },
    466     {
    467       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    468       "relevance": "One of the six LLMs evaluated in the empirical study"
    469     }
    470   ],
    471   "engagement_factors": {
    472     "practical_relevance": {
    473       "score": 3,
    474       "justification": "Directly addresses a pain point for developers using LLMs for code generation, provides a taxonomy of failure modes and a usable RAG mitigation."
    475     },
    476     "surprise_contrarian": {
    477       "score": 1,
    478       "justification": "Confirms the widely-known problem that LLMs hallucinate; the taxonomy is novel but the finding that RAG helps modestly is unsurprising."
    479     },
    480     "fear_safety": {
    481       "score": 2,
    482       "justification": "Explicitly demonstrates that LLMs can generate code with security vulnerabilities (e.g., unsafe YAML loading) that pass static checks."
    483     },
    484     "drama_conflict": {
    485       "score": 1,
    486       "justification": "No controversy; incremental empirical work with straightforward findings."
    487     },
    488     "demo_ability": {
    489       "score": 2,
    490       "justification": "Public GitHub replication package exists; practitioners could run the RAG mitigation on their own repositories."
    491     },
    492     "brand_recognition": {
    493       "score": 1,
    494       "justification": "Sun Yat-sen University and NTU are respected but not top-tier AI lab brands; no industry affiliation."
    495     }
    496   },
    497   "hn_data": {
    498     "threads": [
    499       {
    500         "hn_id": "40963623",
    501         "title": "New large value estimates for Dirichlet polynomials",
    502         "points": 11,
    503         "comments": 1,
    504         "url": "https://news.ycombinator.com/item?id=40963623",
    505         "created_at": "2024-07-14T22:20:27Z"
    506       },
    507       {
    508         "hn_id": "40733098",
    509         "title": "Decentralized AI: Permissionless LLM Inference on POKT Network",
    510         "points": 3,
    511         "comments": 0,
    512         "url": "https://news.ycombinator.com/item?id=40733098",
    513         "created_at": "2024-06-19T22:30:41Z"
    514       },
    515       {
    516         "hn_id": "45441508",
    517         "title": "Melegros: Monolithic Elephant-Inspired Gripper with Optical Sensors",
    518         "points": 1,
    519         "comments": 0,
    520         "url": "https://news.ycombinator.com/item?id=45441508",
    521         "created_at": "2025-10-01T18:39:54Z"
    522       },
    523       {
    524         "hn_id": "39692734",
    525         "title": "Using Fiber Optic Bundles to Miniaturize Vision-Based Tactile Sensors",
    526         "points": 1,
    527         "comments": 0,
    528         "url": "https://news.ycombinator.com/item?id=39692734",
    529         "created_at": "2024-03-13T15:40:36Z"
    530       }
    531     ],
    532     "top_points": 11,
    533     "total_points": 16,
    534     "total_comments": 1
    535   }
    536 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs