scan-v5.json (26073B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "De-Hallucinator: Mitigating LLM Hallucinations in Code Generation Tasks via Iterative Grounding", 6 "authors": [ 7 "Aryaz Eghbali", 8 "Michael Pradel" 9 ], 10 "year": 2024, 11 "venue": "arXiv", 12 "arxiv_id": "2401.01701", 13 "doi": null 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "All quantitative claims in the abstract (23.3–50.6% edit distance improvement, 23.9–61.0% API recall improvement, 63.2% fixed hallucinations, 15.5% coverage gain) are directly supported by Tables 3 and 4 in the evaluation.", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper makes causal claims about De-Hallucinator improving code quality; a controlled comparison against baselines (initial prompt, RAG prompt) with Wilcoxon statistical tests and an ablation over hyperparameters k and n provides adequate support for these directional claims.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": true, 32 "justification": "The limitations section explicitly states 'our conclusions are valid only for these languages' (Python and JavaScript), and the abstract specifies evaluation across 'two code generation tasks, two programming languages, and five state-of-the-art LLMs.'", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper does not discuss whether improvement could stem from simply adding more tokens to the prompt rather than the specific iterative API-retrieval mechanism; the RAG vs. 
iterative comparison partially addresses this but the paper does not frame it as an alternative explanation.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "The paper clearly explains each metric—edit distance quantifies token edits a developer needs to make, exact API match measures the specific phenomenon of interest, and passing tests/coverage directly measure test generation quality—without overclaiming these as general productivity measures.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": true, 52 "justification": "Section 6 is dedicated to 'LIMITATIONS AND THREATS TO VALIDITY' and contains multiple distinct limitations beyond a single sentence.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": true, 58 "justification": "The section names specific threats: the assumption that APIs exist when needed (and a proposed workaround), evaluation restricted to Python and JavaScript, and potential non-representativeness of the project sample; the first two are concrete and specific.", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper explicitly states conclusions are valid only for Python and JavaScript, and notes the project sample may not represent all projects, providing meaningful scope boundaries.", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": false, 72 "justification": "No funding acknowledgment or grant information appears anywhere in the paper.", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Both authors clearly disclose their affiliation as Software Lab, University of Stuttgart, with institutional email addresses.", 79 "source": 
"haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": false, 83 "answer": false, 84 "justification": "No funding is disclosed, so funder independence cannot be assessed.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "There is no competing interests statement or declaration of financial interests in the paper.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "Hallucination is defined as inventing non-existent APIs; API reference is formally defined in Definition 3.1 with three subtypes (function, class, attribute); initial prompt, RAG prompt, and iterative prompt are each defined with examples.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper lists four explicit contributions at the end of the introduction: empirical motivation, a new technique, a novel algorithm, and empirical evidence of effectiveness across tasks and LLMs.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Section 7 provides a detailed related work discussion that distinguishes De-Hallucinator from CoCoMIC, RepoCoder, HyDE, TestPilot, and RAG, explaining specifically how each differs in mechanism and coupling to the underlying LLM.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 "code_released": { 119 "applies": true, 120 "answer": true, 121 "justification": "The data availability statement provides two public GitHub URLs (https://github.com/AryazE/dehallucinator and https://github.com/AryazE/testpilot) containing the implementation, datasets, and evaluation scripts.", 122 "source": "haiku" 123 }, 124 "data_released": { 125 "applies": true, 126 "answer": true, 127 "justification": "The 
code completion dataset is constructed from 11 public GitHub projects with specific commit hashes listed in Table 2; the test generation dataset reuses public JavaScript projects from TestPilot.", 128 "source": "haiku" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper mentions specific libraries (HuggingFace transformers, CodeQL, NLTK, scikit-learn Ball Tree) and hardware, but provides no requirements.txt, Dockerfile, or equivalent environment specification in the paper itself.", 134 "source": "haiku" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper points to GitHub repositories but provides no step-by-step instructions for reproducing the evaluation within the paper itself.", 140 "source": "haiku" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": true, 146 "answer": false, 147 "justification": "Tables 3 and 4 report only mean values without confidence intervals, standard deviations, or error bars for any of the main results.", 148 "source": "haiku" 149 }, 150 "significance_tests": { 151 "applies": true, 152 "answer": true, 153 "justification": "The paper uses the Wilcoxon signed-rank test with the Pratt method for all comparative claims; statistically significant results are bolded in Tables 3 and 4.", 154 "source": "haiku" 155 }, 156 "effect_sizes_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Relative percentage improvements are reported alongside absolute numbers throughout Tables 3 and 4, providing effect size context (e.g., 23.3–50.6% edit distance reduction).", 160 "source": "haiku" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "The choice of 50 tasks for the preliminary study, 10 completions per project (440 total), and 12 JavaScript projects is described procedurally but not justified through 
power analysis or comparable benchmarks.", 166 "source": "haiku" 167 }, 168 "variance_reported": { 169 "applies": true, 170 "answer": false, 171 "justification": "No variance, standard deviation, or confidence intervals are reported for any metric; all tables show only mean values.", 172 "source": "haiku" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": true, 179 "justification": "Two baselines are included: the conventional initial prompt (no retrieval) and RAG prompt (retrieval from initial prompt only), compared against the iterative De-Hallucinator approach.", 180 "source": "haiku" 181 }, 182 "baselines_contemporary": { 183 "applies": true, 184 "answer": true, 185 "justification": "All baseline LLMs (CodeGen, CodeGen 2.5, UniXCoder, StarCoder+, GPT-3.5-turbo-0125) and TestPilot as the test generation baseline are state-of-the-art as of the 2024 submission.", 186 "source": "haiku" 187 }, 188 "ablation_study": { 189 "applies": true, 190 "answer": true, 191 "justification": "RQ3 (Section 5.4) provides a systematic ablation over the number of iterations k ∈ {1,2,3} and the number of API references n ∈ {2,10,20,40} for code completion and n ∈ {3,5,10} for test generation.", 192 "source": "haiku" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": true, 197 "justification": "Three metrics are used for code completion (edit distance, normalized edit similarity, exact API match) and three for test generation (passing tests, coverage, fixed hallucinations).", 198 "source": "haiku" 199 }, 200 "human_evaluation": { 201 "applies": true, 202 "answer": true, 203 "justification": "The preliminary study uses two authors independently classifying 50 completion pairs with Cohen's kappa of 0.76; RQ2 includes manual inspection of 20 completion tasks per LLM to verify correct API augmentation.", 204 "source": "haiku" 205 }, 206 "held_out_test_set": { 207 "applies": true, 208 "answer": true, 209 "justification": 
"The evaluation uses a constructed dataset of completion tasks from 11 Python and 12 JavaScript projects; completions already predicted correctly by the baseline are excluded to form a meaningful test set.", 210 "source": "haiku" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": true, 215 "justification": "Results in Table 3 are broken down per LLM model (UniXCoder, CodeGen v1, CodeGen v2.5, StarCoder+), and Table 5 provides per-model breakdown of correct API augmentation rates.", 216 "source": "haiku" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Section 5.3 explicitly discusses failure cases: 'For cases where the approach fails to add the correct API reference into the prompt, the main reason is that the initial completion has low relevance w.r.t. the ground truth.'", 222 "source": "haiku" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": true, 227 "justification": "Table 4 shows RAG & iterative combined performs worse on coverage than iterative alone; Figure 7 shows n=40 API references hurts performance; these negative results are reported and discussed.", 228 "source": "haiku" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": true, 234 "answer": true, 235 "justification": "Exact HuggingFace model IDs are provided (Salesforce/codegen-2B-mono, Salesforce/codegen25-7b-mono, microsoft/unixcoder-base, bigcode/starcoderplus) and the OpenAI model version GPT-3.5-turbo-0125 is specified.", 236 "source": "haiku" 237 }, 238 "prompts_provided": { 239 "applies": true, 240 "answer": true, 241 "justification": "Figures 4, 5, and 6 show complete example prompts including the API Reference block format, and Section 3.4 describes the prompt construction with concrete examples.", 242 "source": "haiku" 243 }, 244 "hyperparameters_reported": { 245 "applies": true, 246 "answer": true, 247 "justification": "Key hyperparameters are 
reported: k=3 iterations, n=20 API references for code completion, n=3 for test generation, max new tokens=256, temperature=0.1, 4 completions per prompt.", 248 "source": "haiku" 249 }, 250 "scaffolding_described": { 251 "applies": true, 252 "answer": true, 253 "justification": "The three-stage pipeline (pre-analysis → retrieval → prompt construction → LLM query) is described in detail across Sections 3.2–3.5 with both general description and task-specific instantiation.", 254 "source": "haiku" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": true, 259 "justification": "Section 5.1.3 documents dataset construction: removing API usages, filtering functions >25 lines, removing API-related imports, and excluding already-correct completions are all described with rationale.", 260 "source": "haiku" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": true, 267 "justification": "Specific project commits are listed in Table 2 for all 23 projects, and implementation/datasets are released at the GitHub URLs, enabling independent data reconstruction.", 268 "source": "haiku" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "The collection process is documented: randomly sampling from a curated awesome-python list, sampling by application domain, selecting 5 functions per project, applying inclusion/exclusion criteria.", 274 "source": "haiku" 275 }, 276 "recruitment_methods_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants were recruited; the study uses public code repositories and internal author annotation for the preliminary study.", 280 "source": "haiku" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 284 "answer": true, 285 "justification": "The full pipeline from project selection → API extraction via CodeQL → embedding → Ball Tree indexing → retrieval → prompt construction → 
evaluation is documented across Sections 3–5.", 286 "source": "haiku" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No training data cutoffs are stated for any of the five evaluated models, despite evaluating on public GitHub code that may have been in training corpora.", 294 "source": "haiku" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": true, 298 "answer": false, 299 "justification": "The paper excludes already-correct completions 'to avoid any potential memorizations' but does not analyze or discuss training data overlap with the evaluated GitHub projects.", 300 "source": "haiku" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": true, 304 "answer": false, 305 "justification": "The evaluation projects are popular public GitHub repositories that were almost certainly available before the training cutoffs of the evaluated models; this is not discussed or analyzed.", 306 "source": "haiku" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": false, 312 "answer": false, 313 "justification": "No human subjects study; the manual annotation is performed by the authors.", 314 "source": "haiku" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants; IRB approval is not applicable.", 320 "source": "haiku" 321 }, 322 "demographics_reported": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "haiku" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "haiku" 333 }, 334 "randomization_described": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "haiku" 339 }, 340 "blinding_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human 
participants.", 344 "source": "haiku" 345 }, 346 "attrition_reported": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "haiku" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 "applies": true, 356 "answer": true, 357 "justification": "Section 5.5 reports retrieval latency (21–227ms for code completion, 0.1–17ms for test generation) and LLM query times (1.3s for GPT-3.5 to 66.7s for CodeGen v2.5 on local GPU).", 358 "source": "haiku" 359 }, 360 "compute_budget_stated": { 361 "applies": true, 362 "answer": true, 363 "justification": "Hardware is specified (two Nvidia T4 GPUs, single Tesla V100 with 32GB) and pre-analysis times are given per project (under 80 seconds for Python, 3.5s average for JavaScript).", 364 "source": "haiku" 365 } 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "API hallucinations affect 44% of all function-level code completion tasks and 59% of failed tasks in a preliminary study of 50 functions.", 372 "evidence": "Manual classification of 50 code completion tasks with inter-rater agreement Cohen's kappa=0.76; 22 of 37 failed completions had at least one missing/wrong API usage.", 373 "supported": "moderate" 374 }, 375 { 376 "claim": "De-Hallucinator improves edit distance by 23.3–50.6% over conventional prompts across four LLMs.", 377 "evidence": "Table 3 shows reductions from 50.6% (UniXCoder) to 23.3% (CodeGen v1) with Wilcoxon statistical significance; all improvements are statistically significant.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "De-Hallucinator improves exact API match recall by 23.9–61.0% across four LLMs for code completion.", 382 "evidence": "Table 3 reports exact API match counts for initial vs. 
iterative prompts, with statistically significant improvements for all models.", 383 "supported": "strong" 384 }, 385 { 386 "claim": "De-Hallucinator fixes 63.2% more hallucinated tests and increases statement coverage by 15.5% for test generation.", 387 "evidence": "Table 4 shows iterative prompts improve fixed hallucinations from 19.3 to 31.4 (63.2% relative) and coverage from 32.1 to 37.0 (15.5% relative) with Wilcoxon significance.", 388 "supported": "strong" 389 }, 390 { 391 "claim": "Iterative prompts (using model output for retrieval) outperform plain RAG prompts (using initial prompt for retrieval).", 392 "evidence": "Table 3 consistently shows iterative prompts achieve lower edit distance and higher API match than RAG prompts; Table 4 shows iterative alone achieves higher coverage than RAG & iterative combined.", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Even k=1 iteration provides clear improvements over the baseline, making the approach useful when LLM query cost is high.", 397 "evidence": "Figure 7 shows k=1 already provides substantial relative improvement in exact API match across all four models.", 398 "supported": "moderate" 399 } 400 ], 401 "methodology_tags": [ 402 "benchmark-eval", 403 "case-study" 404 ], 405 "key_findings": "De-Hallucinator addresses LLM hallucinations of project-specific APIs by iteratively augmenting prompts with retrieved API references, using the model's own (hallucinated) predictions to guide better retrieval. Evaluated on 5 LLMs across 11 Python and 12 JavaScript open-source projects, the approach consistently and significantly improves code completion (23.3–50.6% edit distance reduction, 23.9–61.0% API recall improvement) and test generation (63.2% more hallucinated tests fixed, 15.5% coverage increase). The iterative retrieval mechanism outperforms plain RAG, and even a single iteration provides meaningful gains. 
The approach requires no fine-tuning and treats the LLM as a black box.", 406 "red_flags": [ 407 { 408 "flag": "No variance reported", 409 "detail": "All main result tables (Tables 3 and 4) report only mean values with no standard deviation, confidence intervals, or error bars, making it impossible to assess result stability." 410 }, 411 { 412 "flag": "Contamination not addressed", 413 "detail": "All 11 Python and 12 JavaScript projects are popular public GitHub repositories almost certainly present in the training corpora of the evaluated models; the paper does not analyze or discuss train-test overlap beyond excluding already-correct completions." 414 }, 415 { 416 "flag": "Small preliminary study", 417 "detail": "The motivational claim that 44% of all completions (59% of failed ones) involve hallucinated APIs is based on manual inspection of only 50 functions from 10 projects by the authors themselves." 418 }, 419 { 420 "flag": "No confound isolation for 'more context'", 421 "detail": "The paper does not rule out that improvement comes simply from adding more tokens to the prompt rather than from the specific iterative API-retrieval mechanism; a control adding random API references would strengthen the causal claim." 422 } 423 ], 424 "cited_papers": [ 425 { 426 "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation", 427 "relevance": "TestPilot is the baseline system for the test generation task; De-Hallucinator extends TestPilot with iterative API grounding." 428 }, 429 { 430 "title": "Repository-Level Code Completion Through Iterative Retrieval and Generation (RepoCoder)", 431 "relevance": "Concurrent work on repository-level code completion using iterative retrieval; directly compared against, as the closest prior approach." 
432 }, 433 { 434 "title": "CoCoMIC: Code Completion By Jointly Modeling In-file and Cross-file Context", 435 "relevance": "Prior work addressing project-specific context for code completion via fine-tuning; contrasted with De-Hallucinator's black-box approach." 436 }, 437 { 438 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 439 "relevance": "Foundational RAG technique that De-Hallucinator extends with iterative retrieval using model predictions." 440 }, 441 { 442 "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)", 443 "relevance": "Key baseline work on evaluating code LLMs; establishes the benchmark context for code generation evaluation." 444 }, 445 { 446 "title": "Repository-Level Prompt Generation for Large Language Models of Code", 447 "relevance": "Prior work on repository-level prompt generation using a separate trained model; contrasted with De-Hallucinator's model-agnostic approach." 448 }, 449 { 450 "title": "StarCoder: may the source be with you!", 451 "relevance": "One of four LLMs evaluated in the code completion experiments." 452 }, 453 { 454 "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions", 455 "relevance": "Prior empirical study on LLM hallucinations in code generation; motivates the research problem." 456 } 457 ], 458 "engagement_factors": { 459 "practical_relevance": { 460 "score": 3, 461 "justification": "The technique works with off-the-shelf LLMs as black boxes, is deployable in IDEs, and addresses a concrete pain point that developers already report with AI coding assistants." 462 }, 463 "surprise_contrarian": { 464 "score": 1, 465 "justification": "Using the model's hallucinated output to retrieve better context is a clever observation, but iterative refinement and RAG are established concepts so the insight is incremental." 
466 }, 467 "fear_safety": { 468 "score": 0, 469 "justification": "No AI safety or risk concerns; the paper addresses a usability/accuracy problem in code generation tools." 470 }, 471 "drama_conflict": { 472 "score": 0, 473 "justification": "No controversy; the paper presents a technical improvement without challenging major assumptions or competing with prominent labs." 474 }, 475 "demo_ability": { 476 "score": 3, 477 "justification": "Code is publicly released on GitHub for both tasks, and the approach works with any off-the-shelf LLM making it immediately tryable." 478 }, 479 "brand_recognition": { 480 "score": 1, 481 "justification": "University of Stuttgart Software Lab is a respected academic group but not a widely-recognized brand in industry or general AI discourse." 482 } 483 }, 484 "hn_data": { 485 "threads": [ 486 { 487 "hn_id": "38939558", 488 "title": "Large Legal Fictions: Profiling Legal Hallucinations in Large Language Models", 489 "points": 2, 490 "comments": 0, 491 "url": "https://news.ycombinator.com/item?id=38939558", 492 "created_at": "2024-01-10T14:57:07Z" 493 } 494 ], 495 "top_points": 2, 496 "total_points": 2, 497 "total_comments": 0 498 } 499 }