scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32158B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "De-Hallucinator: Mitigating LLM Hallucinations in Code Generation Tasks via Iterative Grounding",
      6     "authors": [
      7       "Aryaz Eghbali",
      8       "Michael Pradel"
      9     ],
     10     "year": 2024,
     11     "venue": "arXiv",
     12     "arxiv_id": "2401.01701",
     13     "doi": null
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "Abstract claims of 23.3–50.6% edit distance improvement, 23.9–61.0% recall improvement, 63.2% fixed hallucinations, and 15.5% coverage increase are all directly supported by Tables 3 and 4 in the results.",
     21         "source": "opus"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper claims De-Hallucinator 'improves' generated code. The experimental design is a controlled comparison: same models, same datasets, with and without De-Hallucinator's augmented prompts. This controlled single-variable manipulation adequately supports the causal claims.",
     27         "source": "opus"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Section 6 explicitly states: 'We implement and evaluate De-Hallucinator for Python and JavaScript, and although our general approach could be applied to any language, our conclusions are valid only for these languages.' The title says 'Code Generation Tasks' broadly, but limitations bound the claims.",
     33         "source": "opus"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Section 6 discusses limitations (API availability assumption, language generalization, project representativeness) but does not discuss alternative explanations for the observed improvements, such as whether the gains come from additional context length rather than API-specific retrieval, or whether retrieval noise could explain variable results.",
     39         "source": "opus"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper measures edit distance, exact API match, and test pass rates, and frames claims at the same granularity ('more accurate predictions', 'mitigated hallucinations'). No broader framing beyond what was measured (e.g., no productivity claims).",
     45         "source": "opus"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 6 is titled 'Limitations and Threats to Validity' and provides substantive discussion of multiple limitations.",
     53         "source": "opus"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 6 discusses specific threats: API availability assumption (developer may write usage before implementing API), language-specific conclusions (Python and JavaScript only), and project representativeness with specific mitigation (diverse selection of popular projects).",
     59         "source": "opus"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Section 6 explicitly states: 'our conclusions are valid only for these languages' (Python and JavaScript). The paper identifies the specific scenario where the approach fails (APIs not yet implemented).",
     65         "source": "opus"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding source is mentioned anywhere in the paper. No acknowledgments section listing grants or sponsors.",
     73         "source": "opus"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Both authors are listed as affiliated with Software Lab, University of Stuttgart. They are not evaluating a commercial product they created.",
     79         "source": "opus"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "Funding is not disclosed, so independence of funders cannot be assessed.",
     85         "source": "opus"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests or financial disclosure statement is present in the paper.",
     91         "source": "opus"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Definition 3.1 formally defines 'API reference' with three subtypes; hallucination and iterative prompting are explained through a concrete running example.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The introduction lists four explicit bullet-point contributions: empirical motivation study, the technique itself, the novel algorithm, and empirical evidence across five LLMs.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 7 explicitly contrasts De-Hallucinator with CoCoMIC, RepoCoder, ReACC, HyDE, and TestPilot, explaining mechanistic differences rather than simply listing papers.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Data-Availability Statement provides two GitHub repositories: https://github.com/AryazE/dehallucinator and https://github.com/AryazE/testpilot, described as containing 'implementation, datasets, and evaluation scripts.'",
    122           "source": "opus"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "The Data-Availability Statement states 'Our implementation, datasets, and evaluation scripts are publicly available' at the provided GitHub URLs.",
    128           "source": "opus"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "The paper specifies hardware (Nvidia T4, Tesla V100) and mentions HuggingFace transformers and specific model IDs, but does not provide a requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment.",
    134           "source": "opus"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "The paper describes the experimental setup in detail (Section 5.1) but does not include step-by-step reproduction instructions or a 'Reproducing Results' section. The code release may contain a README, but the paper itself lacks explicit reproduction steps.",
    140           "source": "opus"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "Tables 3 and 4 report only point estimates (e.g., '50.6%' improvement). No confidence intervals, error bars, or ± notation appear anywhere in the results.",
    148           "source": "opus"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": true,
    153           "justification": "Section 5.2 states 'The approach shows statistically significant (using the Wilcoxon test and Pratt method) improvements over the baseline consistently for all metrics and all models.' Same tests used for test generation results (Table 4).",
    154           "source": "opus"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Tables 3 and 4 report both absolute values and relative improvements with baseline context (e.g., edit distance 47.2→30.1, '36.3% improvement'), providing sufficient information to assess effect magnitude.",
    160           "source": "opus"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "The evaluation uses 440 code completion tasks (11 projects × 10 tasks × 4 models) and 12 JavaScript projects for test generation. No justification is given for why these sample sizes are sufficient, and no power analysis is discussed.",
    166           "source": "opus"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "Results in Tables 3 and 4 are reported as single aggregate values without standard deviation, interquartile range, or any spread measure across projects or tasks.",
    172           "source": "opus"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "The paper compares against an initial prompt baseline (no retrieval) and a RAG prompt baseline for code completion. For test generation, the baseline is the original TestPilot implementation (Section 5.1.2).",
    180           "source": "opus"
    181         },
    182         "baselines_contemporary": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The baseline models (CodeGen 2022, CodeGen 2.5 2023, UniXCoder 2022, StarCoder+ 2023, GPT-3.5-turbo-0125) were contemporary at the time of writing. TestPilot is a state-of-the-art test generation tool.",
    186           "source": "opus"
    187         },
    188         "ablation_study": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Tables 3 and 4 separately report results for RAG prompts and iterative prompts, showing the contribution of each component. Figures 7 and 8 ablate over hyperparameters k and n, and Table 4 compares 'RAG & iterative' vs 'Iterative' alone.",
    192           "source": "opus"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "Code completion uses three metrics: edit distance, normalized edit similarity, and exact API match (Section 5.1.4). Test generation uses passing tests, coverage, and fixed hallucinations.",
    198           "source": "opus"
    199         },
    200         "human_evaluation": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Section 2.1 describes manual classification of 50 completion tasks by two authors with inter-rater agreement (Cohen's kappa 0.76). Section 5.3 manually inspects 20 completion tasks per LLM to assess retrieval quality.",
    204           "source": "opus"
    205         },
    206         "held_out_test_set": {
    207           "applies": true,
    208           "answer": false,
    209           "justification": "No explicit dev/test split is described. Hyperparameters k and n are tuned using the same data that reports final results (Figures 7 and 8 show tuning experiments, and the selected defaults are used for Tables 3 and 4).",
    210           "source": "opus"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Results are broken down by model (Table 3), by prompt type (RAG vs iterative), and by hyperparameter settings (Figures 7, 8). Table 5 breaks down API retrieval success per model. Table 2 lists per-project details.",
    216           "source": "opus"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Section 5.3 discusses failure cases: 'For cases where the approach fails to add the correct API reference into the prompt, the main reason is that the initial completion has low relevance w.r.t. the ground truth.' Section 6 also discusses when the approach cannot help.",
    222           "source": "opus"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "Table 4 shows that combining RAG & iterative prompts yields lower coverage (33.7%) than iterative alone (37.0%), demonstrating that adding RAG can hurt. RAG prompts alone show minimal improvement for UniXCoder's exact API match (0.0% improvement in Table 3).",
    228           "source": "opus"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Exact model identifiers are provided: the HuggingFace IDs 'Salesforce/codegen-2B-mono', 'Salesforce/codegen25-7b-mono', 'microsoft/unixcoder-base', and 'bigcode/starcoderplus', plus the OpenAI API model 'GPT-3.5-turbo-0125' with its snapshot date.",
    236           "source": "opus"
    237         },
    238         "prompts_provided": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "Figures 4, 5, and 6 show concrete examples of actual prompts including API reference sections. Section 3.4 describes the prompt construction format. The code is also publicly released containing the implementation.",
    242           "source": "opus"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": true,
    246           "answer": true,
    247           "justification": "Key hyperparameters are reported: max tokens = 256 (Section 5.1.2), temperature = 0.1, 4 completions per prompt (test generation), k = 3 iterations, n = 20 API references (code completion), n = 3 (test generation), prompt size limit of 2,048 tokens.",
    248           "source": "opus"
    249         },
    250         "scaffolding_described": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "The iterative retrieval-augmented pipeline is described in full detail across Sections 3.1–3.5, including the pre-analysis index, embedding-based retrieval, Ball Tree search, prompt construction, and iterative loop. Figure 3 provides an architectural overview.",
    254           "source": "opus"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Section 5.1.3 documents dataset construction: random project selection from curated lists, API usage removal for ground truth, multi-line call handling, import removal to prevent data leakage, and filtering of already-correct predictions (with exact counts: 18, 51, 76, and 31 filtered per model).",
    260           "source": "opus"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "The Data-Availability Statement provides links to datasets and evaluation scripts at two GitHub repositories, enabling independent verification of the results.",
    268           "source": "opus"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Section 5.1.3 describes the collection procedure in detail: random selection of 10 Python projects from awesome-python (by domain sampling), random selection of 5 functions per project with a 25-line filter (preliminary study), and API-usage-removal-based benchmark construction for the main evaluation.",
    274           "source": "opus"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants. Data comes from public open-source GitHub projects selected from a curated list.",
    280           "source": "opus"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "The pipeline is documented: select projects from curated list → extract API usages → remove API usage lines and related imports → query LLMs → filter already-correct predictions (with counts per model: 18, 51, 76, 31) → evaluate with metrics. For test generation, TestPilot's pipeline is reused with token budget matching.",
    286           "source": "opus"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "The paper does not state training data cutoff dates for any of the five LLMs used (CodeGen, CodeGen 2.5, UniXCoder, StarCoder+, GPT-3.5-turbo), despite evaluating them on code from public GitHub repositories that may have been in their training data.",
    294           "source": "opus"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The paper filters out exact-match completions 'to avoid any potential memorizations' (Section 5.1.3), but does not analyze whether the evaluation projects appeared in model training data. Exact-match filtering only catches the most obvious contamination.",
    300           "source": "opus"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "The evaluation projects are all popular open-source GitHub repositories (e.g., scikit-learn with 58.5k stars, black with 37.6k stars) that almost certainly appeared in training data. The paper does not discuss this contamination risk beyond filtering exact-match predictions.",
    306           "source": "opus"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants in the study. The evaluation is entirely automated with supplementary manual code inspection.",
    314           "source": "opus"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants. The study uses public open-source code and automated evaluation.",
    320           "source": "opus"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants in the study.",
    326           "source": "opus"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants in the study.",
    332           "source": "opus"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants in the study.",
    338           "source": "opus"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants in the study.",
    344           "source": "opus"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants in the study.",
    350           "source": "opus"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": true,
    357           "justification": "Section 5.5 reports detailed timing: pre-analysis takes under 1 second per 1,000 LoC, retrieval takes 21–227ms per iteration, LLM query time ranges 1.3–66.7 seconds per query. Token limits per package are stated for test generation (130k).",
    358           "source": "opus"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": true,
    363           "justification": "Section 5.1.5 specifies hardware (two Nvidia T4 GPUs with 16GB each, single Nvidia Tesla V100 with 32GB, 48-core Intel Xeon at 2.20GHz). Section 5.5 provides per-operation timing breakdowns, and test generation uses a 130k token limit per package.",
    364           "source": "opus"
    365         }
    366       },
    367       "experimental_rigor": {
    368         "seed_sensitivity_reported": {
    369           "applies": true,
    370           "answer": false,
    371           "justification": "The paper does not report results across multiple random seeds. For test generation, temperature is 0.1 with 4 completions, but no seed variation analysis is performed.",
    372           "source": "opus"
    373         },
    374         "number_of_runs_stated": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "The paper does not state how many times the full experiment was run. It generates k=3 iterations and 4 completions per prompt (test generation) but does not state whether the entire evaluation was repeated across multiple runs.",
    378           "source": "opus"
    379         },
    380         "hyperparameter_search_budget": {
    381           "applies": true,
    382           "answer": false,
    383           "justification": "RQ3 studies k∈{1,2,3} and n∈{2,10,20,40} for code completion, n∈{3,5,10} for test generation, but the total search budget (compute spent on hyperparameter exploration) is not reported.",
    384           "source": "opus"
    385         },
    386         "best_config_selection_justified": {
    387           "applies": true,
    388           "answer": false,
    389           "justification": "Hyperparameters k=3, n=20 (code completion) and n=3 (test generation) are selected based on RQ3 experiments (Figures 7, 8) using the same data that reports final results. No separate validation set is used for selection.",
    390           "source": "opus"
    391         },
    392         "multiple_comparison_correction": {
    393           "applies": true,
    394           "answer": false,
    395           "justification": "Wilcoxon tests are performed across 4 models × 3 metrics for code completion and 3 metrics for test generation, but no correction for multiple comparisons (Bonferroni, Holm, etc.) is mentioned.",
    396           "source": "opus"
    397         },
    398         "self_comparison_bias_addressed": {
    399           "applies": true,
    400           "answer": false,
    401           "justification": "The authors evaluate their own De-Hallucinator system against baselines without acknowledging author-evaluation bias. While baselines are off-the-shelf models (reducing this concern), the benchmark construction and metric selection were done by the same authors.",
    402           "source": "opus"
    403         },
    404         "compute_budget_vs_performance": {
    405           "applies": true,
    406           "answer": true,
    407           "justification": "For test generation, Section 5.1.2 states 'we set the token limit of De-Hallucinator to the amount of tokens used by the baseline,' explicitly matching compute budgets. Figures 7 and 8 show performance across different k values (number of queries), implicitly showing the performance vs. compute tradeoff.",
    408           "source": "opus"
    409         },
    410         "benchmark_construct_validity": {
    411           "applies": true,
    412           "answer": false,
    413           "justification": "The benchmark is constructed by removing API usages from existing code and treating the original as ground truth. The paper does not discuss whether this constructed benchmark validly measures real-world code completion needs or whether the removal-based approach introduces artifacts.",
    414           "source": "opus"
    415         },
    416         "scaffold_confound_addressed": {
    417           "applies": true,
    418           "answer": true,
    419           "justification": "The same underlying LLMs are used with and without De-Hallucinator, isolating the scaffolding effect. The comparison is controlled: same model, same data, only the prompting strategy changes. Results are shown across four different models.",
    420           "source": "opus"
    421         }
    422       },
    423       "data_leakage": {
    424         "temporal_leakage_addressed": {
    425           "applies": true,
    426           "answer": false,
    427           "justification": "The evaluation projects (e.g., scikit-learn, black, seaborn) are popular open-source projects whose code likely existed before the models' training cutoffs (which the paper does not state). The paper does not discuss whether models may have seen the test code during training.",
    428           "source": "opus"
    429         },
    430         "feature_leakage_addressed": {
    431           "applies": true,
    432           "answer": false,
    433           "justification": "No discussion of whether the evaluation setup leaks information not available in real IDE usage scenarios. The API reference augmentation itself is the technique being tested, but no analysis of potential feature leakage in the benchmark construction is provided.",
    434           "source": "opus"
    435         },
    436         "non_independence_addressed": {
    437           "applies": true,
    438           "answer": false,
    439           "justification": "The 10 completion tasks per project are drawn from the same codebase and may share structural similarities (imports, coding style). No discussion of potential non-independence between evaluation examples.",
    440           "source": "opus"
    441         },
    442         "leakage_detection_method": {
    443           "applies": true,
    444           "answer": false,
    445           "justification": "The paper filters out exact-match predictions to avoid memorization effects, but does not apply a concrete leakage detection method such as canary strings, membership inference tests, n-gram overlap analysis, or temporal splits.",
    446           "source": "opus"
    447         }
    448       }
    449     }
    450   },
    451   "claims": [
    452     {
    453       "claim": "API hallucinations affect 44% of all studied function-level code completion tasks and 59% of tasks where the baseline LLM fails.",
    454       "evidence": "Preliminary study on 50 manually classified completions using CodeGen 2.5 with inter-rater Cohen's kappa = 0.76 (Section 2.1).",
    455       "supported": "moderate"
    456     },
    457     {
    458       "claim": "De-Hallucinator improves edit distance by 23.3–50.6% relative to conventional prompting across four LLMs.",
    459       "evidence": "Table 3 shows statistically significant improvements (Wilcoxon/Pratt) for all four models on all metrics.",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "De-Hallucinator improves exact API match recall by 23.9–61.0% relative to baseline.",
    464       "evidence": "Table 3 exact API match row; e.g., CodeGen 2.5 improves from 8.3 to 13.4 (61.0% relative gain), all statistically significant.",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "De-Hallucinator reduces hallucinated tests by 63.2% and increases statement coverage by 15.5% in JavaScript test generation.",
    469       "evidence": "Table 4 comparing iterative De-Hallucinator vs. TestPilot baseline; coverage and fixed-hallucination improvements are statistically significant.",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "Most improvement is achieved in the first iteration; additional iterations yield diminishing returns.",
    474       "evidence": "Figure 7 shows k=1 already captures most of the exact API match improvement across all four LLMs; k=2 and k=3 add incrementally.",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "RAG alone (non-iterative) is less effective than iterative retrieval, especially when the initial prompt lacks relevant code.",
    479       "evidence": "Table 4 shows RAG & iterative combined achieves lower coverage (33.7%) than iterative alone (37.0%) for test generation; Section 5.2 explains the mechanism.",
    480       "supported": "strong"
    481     }
    482   ],
    483   "methodology_tags": [
    484     "benchmark-eval",
    485     "case-study"
    486   ],
    487   "key_findings": "De-Hallucinator mitigates API hallucinations in LLM-based code generation by iteratively augmenting prompts with project-specific API references derived from the model's own (hallucinated) predictions rather than from the original prompt. Evaluated on 440 code completion tasks across four LLMs and 12 JavaScript test generation projects, the approach consistently and significantly outperforms baselines: 23.3–50.6% edit distance improvement, 23.9–61.0% exact API match improvement, 63.2% fewer hallucinated tests, and 15.5% higher statement coverage. The technique requires no model fine-tuning, works as a black-box wrapper, and most of its benefit is realized in the first iteration, making it practical for production deployment.",
    488   "red_flags": [
    489     {
    490       "flag": "No variance or confidence intervals",
    491       "detail": "Tables 3 and 4 report mean improvements with no standard deviation, standard error, or confidence intervals; spread across 11 or 12 projects is unknown."
    492     },
    493     {
    494       "flag": "Small preliminary study",
    495       "detail": "The core motivation claim (44% of completions involve API hallucinations) is based on only 50 manually examined tasks from 10 projects using a single model."
    496     },
    497     {
    498       "flag": "No contamination analysis",
    499       "detail": "Test projects are public GitHub repos that likely predate training cutoffs; filtering memorized completions is a partial mitigation but no formal contamination analysis is conducted for any of the five models."
    500     },
    501     {
    502       "flag": "No formal sample size justification",
    503       "detail": "Ten completions per project is chosen for convenience; no power analysis supports that this sample size is adequate for the comparative claims made."
    504     }
    505   ],
    506   "cited_papers": [
    507     {
    508       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation (TestPilot)",
    509       "relevance": "Direct baseline system extended for the JavaScript test generation experiments"
    510     },
    511     {
    512       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    513       "relevance": "One of the primary LLMs evaluated in code completion experiments"
    514     },
    515     {
    516       "title": "StarCoder: may the source be with you!",
    517       "relevance": "Largest LLM baseline (15.5B parameters) evaluated in code completion"
    518     },
    519     {
    520       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    521       "relevance": "Foundational RAG technique that De-Hallucinator extends with iterative prompting"
    522     },
    523     {
    524       "title": "CoCoMIC: Code Completion By Jointly Modeling In-file and Cross-file Context",
    525       "relevance": "Key prior work on cross-file context; De-Hallucinator explicitly contrasts its black-box approach vs. CoCoMIC's fine-tuning requirement"
    526     },
    527     {
    528       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    529       "relevance": "Concurrent work on iterative retrieval for code completion; distinguished by retrieving code fragments vs. De-Hallucinator's API signatures"
    530     },
    531     {
    532       "title": "Repository-level prompt generation for large language models of code",
    533       "relevance": "Related work on RAG-based context selection for code; contrasted with De-Hallucinator's model-agnostic approach"
    534     },
    535     {
    536       "title": "Measuring GitHub Copilot's Impact on Productivity",
    537       "relevance": "Cited to establish practical relevance and adoption of AI code completion tools"
    538     }
    539   ],
    540   "engagement_factors": {
    541     "practical_relevance": {
    542       "score": 2,
    543       "justification": "Presents a usable technique for reducing API hallucinations in code completion with open-source code available, though it requires integration work."
    544     },
    545     "surprise_contrarian": {
    546       "score": 1,
    547       "justification": "The iterative grounding idea is clever, but the finding that LLMs hallucinate project-specific APIs is well known and not surprising."
    548     },
    549     "fear_safety": {
    550       "score": 0,
    551       "justification": "No safety, security, or risk angle — purely about improving code generation accuracy."
    552     },
    553     "drama_conflict": {
    554       "score": 0,
    555       "justification": "No controversy or conflict; straightforwardly improves on baselines without challenging any company or popular belief."
    556     },
    557     "demo_ability": {
    558       "score": 1,
    559       "justification": "Code is on GitHub but requires setting up CodeQL, embedding models, and specific LLMs — significant setup effort."
    560     },
    561     "brand_recognition": {
    562       "score": 0,
    563       "justification": "University of Stuttgart authors, not a well-known AI lab; models used (CodeGen, UniXcoder) are not household names."
    564     }
    565   },
    566   "hn_data": {
    567     "threads": [
    568       {
    569         "hn_id": "38939558",
    570         "title": "Large Legal Fictions: Profiling Legal Hallucinations in Large Language Models",
    571         "points": 2,
    572         "comments": 0,
    573         "url": "https://news.ycombinator.com/item?id=38939558",
    574         "created_at": "2024-01-10T14:57:07Z"
    575       }
    576     ],
    577     "top_points": 2,
    578     "total_points": 2,
    579     "total_comments": 0
    580   }
    581 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs