scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30941B)
      1 {
      2   "paper": {
      3     "title": "De-Hallucinator: Mitigating LLM Hallucinations in Code Generation Tasks via Iterative Grounding",
      4     "authors": [
      5       "Aryaz Eghbali",
      6       "Michael Pradel"
      7     ],
      8     "year": 2024,
      9     "venue": "arXiv",
     10     "arxiv_id": "2401.01701"
     11   },
     12   "scan_version": 3,
     13   "active_modules": [
     14     "experimental_rigor",
     15     "data_leakage"
     16   ],
     17   "methodology_tags": [
     18     "benchmark-eval"
     19   ],
     20   "key_findings": "De-Hallucinator iteratively augments LLM prompts with project-specific API references retrieved based on the model's own predictions, improving code completion edit distance by 23.3–50.6% and exact API match by 23.9–61.0% across four LLMs. For test generation with GPT-3.5-turbo, it fixes 63.2% of hallucination-induced test failures and increases statement coverage by 15.5%. A preliminary study shows API hallucinations affect 44% of function-level code completion tasks. The first iteration provides the largest gains, with diminishing returns from additional iterations.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Data-Availability Statement provides two GitHub repositories: https://github.com/AryazE/dehallucinator and https://github.com/AryazE/testpilot, described as containing 'implementation, datasets, and evaluation scripts.'"
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The Data-Availability Statement states 'Our implementation, datasets, and evaluation scripts are publicly available' at the provided GitHub URLs."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper specifies hardware (Nvidia T4, Tesla V100) and mentions HuggingFace transformers and specific model IDs, but does not provide a requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper describes the experimental setup in detail (Section 5.1) but does not include step-by-step reproduction instructions or a 'Reproducing Results' section. The code release may contain a README, but the paper itself lacks explicit reproduction steps."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables 3 and 4 report only point estimates (e.g., '50.6%' improvement). No confidence intervals, error bars, or ± notation appear anywhere in the results."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 5.2 states 'The approach shows statistically significant (using the Wilcoxon test and Pratt method) improvements over the baseline consistently for all metrics and all models.' The same tests are used for the test generation results (Table 4)."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Tables 3 and 4 report both absolute values and relative improvements with baseline context (e.g., edit distance 47.2→30.1, '36.3% improvement'), providing sufficient information to assess effect magnitude."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The evaluation uses 440 code completion tasks (11 projects × 10 tasks × 4 models) and 12 JavaScript projects for test generation. No justification is given for why these sample sizes are sufficient, and no power analysis is discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Results in Tables 3 and 4 are reported as single aggregate values without standard deviation, interquartile range, or any spread measure across projects or tasks."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper compares against an initial prompt baseline (no retrieval) and a RAG prompt baseline for code completion. For test generation, the baseline is the original TestPilot implementation (Section 5.1.2)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The baseline models (CodeGen 2022, CodeGen 2.5 2023, UniXCoder 2022, StarCoder+ 2023, GPT-3.5-turbo-0125) were contemporary at the time of writing. TestPilot is a state-of-the-art test generation tool."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Tables 3 and 4 separately report results for RAG prompts and iterative prompts, showing the contribution of each component. Figures 7 and 8 ablate over hyperparameters k and n, and Table 4 compares 'RAG & iterative' vs 'Iterative' alone."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Code completion uses three metrics: edit distance, normalized edit similarity, and exact API match (Section 5.1.4). Test generation uses passing tests, coverage, and fixed hallucinations."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section 2.1 describes manual classification of 50 completion tasks by two authors with inter-rater agreement (Cohen's kappa 0.76). Section 5.3 manually inspects 20 completion tasks per LLM to assess retrieval quality."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No explicit dev/test split is described. Hyperparameters k and n are tuned using the same data that reports final results (Figures 7 and 8 show tuning experiments, and the selected defaults are used for Tables 3 and 4)."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by model (Table 3), by prompt type (RAG vs iterative), and by hyperparameter settings (Figures 7, 8). Table 5 breaks down API retrieval success per model. Table 2 lists per-project details."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 5.3 discusses failure cases: 'For cases where the approach fails to add the correct API reference into the prompt, the main reason is that the initial completion has low relevance w.r.t. the ground truth.' Section 6 also discusses when the approach cannot help."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Table 4 shows that combining RAG & iterative prompts yields lower coverage (33.7%) than iterative alone (37.0%), demonstrating that adding RAG can hurt. RAG prompts alone yield no improvement in UniXCoder's exact API match (0.0% in Table 3)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims of 23.3–50.6% edit distance improvement, 23.9–61.0% recall improvement, 63.2% fixed hallucinations, and 15.5% coverage increase are all directly supported by Tables 3 and 4 in the results."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper claims De-Hallucinator 'improves' generated code. The experimental design is a controlled comparison: same models, same datasets, with and without De-Hallucinator's augmented prompts. This controlled single-variable manipulation adequately supports the causal claims."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 6 explicitly states: 'We implement and evaluate De-Hallucinator for Python and JavaScript, and although our general approach could be applied to any language, our conclusions are valid only for these languages.' The title says 'Code Generation Tasks' broadly, but limitations bound the claims."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Section 6 discusses limitations (API availability assumption, language generalization, project representativeness) but does not discuss alternative explanations for the observed improvements, such as whether the gains come from additional context length rather than API-specific retrieval, or whether retrieval noise could explain variable results."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures edit distance, exact API match, and test pass rates, and frames claims at the same granularity ('more accurate predictions', 'mitigated hallucinations'). No broader framing beyond what was measured (e.g., no productivity claims)."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Exact model identifiers are provided: the HuggingFace IDs 'Salesforce/codegen-2B-mono', 'Salesforce/codegen25-7b-mono', 'microsoft/unixcoder-base', and 'bigcode/starcoderplus', plus the API model 'GPT-3.5-turbo-0125' with its snapshot date."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Figures 4, 5, and 6 show concrete examples of actual prompts including API reference sections. Section 3.4 describes the prompt construction format. The code is also publicly released containing the implementation."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Key hyperparameters are reported: max tokens = 256 (Section 5.1.2), temperature = 0.1, 4 completions per prompt (test generation), k = 3 iterations, n = 20 API references (code completion), n = 3 (test generation), prompt size limit of 2,048 tokens."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The iterative retrieval-augmented pipeline is described in full detail across Sections 3.1–3.5, including the pre-analysis index, embedding-based retrieval, Ball Tree search, prompt construction, and iterative loop. Figure 3 provides an architectural overview."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.1.3 documents dataset construction: random project selection from curated lists, API usage removal for ground truth, multi-line call handling, import removal to prevent data leakage, and filtering of already-correct predictions (with exact counts: 18, 51, 76, and 31 filtered per model)."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 6 is titled 'Limitations and Threats to Validity' and provides substantive discussion of multiple limitations."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6 discusses specific threats: API availability assumption (developer may write usage before implementing API), language-specific conclusions (Python and JavaScript only), and project representativeness with specific mitigation (diverse selection of popular projects)."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 6 explicitly states: 'our conclusions are valid only for these languages' (Python and JavaScript). The paper identifies the specific scenario where the approach fails (APIs not yet implemented)."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "The Data-Availability Statement provides links to datasets and evaluation scripts at two GitHub repositories, enabling independent verification of the results."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 5.1.3 describes the collection procedure in detail: random selection of 10 Python projects from awesome-python (by domain sampling), random selection of 5 functions per project with a 25-line filter (preliminary study), and API-usage-removal-based benchmark construction for the main evaluation."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data comes from public open-source GitHub projects selected from a curated list."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline is documented: select projects from curated list → extract API usages → remove API usage lines and related imports → query LLMs → filter already-correct predictions (with counts per model: 18, 51, 76, 31) → evaluate with metrics. For test generation, TestPilot's pipeline is reused with token budget matching."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding source is mentioned anywhere in the paper. No acknowledgments section listing grants or sponsors."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Both authors are listed as affiliated with Software Lab, University of Stuttgart. They are not evaluating a commercial product they created."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "Funding is not disclosed, so independence of funders cannot be assessed."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial disclosure statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper does not state training data cutoff dates for any of the five LLMs used (CodeGen, CodeGen 2.5, UniXCoder, StarCoder+, GPT-3.5-turbo), despite evaluating them on code from public GitHub repositories that may have been in their training data."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "The paper filters out exact-match completions 'to avoid any potential memorizations' (Section 5.1.3), but does not analyze whether the evaluation projects appeared in model training data. Exact-match filtering only catches the most obvious contamination."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "The evaluation projects are all popular open-source GitHub repositories (e.g., scikit-learn with 58.5k stars, black with 37.6k stars) that almost certainly appeared in training data. The paper does not discuss this contamination risk beyond filtering exact-match predictions."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in the study. The evaluation is automated; the only manual step is the authors' own inspection of completion tasks."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants. The study uses public open-source code and automated evaluation."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in the study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Section 5.5 reports detailed timing: pre-analysis takes under 1 second per 1,000 LoC, retrieval takes 21–227ms per iteration, LLM query time ranges 1.3–66.7 seconds per query. Token limits per package are stated for test generation (130k)."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Section 5.1.5 specifies hardware (two Nvidia T4 GPUs with 16GB each, single Nvidia Tesla V100 with 32GB, 48-core Intel Xeon at 2.20GHz). Section 5.5 provides per-operation timing breakdowns, and test generation uses a 130k token limit per package."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "The paper does not report results across multiple random seeds. For test generation, temperature is 0.1 with 4 completions, but no seed variation analysis is performed."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The paper does not state how many times the full experiment was run. It generates k=3 iterations and 4 completions per prompt (test generation) but does not state whether the entire evaluation was repeated across multiple runs."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "RQ3 studies k∈{1,2,3} and n∈{2,10,20,40} for code completion, n∈{3,5,10} for test generation, but the total search budget (compute spent on hyperparameter exploration) is not reported."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Hyperparameters k=3, n=20 (code completion) and n=3 (test generation) are selected based on RQ3 experiments (Figures 7, 8) using the same data that reports final results. No separate validation set is used for selection."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Wilcoxon tests are performed across 4 models × 3 metrics for code completion and 3 metrics for test generation, but no correction for multiple comparisons (Bonferroni, Holm, etc.) is mentioned."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own De-Hallucinator system against baselines without acknowledging author-evaluation bias. While baselines are off-the-shelf models (reducing this concern), the benchmark construction and metric selection were done by the same authors."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "For test generation, Section 5.1.2 states 'we set the token limit of De-Hallucinator to the amount of tokens used by the baseline,' explicitly matching compute budgets. Figures 7-8 show performance across different k values (number of queries), implicitly showing performance vs. compute tradeoff."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The benchmark is constructed by removing API usages from existing code and treating the original as ground truth. The paper does not discuss whether this constructed benchmark validly measures real-world code completion needs or whether the removal-based approach introduces artifacts."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "The same underlying LLMs are used with and without De-Hallucinator, isolating the scaffolding effect. The comparison is controlled: same model, same data, only the prompting strategy changes. Results are shown across four different models."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "The evaluation projects (e.g., scikit-learn, black, seaborn) are popular open-source projects whose code most likely predates the models' training cutoffs, which the paper does not state. The paper does not discuss whether models may have seen the test code during training."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup leaks information not available in real IDE usage scenarios. The API reference augmentation itself is the technique being tested, but no analysis of potential feature leakage in the benchmark construction is provided."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "The 10 completion tasks per project are drawn from the same codebase and may share structural similarities (imports, coding style). No discussion of potential non-independence between evaluation examples."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "The paper filters out exact-match predictions to avoid memorization effects, but does not apply a concrete leakage detection method such as canary strings, membership inference tests, n-gram overlap analysis, or temporal splits."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "De-Hallucinator improves edit distance by 23.3–50.6% compared to conventional prompts across four LLMs for code completion.",
    372       "evidence": "Table 3 shows statistically significant improvements (Wilcoxon test, Pratt method) for all four models: UniXCoder 50.6%, CodeGen v1 23.3%, CodeGen v2.5 36.3%, StarCoder+ 24.9%.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "De-Hallucinator improves exact API match recall by 23.9–61.0% for code completion.",
    377       "evidence": "Table 3 shows improvements across models: UniXCoder 23.9%, CodeGen v1 55.3%, CodeGen v2.5 61.0%, StarCoder+ 32.0%.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "De-Hallucinator fixes 63.2% of tests that initially failed due to hallucinated APIs in test generation.",
    382       "evidence": "Table 4 reports fixed hallucinations increasing from 19.3 (TestPilot baseline) to 31.4 (iterative), a 63.2% relative improvement with GPT-3.5-turbo.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "API hallucinations affect 44% of function-level code completion tasks.",
    387       "evidence": "Preliminary study (Section 2.1) with 50 function-level tasks: 13 correct, 22 of 37 incorrect had missing API usages. Two-author classification with Cohen's kappa = 0.76.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "De-Hallucinator achieves 15.5% improvement in statement coverage for test generation.",
    392       "evidence": "Table 4 shows coverage increasing from 32.1% (TestPilot) to 37.0% (iterative), statistically significant by Wilcoxon test.",
    393       "supported": "moderate"
    394     }
    395   ],
    396   "red_flags": [
    397     {
    398       "flag": "No confidence intervals or variance measures",
    399       "detail": "All results in Tables 3 and 4 are point estimates without confidence intervals, error bars, or standard deviations, despite results aggregating across diverse projects and tasks."
    400     },
    401     {
    402       "flag": "Hyperparameter tuning on evaluation data",
    403       "detail": "Hyperparameters k and n are selected based on experiments (RQ3, Figures 7-8) using the same data that reports final results. No separate validation set is used, risking overfitting to the evaluation dataset."
    404     },
    405     {
    406       "flag": "Training-data contamination risk from popular evaluation projects",
    407       "detail": "Evaluation uses highly popular GitHub projects (scikit-learn 58.5k stars, black 37.6k stars) that almost certainly appeared in model training data. Only exact-match filtering is applied, which may not catch near-memorization."
    408     },
    409     {
    410       "flag": "Small preliminary study sample",
    411       "detail": "The claim that API hallucinations affect 44% of tasks is based on only 50 function-level completions from a single model (CodeGen 2.5 7B). This small sample is presented as motivation for the entire approach."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Evaluating Large Language Models Trained on Code",
    417       "authors": [
    418         "Mark Chen",
    419         "Jerry Tworek",
    420         "Heewoo Jun"
    421       ],
    422       "year": 2021,
    423       "arxiv_id": "2107.03374",
    424       "relevance": "Codex evaluation establishing benchmarks for LLM code generation, foundational to the code generation evaluation methodology."
    425     },
    426     {
    427       "title": "An Empirical Evaluation of Using Large Language Models for Automated Unit Test Generation",
    428       "authors": [
    429         "Max Schäfer",
    430         "Sarah Nadi",
    431         "Aryaz Eghbali",
    432         "Frank Tip"
    433       ],
    434       "year": 2024,
    435       "doi": "10.1109/TSE.2023.3334955",
    436       "relevance": "TestPilot test generation system used as the baseline for the test generation evaluation in this paper."
    437     },
    438     {
    439       "title": "StarCoder: may the source be with you!",
    440       "authors": [
    441         "Raymond Li",
    442         "Loubna Ben Allal"
    443       ],
    444       "year": 2023,
    445       "arxiv_id": "2305.06161",
    446       "relevance": "Open-source code LLM used as one of four evaluated models for code completion."
    447     },
    448     {
    449       "title": "CodeGen: An Open Large Language Model for Code with Multi-Turn Program Synthesis",
    450       "authors": [
    451         "Erik Nijkamp",
    452         "Bo Pang",
    453         "Hiroaki Hayashi"
    454       ],
    455       "year": 2022,
    456       "relevance": "Code generation LLM family used as two of the four evaluated models (CodeGen 2B, CodeGen 2.5 7B)."
    457     },
    458     {
    459       "title": "Retrieval-augmented generation for knowledge-intensive nlp tasks",
    460       "authors": [
    461         "Patrick Lewis",
    462         "Ethan Perez",
    463         "Aleksandra Piktus"
    464       ],
    465       "year": 2020,
    466       "relevance": "Foundational RAG technique that De-Hallucinator extends with iterative retrieval based on model predictions."
    467     },
    468     {
    469       "title": "A Large-Scale Survey on the Usability of AI Programming Assistants: Successes and Challenges",
    470       "authors": [
    471         "Jenny T. Liang",
    472         "Chenyang Yang",
    473         "Brad A. Myers"
    474       ],
    475       "year": 2024,
    476       "doi": "10.1145/3597503.3608128",
    477       "relevance": "Survey documenting developer perceptions of AI programming assistant limitations including project-specific API issues."
    478     },
    479     {
    480       "title": "An Empirical Evaluation of GitHub Copilot's Code Suggestions",
    481       "authors": [
    482         "Nhan Nguyen",
    483         "Sarah Nadi"
    484       ],
    485       "year": 2022,
    486       "doi": "10.1145/3524842.3528470",
    487       "relevance": "Empirical study of Copilot code suggestion quality, documenting hallucination of non-existing APIs."
    488     },
    489     {
    490       "title": "Code Generation Tools (Almost) for Free? A Study of Few-Shot, Pre-Trained Language Models on Code",
    491       "authors": [
    492         "Patrick Bareiß",
    493         "Beatriz Souza",
    494         "Marcelo d'Amorim",
    495         "Michael Pradel"
    496       ],
    497       "year": 2022,
    498       "arxiv_id": "2206.01335",
    499       "relevance": "Study of few-shot LLM code generation capabilities relevant to understanding LLM code generation limitations."
    500     },
    501     {
    502       "title": "Repository-level prompt generation for large language models of code",
    503       "authors": [
    504         "Disha Shrivastava",
    505         "Hugo Larochelle",
    506         "Daniel Tarlow"
    507       ],
    508       "year": 2023,
    509       "relevance": "Closely related work on repository-level context selection for code completion, training a separate model for context ranking."
    510     },
    511     {
    512       "title": "CODAMOSA: Escaping Coverage Plateaus in Test Generation with Pre-trained Large Language Models",
    513       "authors": [
    514         "Caroline Lemieux",
    515         "Jeevana Priya Inala",
    516         "Shuvendu K Lahiri",
    517         "Siddhartha Sen"
    518       ],
    519       "year": 2023,
    520       "relevance": "Uses LLMs to augment automated test generation when stuck, relevant to LLM-assisted test generation approaches."
    521     },
    522     {
    523       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    524       "authors": [
    525         "Fengji Zhang",
    526         "Bei Chen",
    527         "Yue Zhang"
    528       ],
    529       "year": 2023,
    530       "arxiv_id": "2303.12570",
    531       "relevance": "Concurrent work on iterative retrieval for repository-level code completion, retrieving code fragments rather than API signatures."
    532     },
    533     {
    534       "title": "ReACC: A Retrieval-Augmented Code Completion Framework",
    535       "authors": [
    536         "Shuai Lu",
    537         "Nan Duan",
    538         "Hojae Han"
    539       ],
    540       "year": 2022,
    541       "arxiv_id": "2203.07722",
    542       "relevance": "Retrieval-augmented code completion using similar code pieces as dead code, precursor to retrieval-based approaches."
    543     }
    544   ],
    545   "engagement_factors": {
    546     "practical_relevance": {
    547       "score": 2,
    548       "justification": "Presents a usable technique for reducing API hallucinations in code completion with open-source code available, though requires integration work."
    549     },
    550     "surprise_contrarian": {
    551       "score": 1,
    552       "justification": "The iterative grounding idea is clever but the finding that LLMs hallucinate project-specific APIs is well-known, not surprising."
    553     },
    554     "fear_safety": {
    555       "score": 0,
    556       "justification": "No safety, security, or risk angle — purely about improving code generation accuracy."
    557     },
    558     "drama_conflict": {
    559       "score": 0,
    560       "justification": "No controversy or conflict; straightforwardly improves on baselines without challenging any company or popular belief."
    561     },
    562     "demo_ability": {
    563       "score": 1,
    564       "justification": "Code is on GitHub but requires setting up CodeQL, embedding models, and specific LLMs — significant setup effort."
    565     },
    566     "brand_recognition": {
    567       "score": 0,
    568       "justification": "University of Stuttgart authors, not a well-known AI lab; models used (CodeGen, UniXcoder) are not household names."
    569     }
    570   }
    571 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs