scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28201B)
      1 {
      2   "paper": {
      3     "title": "Statically Contextualizing Large Language Models with Typed Holes",
      4     "authors": ["Andrew Blinn", "Xiang Li", "June Hyung Kim", "Cyrus Omar"],
      5     "year": 2024,
      6     "venue": "Proc. ACM Program. Lang. (OOPSLA)",
      7     "arxiv_id": "2409.00921",
      8     "doi": "10.1145/3689728"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "Section 8 states that an artifact is available on Zenodo (reference [11]) containing MVUBench, raw experiment data, testing harness, Hazel source, and StarCoder2 model. Hazel source is also on GitHub at https://github.com/hazelgrove/hazel/."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Section 8 states the Zenodo artifact contains 'the MVUBench program sketches and solutions, the raw data of our experiments.' The artifact is password-protected to prevent scraping."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No mention of requirements.txt, Dockerfile, or detailed environment specifications. The paper mentions StarCoder2-15B can run locally but does not provide environment setup details."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "While an artifact is released, the paper does not describe step-by-step reproduction instructions. The Zenodo artifact description and artifact review process may contain such instructions, but the paper itself does not."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Results are reported as averages across 20 trials (Figures 8, 16, 18, 19) but no confidence intervals or error bars are shown. Only point estimates are provided in the bar charts."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper makes comparative claims (e.g., types + headers outperform vector retrieval) based on comparing averages across 20 trials without any statistical significance tests."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports relative improvements with baseline context: 'headers have a large multiplicative effect, increasing test performance threefold' (Section 2.9), 'error rounds... increasing performance by a factor of 4 for types without headers, and a factor of 1.5 for both types and headers.' Ratio of with-vs-without headers is reported as 3 for Hazel and 1.5 for TypeScript."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper uses 20 trials per configuration but does not justify why 20 was chosen or discuss whether this is sufficient for statistical power."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Results are averaged over 20 trials but no standard deviations, interquartile ranges, or other spread measures are reported. The paper notes model non-determinism (reference [56]) but does not quantify variance."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Three baselines are included: no-context (Section 2.8.3), exhaustive retrieval (Section 2.8.4), and vector retrieval with confounds (Section 2.8.5)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Vector retrieval using OpenAI Ada embeddings represents a standard contemporary RAG approach. Exhaustive retrieval represents the upper bound. These are reasonable baselines for 2024."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Section 2.8.2 describes a systematic feature ablation with 8 configurations (2×2×2) varying type retrieval, header retrieval, and error correction rounds independently."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper reports percentage of tests passed (correctness), token usage (Figure 15), and time elapsed (Figure 14) as separate metrics."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation of the generated code is included. Evaluation is entirely automated via unit tests. Because the completions are intended for use in a developer-facing coding assistant, human evaluation of output quality would be relevant."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "MVUBench was constructed from scratch specifically to avoid data contamination (Section 1.1.2). The test suites (10-15 tests per application) are held out from the prompt context."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down per application (TO, BO, EM, PL, PA) in Figures 8, 16, 18, and 19, showing per-task performance rather than only averages."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Several failure cases are discussed with examples: Figure 7 (hallucinated types), Figure 9 (various failure modes), Figure 12 (confounding vector retrieval chunk), Figure 13 (exotic completions). StarCoder2 degradation with headers for BO and TO is analyzed in Section 2.10."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section 2.10 reports that headers degraded performance for BO and TO with StarCoder2. Error rounds alone without type context are shown to be ineffective. Vector retrieval's confounding issues are documented."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims that type definitions are 'particularly impactful' and that the techniques are validated in both Hazel and TypeScript — both supported by the ablation results in Figures 8, 16, 18, 19."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims like 'type definitions improve completion quality' are supported by the ablation study design (Section 2.8.2), which systematically manipulates individual features in a controlled manner."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The abstract claims these methods apply to languages generally ('language servers can implement to expose capabilities'). While the paper tests Hazel and TypeScript, MVUBench consists of only 5 small MVU applications — the generalization to arbitrary programming tasks, other languages, and larger codebases is not adequately bounded."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 4 (Threats to Validity) discusses several alternative explanations: headers' effectiveness depends on functions already being implemented, MVUBench is not representative of all coding tasks, the no-context baseline may be unrealistically weak, and the RAG baseline is simplistic."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper measures unit test pass rates and presents them as a measure of the functional correctness of code completions; it does not make broader claims about developer productivity. The claims match the granularity of the measurements."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "GPT-4-0613 is specified by exact checkpoint (Section 1.1.3, Section 2.9.1). StarCoder2-15B is specified by name and size. Ada embedding model is identified as text-embedding-ada-002 (Section 2.8.5)."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Section 2.4 provides substantial prompt detail including specific instructions (quoted), few-shot examples (quoted), and negative characterization rules (quoted). The static retrieval prompt construction is fully described in Sections 2.5-2.6 with concrete examples."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Temperature 0.6 is stated (Section 2.8.2), with justification ('selected based on trial experiments as a balance between noisy variance and producing a range of interestingly distinct completions'). Context window sizes (8k for GPT-4-0613, 16k for StarCoder2) are noted."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The full 'trialogue' architecture is described in Section 2.3 with detailed steps (system message, user message with static retrieval, model response, error correction loop). Figure 1 diagrams the conversational architecture. The error correction loop is capped at 2 iterations (Section 2.7)."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 2.8.1 describes how MVUBench was constructed. Section 2.8.5 describes how the vector retrieval database was created (150-character chunks, 6 entries retrieved, cosine similarity). The static retrieval process is documented in Sections 2.5-2.6."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 4 'Threats to Validity' provides a dedicated discussion of limitations spanning multiple paragraphs covering representativeness, baseline appropriateness, and RAG simplicity."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 4 identifies specific threats: header effectiveness depends on pre-existing implementations, MVUBench is MVU-specific, the no-context baseline is unrealistically weak, the RAG baseline uses simplistic chunking, and the combined codebase for vector retrieval may not represent real large-scale codebases."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 4 explicitly states: 'MVUBench is not (and is not meant to) be representative of all coding tasks' and notes the TypeScript port is 'a very close translation' raising applicability questions. The paper notes the context size delta is 'not reflective of real-world use cases' (Section 2.9)."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 8 states the Zenodo artifact contains 'the raw data of our experiments.' This allows independent verification of the reported results."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 2.8.1 describes MVUBench construction: 5 MVU applications written from scratch, each with 10-15 tests, designed to avoid data contamination. Section 2.8.2 describes the 320-trial experiment design."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. The benchmark is constructed by the authors and evaluated automatically."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The full pipeline is documented: program sketch → static retrieval from language server → prompt construction → LLM generation → error correction loop → test evaluation. Each step is described in Sections 2.3-2.7."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 9 states: 'This work was partially funded by the National Science Foundation (Award #2238744).'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All authors are from the University of Michigan and are the developers of Hazel. Their affiliation with the evaluated system is clear from the paper."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "NSF is an independent government funding agency with no financial stake in Hazel's success or in the experimental outcomes."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is included in the paper. The authors evaluate their own system (Hazel) but do not declare whether they hold any financial interests related to it."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "The paper does not state GPT-4-0613's training cutoff date. For StarCoder2, they mention The Stack v2 but do not state a specific cutoff. This is relevant because they claim MVUBench avoids contamination."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": true,
    233         "justification": "Section 1.1.2 extensively discusses data contamination risks with existing benchmarks and explains that MVUBench was 'conceptualized and implemented from scratch, without directly adapting any code from GitHub.' Section 9 acknowledges even their benchmark may have been exposed via Copilot-enabled IDEs."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "The paper addresses contamination by constructing MVUBench from scratch (Section 1.1.2), password-protecting the Zenodo artifact (Section 8), and planning to 'control the release of these benchmarks to limit the likelihood of future contamination' (Section 1.1.2)."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Figure 14 reports time elapsed per trial. Figure 15 reports total tokens used (both sent and received). Section 2.9.1 notes that token usage is 'precisely proportional to the total cost' and discusses practical latency concerns."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "The total API spend or compute budget for all 320+ trials is not stated. Token counts are shown per configuration but total expenditure across the full experiment is not reported."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "20 trials per configuration are run to account for 'model non-determinism' (Section 2.8.2) but no analysis of seed sensitivity or variance across seeds is reported. Only averages are shown."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Section 2.8.2 clearly states '20 completion trials per combination (to account for model non-determinism)' and that the experiment consists of 320 total trials."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Temperature 0.6 was 'selected based on trial experiments' (Section 2.8.2) but the number of configurations tried and selection method are not reported."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "The paper does not cherry-pick a best configuration — it reports all 8 ablation configurations plus 3 baselines, showing the full landscape. Temperature selection is briefly justified."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": false,
    313         "answer": false,
    314         "justification": "No statistical significance tests are performed, so multiple comparison correction is moot."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors evaluate their own system (Hazel Assistant with their language server) but do not acknowledge the bias of author-evaluation. They also constructed MVUBench themselves."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Figures 14 and 15 show time and token cost per configuration, allowing comparison of performance vs. compute. Section 2.9 notes the context size delta between static retrieval (890 chars avg) and exhaustive retrieval (1370 chars avg)."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Section 1.1.2 provides detailed discussion of why existing benchmarks (HumanEval, MBPP, RepoEval, etc.) are unsuitable and why MVU applications test the specific capability claimed (semantic contextualization). Section 4 notes MVUBench is not representative of all tasks."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "The paper evaluates its own integrated system (Hazel Assistant) — the scaffold IS the thing being tested, not a confound between model comparisons."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "MVUBench was constructed from scratch specifically to address this: 'conceptualizing and implementing these applications from scratch, without directly adapting any code from GitHub' (Section 1.1.2). The benchmark did not exist before the models' training."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup (e.g., providing type information) constitutes feature leakage relative to real-world usage scenarios where such information might not be as cleanly available."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The 5 MVU applications in MVUBench share structural patterns (all MVU architecture). No discussion of whether this structural similarity between benchmark items affects the evaluation."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "The primary leakage prevention method is constructing MVUBench from scratch and password-protecting the artifact (Section 8). Section 9 acknowledges even this may not be fully clean due to Copilot-enabled IDEs during development."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Type definitions are particularly impactful for LLM code completion contextualization.",
    363       "evidence": "Ablation study (Section 2.9, Figure 8): adding type definitions dramatically improves test pass rates, from near zero to substantial levels, across all 5 MVU applications for both GPT-4 and StarCoder2.",
    364       "supported": "strong"
    365     },
    366     {
    367       "claim": "Function headers have a large multiplicative effect when combined with type definitions, increasing test performance threefold in Hazel.",
    368       "evidence": "Section 2.9, Figure 8: types+headers outperforms types-only by approximately 3x for Hazel GPT-4. This ratio is 1.5 for TypeScript (Section 3.2).",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "Error correction rounds are ineffective alone but multiply effectiveness when combined with static retrieval.",
    373       "evidence": "Section 2.9: error rounds alone are ineffective but multiply performance by 4x (types only) and 1.5x (types+headers) in Hazel. Less impactful in TypeScript (1.2x ratio, Section 3.2).",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Static retrieval outperforms vector retrieval for semantic contextualization.",
    378       "evidence": "Figures 8, 18: types+headers outperform vector retrieval baseline. However, the paper acknowledges a confounding chunk (Figure 12) disproportionately affected vector retrieval results (Section 2.9).",
    379       "supported": "weak"
    380     },
    381     {
    382       "claim": "These techniques transfer from low-resource (Hazel) to high-resource (TypeScript) languages.",
    383       "evidence": "Section 3, Figures 18-19: TypeScript results show similar trends though with smaller margins, as the model is already more capable in TypeScript.",
    384       "supported": "moderate"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval"],
    388   "key_findings": "Static type-based contextualization significantly improves LLM code completion quality on high-context tasks. Type definitions are the most critical context to provide, with function headers and error correction loops providing additional multiplicative benefits. These results hold for both a large closed model (GPT-4) and a smaller open model (StarCoder2-15B), and transfer from a low-resource language (Hazel) to TypeScript. The paper proposes ChatLSP as a standard interface for language servers to expose this kind of static context to AI assistants.",
    389   "red_flags": [
    390     {
    391       "flag": "Authors evaluate their own system",
    392       "detail": "All four authors are Hazel developers at the University of Michigan. They designed MVUBench, the Hazel Language Server, the Hazel Assistant, and conducted all evaluations. No independent evaluation or third-party replication is included."
    393     },
    394     {
    395       "flag": "Tiny benchmark scale",
    396       "detail": "MVUBench consists of only 5 small MVU applications with 10-15 tests each. The generalization to real-world codebases of hundreds of thousands of lines, non-MVU tasks, and other programming paradigms is unclear."
    397     },
    398     {
    399       "flag": "No statistical significance testing",
    400       "detail": "Despite running 20 trials per configuration, no confidence intervals, error bars, or significance tests are reported. Claims about relative performance are based on comparing averages without quantifying uncertainty."
    401     },
    402     {
    403       "flag": "Simplistic RAG baseline may be unfairly weak",
    404       "detail": "The vector retrieval baseline uses naive 150-character uniform chunking. The authors acknowledge more sophisticated chunking strategies exist but chose the simplest approach. Section 4 concedes this is arguable."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Evaluating Large Language Models Trained on Code",
    410       "authors": ["Mark Chen", "Jerry Tworek"],
    411       "year": 2021,
    412       "arxiv_id": "2107.03374",
    413       "relevance": "Introduces HumanEval benchmark for LLM code generation evaluation."
    414     },
    415     {
    416       "title": "Program Synthesis with Large Language Models",
    417       "authors": ["Jacob Austin", "Augustus Odena"],
    418       "year": 2021,
    419       "arxiv_id": "2108.07732",
    420       "relevance": "Introduces MBPP benchmark for LLM program synthesis."
    421     },
    422     {
    423       "title": "RepoCoder: Repository-Level Code Completion Through Iterative Retrieval and Generation",
    424       "authors": ["Fengji Zhang", "Bei Chen"],
    425       "year": 2023,
    426       "doi": "10.18653/v1/2023.emnlp-main.151",
    427       "relevance": "Vector retrieval approach for repository-level code completion — direct comparison point."
    428     },
    429     {
    430       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    431       "authors": ["Naman Jain", "King Han"],
    432       "year": 2024,
    433       "arxiv_id": "2403.07974",
    434       "relevance": "Contamination-free code benchmark addressing data leakage concerns."
    435     },
    436     {
    437       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    438       "authors": ["Anton Lozhkov", "Raymond Li"],
    439       "year": 2024,
    440       "arxiv_id": "2402.19173",
    441       "relevance": "Open-source code LLM used as one of the two evaluation models."
    442     },
    443     {
    444       "title": "CoCoMIC: Code Completion by Jointly Modeling In-file and Cross-file Context",
    445       "authors": ["Yangruibo Ding", "Zijian Wang"],
    446       "year": 2024,
    447       "relevance": "Framework for joint in-file and cross-file context for code completion."
    448     },
    449     {
    450       "title": "Repair is nearly generation: multilingual program repair with LLMs",
    451       "authors": ["Harshit Joshi", "José Cambronero Sanchez"],
    452       "year": 2023,
    453       "doi": "10.1609/aaai.v37i4.25642",
    454       "relevance": "LLM-based program repair using iterative error correction loops."
    455     },
    456     {
    457       "title": "Enhancing LLM-Based Coding Tools through Native Integration of IDE-Derived Static Context",
    458       "authors": ["Yichen Li", "Yun Peng"],
    459       "year": 2024,
    460       "arxiv_id": "2402.03630",
    461       "relevance": "Closely related work using IDE/language server static context for LLM code completion."
    462     },
    463     {
    464       "title": "STALL+: Boosting LLM-based Repository-level Code Completion with Static Analysis",
    465       "authors": ["Junwei Liu", "Yixuan Chen"],
    466       "year": 2024,
    467       "arxiv_id": "2406.10018",
    468       "relevance": "General framework for applying static analysis to repository-level code completion."
    469     },
    470     {
    471       "title": "De-Hallucinator: Iterative Grounding for LLM-Based Code Completion",
    472       "authors": ["Aryaz Eghbali", "Michael Pradel"],
    473       "year": 2024,
    474       "arxiv_id": "2401.01701",
    475       "relevance": "Post-generation semantic lookup to fix hallucinated code definitions."
    476     },
    477     {
    478       "title": "Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair",
    479       "authors": ["Yuxiang Wei", "Chunqiu Steven Xia"],
    480       "year": 2023,
    481       "doi": "10.1145/3611643.3616271",
    482       "relevance": "Combines LLMs with semantic code completion for program repair."
    483     },
    484     {
    485       "title": "RLCoder: Reinforcement Learning for Repository-Level Code Completion",
    486       "authors": ["Yanlin Wang", "Yanli Wang"],
    487       "year": 2024,
    488       "arxiv_id": "2407.19487",
    489       "relevance": "RL-based ranking of retrieved code snippets for repository-level completion."
    490     },
    491     {
    492       "title": "The Impact of AI on Developer Productivity: Evidence from GitHub Copilot",
    493       "authors": ["Sida Peng", "Eirini Kalliamvakou"],
    494       "year": 2023,
    495       "arxiv_id": "2302.06590",
    496       "relevance": "Key study on AI coding assistant productivity impact."
    497     }
    498   ]
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs