scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26152B)
      1 {
      2   "paper": {
      3     "title": "Library Hallucinations in LLMs: Risk Analysis Grounded in Developer Queries",
      4     "authors": ["Lukas Twist", "Jie M. Zhang", "Mark Harman", "Helen Yannakoudakis"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2509.22202"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "LLMs are highly susceptible to library hallucinations triggered by time-related prompts (up to 84% of tasks for 'from 2025') and user typos (one-character misspellings cause hallucinations in up to 26% of tasks, fake library names accepted in up to 99%). Prompt engineering mitigation is inconsistent and LLM-dependent: self-analysis and explicit-check prompts help in some cases, while chain-of-thought and step-back prompts often worsen hallucinations. The study reveals a novel connection between LLM sycophancy and typosquatting/slopsquatting supply chain risks.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "GitHub repository provided: https://github.com/itsluketwist/realistic-library-hallucinations (Section 1, Section 7, Reproducibility Statement)."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper states the full dataset, prompts, labels, and outputs are released in the GitHub repository. LibraryHalluBench benchmark with 4,628 prompts is also released (Section 7)."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Reproducibility Statement: 'The repository specifies exact dependency versions, enabling a consistent reproduction of our experiments.'"
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Reproducibility Statement: 'a detailed README.md file describing its contents, structure, installation steps, and usage procedure.' LibraryHalluBench also includes 'full usage instructions and an evaluation script' (Section 7)."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Tables report only point estimates (RHR, THR percentages) with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes many comparative claims (e.g., 'GPT-5-mini had a 32% increase from 2024 to 2025') but no statistical significance tests are reported."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Percentage differences with baselines are consistently reported (e.g., '34% more responses when the year changed from 2023 to 2024', Table 3 shows ↑/↓ changes). Baseline context is always provided."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The dataset of 356 tasks (from BigCodeBench filtering) is described but not justified via power analysis or sample size rationale. The 10% preliminary/90% main split is mentioned, but no justification of its adequacy is given."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Three responses are generated per task 'to mitigate sampling variability' (Section 3.2), but no variance, standard deviation, or spread measures across these responses are reported. Only aggregated RHR and THR are shown."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Each experiment includes a 'no description' or 'valid library' baseline condition (Tables 1 and 2), against which prompt variations are compared."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Seven contemporary LLMs are tested including GPT-5-mini (Aug 2025), DeepSeek-V3.1 (Aug 2025), and Claude-4.5-Haiku (Oct 2025). All are recent production models."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The three experiments systematically vary prompt components (adjective descriptions, year-based descriptions, error types, mitigation strategies), effectively ablating the impact of each variation."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Two metrics are used throughout: RHR (Response Hallucination Rate) and THR (Task Hallucination Rate), reported for both library name and library member hallucinations."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Manual validation of the hallucination detection pipeline on 100 random samples (Section 6). Manual review of unmatched libraries for genuine hallucinations (Section 3.6). Single author manually evaluated 200 SRSE questions for filter validation (Appendix B.2.1)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "10% of tasks reserved for preliminary experiments, remaining 90% used for main analysis (Section 3.2). Fine-tuning experiment uses equal train/test split (Appendix E.1)."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Appendix D provides detailed domain-level breakdown across seven BigCodeBench domains (computation, visualisation, general, system, time, network, cryptography) in Table 6."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Appendix C provides detailed case analyses of hallucination failures including year-based hallucinations (C.1), one-character misspellings (C.2), and ineffective prompt strategies (C.3) with specific examples per LLM."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Chain-of-thought and step-back prompting often worsened hallucination rates (Table 3, Experiment 3 summary). Fine-tuning had 'minimal impact' in most configurations (Appendix E.1). These negative results are prominently discussed."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims (26% for one-char misspellings, 99% for fake names, 84% for time prompts, inconsistent prompt engineering) are all directly supported by Tables 1-3."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The study uses controlled manipulation of prompt variables while holding other factors constant (same template, same tasks, same models). Each experiment varies a single dimension (description type, error degree, mitigation strategy), which is adequate for causal claims about prompt effects on hallucination rates."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title says 'LLMs' broadly but the study is restricted to Python libraries and 7 specific models. While Appendix E.2 tests generalizability on CodeInsight with 4 models, the paper's claims extend to 'LLM code generation' generally without sufficient bounding. The abstract and conclusions do not qualify findings to Python."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 4.1 discusses knowledge cutoffs as an alternative explanation for year-based hallucinations. Section 4.2 discusses sycophancy as a mechanism. Section 6 (Threats to Validity) addresses data leakage, construct validity, and LLM variability as alternative factors."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper clearly defines what it measures: library name hallucinations (imports of non-existent packages verified against PyPI) and library member hallucinations (invalid calls verified against documentation). Claims match measurement granularity — no broader framing beyond what was measured."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Table 4 (Appendix A) lists exact model versions: gpt-4o-mini-2024-07-18, gpt-5-mini-2025-08-07, ministral-2410, qwen2.5-coder-32b-instruct, llama-3.3-70b-instruct-turbo, deepseek-chat-v3.1, claude-haiku-4-5-20251001."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompt template given in Appendix B.1. All library directives listed verbatim in Appendix B.2.2 and B.3.2. Mitigation strategy texts given in Section 3.5. Mistake generation prompts in Appendix B.3.1."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Table 4 reports temperature and top_p for all seven models. Section 3.1 explains default parameters are used but manually configured for reproducibility."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The study uses direct API calls to LLMs with single-turn prompts."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.2 describes filtering out tasks where descriptions mention ground-truth libraries, restricting to external libraries with accessible documentation, yielding 356 tasks from 1140. Appendix B.2.1 describes SRSE filtering and clustering. Section 3.6 describes AST-based extraction."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 6 'Threats to Validity' provides dedicated discussion of internal and external validity threats."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 6 discusses specific threats: automatic data extraction validated on 100 samples, prompt realism grounded in SRSE analysis, dataset bias from excluding ground-truth-revealing tasks, LLM nondeterminism mitigated by multiple runs and fixed versions."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 3.2 explicitly states 'Our scope is restricted to Python.' Section 6 acknowledges dataset and LLM selection limitations. However, the title and abstract do not reflect these boundaries clearly."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper releases 'full outputs' in the GitHub repository (Section 1), meaning raw LLM responses are available for verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3.2 describes BigCodeBench as the seed dataset, the filtering criteria (356 tasks, 30 libraries), and the 10/90 split. Section 3.6 describes the hallucination detection pipeline in detail."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data source is the BigCodeBench benchmark (standard public dataset)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: BigCodeBench → filtering (Section 3.2) → prompt generation with variations (Appendices B.2-B.3) → LLM response collection → AST extraction → PyPI/documentation verification (Section 3.6). Each stage is described with criteria."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding acknowledgment or statement found in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations clearly listed: King's College London and University College London. None of the authors are affiliated with the LLM providers being evaluated."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement found in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Table 4 (Appendix A) lists knowledge cutoff dates for all models where available (e.g., GPT-4o-mini: Oct '23, GPT-5-mini: May '24)."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Section 3.2: BigCodeBench tasks 'rephrased with the NL-Augmenter framework, reducing the risk of data leakage and ensuring realistic yet unseen problem statements.' Section 6: 'any residual data leakage is likely to only cause fewer hallucinations, making our findings conservative.'"
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 3.2 explains BigCodeBench uses NL-Augmenter rephrasing to reduce leakage. Section 6 notes residual leakage would make findings conservative. Tasks further filtered to remove those mentioning ground-truth libraries."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No API costs, token counts, or per-example costs reported despite calling 7 LLM APIs across thousands of prompts with 3 responses each."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total computational budget, API spend, or hardware information reported."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "Three responses generated per task but no seed sensitivity analysis. The Reproducibility Statement mentions 'fixed seeds when sampling datasets' but not for LLM generation, and no cross-seed variance is reported."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Section 3.2: 'For each prompt, we generate three responses to mitigate sampling variability in LLM outputs.'"
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": false,
    304         "answer": false,
    305         "justification": "Default API parameters are used; no hyperparameter search was conducted."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": false,
    309         "answer": false,
    310         "justification": "No configuration selection; default parameters used throughout."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "Many comparisons are made across 7 models, 13+ prompt variations, and 2 hallucination types, but no statistical tests are performed at all, let alone corrections for multiple comparisons."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "The study evaluates existing LLMs rather than proposing a new system. No self-comparison bias applies."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "The study does not compare systems at different compute budgets; it evaluates hallucination rates across models."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Section 3.6 discusses construct validity of hallucination detection: PyPI verification for library names, documentation scraping for members, version-aware conservative counting, and manual validation on 100 samples."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding used. Direct API calls with single-turn prompts."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "Table 4 lists knowledge cutoffs. Section 3.2 uses NL-Augmenter rephrased tasks. Section 4.1 discusses how year-based hallucinations partly relate to knowledge cutoffs but notes the behavior is 'problematic' even for years within cutoffs."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "Section 3.2 filters out tasks where the description directly mentions the ground-truth library, preventing answer leakage through prompts."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether BigCodeBench tasks are independent of each other (they derive from ODEX, potentially sharing structural patterns). Tasks from the same domain may share similar solution patterns."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "NL-Augmenter rephrasing (Section 3.2) serves as a concrete leakage prevention method, generating unseen problem statements to reduce memorization risk."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Time-related prompts produce hallucinated libraries in up to 84% of tasks (GPT-4o-mini, 'from 2025').",
    364       "evidence": "Table 1: GPT-4o-mini THR = 84.74% for 'from 2025' description. Section 4.1.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "One-character misspellings in library names trigger hallucinations in up to 26% of tasks.",
    369       "evidence": "Table 2: GPT-5-mini TUR = 25.86% for one-character misspellings. Section 4.2.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Fake library names are used in up to 99% of tasks.",
    374       "evidence": "Table 2: GPT-5-mini TUR = 99.22% for fake library names. Section 4.2.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Prompt engineering shows inconsistent mitigation: self-analysis and explicit-check reduce hallucinations in 29/36 and 30/36 instances respectively, while chain-of-thought and step-back often increase them.",
    379       "evidence": "Table 3 and Section 4.3. Average reductions for GPT-5-mini (-5.53%) and DeepSeek-V3.1 (-5.35%) but increase for Qwen-2.5-Coder (+5.08%).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Adjective-based descriptions rarely trigger library name hallucinations (≈0%).",
    384       "evidence": "Table 1: All non-year adjective descriptions show RHR < 1% across all models. Section 4.1.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Rarity-seeking language increases hallucinations, with 'hidden gem' prompts producing hallucinations in 5.35% of tasks on average.",
    389       "evidence": "Table 5 (Appendix B.4) and Section 5.2. Progressive increase: lesser known (1.92%) → not widely used (3.01%) → hidden gem (5.35%).",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Fine-tuning shows limited promise for hallucination mitigation, with only one configuration (e=10, η=1e-4) showing meaningful improvement (16% reduction for fake names).",
    394       "evidence": "Table 7 (Appendix E.1). Most configurations within ≈1% of base model.",
    395       "supported": "weak"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No statistical tests for comparative claims",
    401       "detail": "The paper makes extensive comparative claims across models and conditions but reports no significance tests, confidence intervals, or error bars. With only 3 responses per task, sampling variability could explain some observed differences, particularly smaller ones."
    402     },
    403     {
    404       "flag": "No cost reporting",
    405       "detail": "The study calls 7 LLM APIs across thousands of prompts (356 tasks × 13+ conditions × 3 responses × 7 models) but reports no API costs or total compute budget."
    406     },
    407     {
    408       "flag": "Single-author manual validation",
    409       "detail": "Manual review of unmatched libraries (Section 3.6) and SRSE filter validation (Appendix B.2.1) were performed by a single author, with no inter-rater reliability assessment."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "We Have a Package for You! A Comprehensive Analysis of Package Hallucinations by Code Generating LLMs",
    415       "authors": ["Joseph Spracklen", "Raveen Wijewickrama", "A. H. M. Nazmus Sakib", "Anindya Maiti", "Bimal Viswanath", "Murtuza Jadliwala"],
    416       "year": 2024,
    417       "arxiv_id": "2406.10279",
    418       "relevance": "Prior work on package hallucinations in LLM code generation, establishing the slopsquatting threat model this paper extends."
    419     },
    420     {
    421       "title": "Importing Phantoms: Measuring LLM Package Hallucination Vulnerabilities",
    422       "authors": ["Arjun Krishna", "Erick Galinkin", "Leon Derczynski", "Jeffrey Martin"],
    423       "year": 2025,
    424       "arxiv_id": "2501.19012",
    425       "relevance": "Measures LLM package hallucination vulnerabilities at aggregate level; this paper extends with prompt-variation analysis."
    426     },
    427     {
    428       "title": "CodeHalu: Investigating Code Hallucinations in LLMs via Execution-based Verification",
    429       "authors": ["Yuchen Tian", "Weixiang Yan", "Qian Yang"],
    430       "year": 2024,
    431       "arxiv_id": "2405.00253",
    432       "relevance": "Taxonomy and benchmark for code hallucinations in LLMs."
    433     },
    434     {
    435       "title": "LLM Hallucinations in Practical Code Generation: Phenomena, Mechanism, and Mitigation",
    436       "authors": ["Ziyao Zhang", "Yanlin Wang", "Chong Wang"],
    437       "year": 2024,
    438       "arxiv_id": "2409.20550",
    439       "relevance": "Examines phenomena and mitigation of code hallucinations in practical LLM usage."
    440     },
    441     {
    442       "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions",
    443       "authors": ["Terry Yue Zhuo"],
    444       "year": 2024,
    445       "arxiv_id": "2406.15877",
    446       "relevance": "Benchmark dataset used as the seed for this study's experiments."
    447     },
    448     {
    449       "title": "Hallucination by Code Generation LLMs: Taxonomy, Benchmarks, Mitigation, and Challenges",
    450       "authors": ["Yunseo Lee", "John Youngeun Song"],
    451       "year": 2025,
    452       "arxiv_id": "2504.20799",
    453       "relevance": "Taxonomy of code generation hallucinations providing framework for categorizing library hallucinations."
    454     },
    455     {
    456       "title": "De-Hallucinator: Mitigating LLM Hallucinations in Code Generation Tasks via Iterative Grounding",
    457       "authors": ["Aryaz Eghbali", "Michael Pradel"],
    458       "year": 2024,
    459       "arxiv_id": "2401.01701",
    460       "relevance": "RAG-based mitigation approach for code generation hallucinations."
    461     },
    462     {
    463       "title": "Discovering Language Model Behaviors with Model-Written Evaluations",
    464       "authors": ["Ethan Perez"],
    465       "year": 2023,
    466       "relevance": "Foundational work on sycophancy in LLMs, which this paper connects to library hallucination compliance behavior."
    467     },
    468     {
    469       "title": "Breaking the Silence: The Threats of Using LLMs in Software Engineering",
    470       "authors": ["June Sallou", "Thomas Durieux", "Annibale Panichella"],
    471       "year": 2024,
    472       "doi": "10.1145/3639476.3639764",
    473       "relevance": "Discusses threats of LLM non-determinism in software engineering, motivating this study's repeated sampling approach."
    474     },
    475     {
    476       "title": "A Survey on Large Language Models for Code Generation",
    477       "authors": ["Juyong Jiang", "Fan Wang"],
    478       "year": 2024,
    479       "arxiv_id": "2406.00515",
    480       "relevance": "Survey of LLM code generation establishing the landscape this paper's hallucination analysis sits within."
    481     },
    482     {
    483       "title": "CodeMirage: Hallucinations in Code Generated by Large Language Models",
    484       "authors": ["Vibhor Agarwal", "Yulong Pei"],
    485       "year": 2024,
    486       "arxiv_id": "2408.08333",
    487       "relevance": "Proposes taxonomy for code hallucinations relevant to the library hallucination categories studied here."
    488     }
    489   ]
    490 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs