scan-v5.json (27192B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Library Hallucinations in LLMs: Fabricated Package References in Code", 6 "authors": [ 7 "Lukas Twist", 8 "Jie M. Zhang", 9 "Mark Harman", 10 "Helen Yannakoudakis" 11 ], 12 "year": 2025, 13 "venue": "arXiv", 14 "arxiv_id": "2509.22202", 15 "doi": null 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "All quantitative abstract claims are traceable to tables: 26% one-character misspelling (Table 2, GPT-5-mini TUR=25.86%), 99% fake library acceptance (Table 2, GPT-5-mini TUR=99.22%), 84% time-based hallucination (Table 1, GPT-4o-mini 'From 2025' THR=84.74%).", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": true, 28 "justification": "The study systematically manipulates prompt variables (description type, error degree, mitigation strategy) as controlled independent variables while measuring hallucination rates, which is an adequate design for causal inference about prompt→hallucination relationships.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": false, 34 "justification": "The title and abstract claim findings about 'LLMs' broadly; the Python restriction appears only in the body and is not consistently foregrounded in conclusions. Seven specific models and one benchmark language cannot support broad claims about LLMs generally.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": true, 40 "justification": "The paper discusses training cutoff as a partial explanation for year-based hallucinations (Section 4.1), sycophancy as an explanation for fake-library compliance (Section 4.2), and deprecated knowledge for member hallucinations.", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper measures hallucination rates (import/member presence on PyPI) and frames security implications as consequential arguments rather than direct measurements; the distinction between measured hallucination rates and downstream security risk is maintained.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "Section 6 is a dedicated 'Threats to Validity' section covering both internal and external validity threats with specific discussion.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "Specific threats include: automatic AST extraction validated on 100 random samples, SRSE filter validated with precision/recall=0.95, LLM nondeterminism mitigated by 3 responses per task and pinned versions, and dataset bias from filtering tasks that mention ground-truth libraries.", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper explicitly restricts to Python (Section 3.2), to library name and member hallucinations specifically (as opposed to other hallucination types), and to prompt-only settings excluding RAG and fine-tuning in the main experiments.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": false, 74 "justification": "No funding source is disclosed anywhere in the preprint; there is no acknowledgments section.", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "Author affiliations are disclosed as King's College London and University College London, both independent academic institutions from the evaluated LLM providers.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": false, 86 "justification": "Funding is undisclosed, so independence of funder from outcome cannot be assessed.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests statement or financial interests declaration appears in the paper.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "'Library name hallucinations' (invalid imports) and 'library member hallucinations' (invalid calls from valid libraries) are precisely defined in Section 2; RHR and THR metrics are defined in Table 1 caption.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section 1 lists four explicit contributions: first systematic study of prompt variations on hallucination rates, concrete risks from user errors, novel connection to typosquatting, and practical mitigation insights with released code and LibraryHalluBench.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 2 explicitly positions the work against Spracklen et al. (2024) and Krishna et al. (2025), noting prior work focuses on 'aggregate evaluations without fine-grained understanding of triggers,' and explains how this paper fills that specific gap.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "Code released at https://github.com/itsluketwist/realistic-library-hallucinations, confirmed in Section 1 and the Reproducibility Statement.", 124 "source": "haiku" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "LibraryHalluBench (4,628 prompts with labels and full outputs) is released in the GitHub repository; the seed dataset BigCodeBench is publicly available.", 130 "source": "haiku" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": true, 135 "justification": "The Reproducibility Statement confirms 'The repository specifies exact dependency versions'; Appendix A provides exact model versions, platforms, and parameter configurations.", 136 "source": "haiku" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": true, 141 "justification": "The Reproducibility Statement confirms 'a detailed README.md file describing its contents, structure, installation steps, and usage procedure' is provided.", 142 "source": "haiku" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": false, 149 "justification": "No confidence intervals or error bars are reported in any table; 3 responses per task are generated to mitigate variability but variance across these is never reported.", 150 "source": "haiku" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "Numerous comparative claims are made (one model vs. another, one prompt type vs. baseline) but no statistical significance tests are applied anywhere in the paper.", 156 "source": "haiku" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Percentage hallucination rates are reported with explicit baselines (e.g., 0% for adjective descriptions vs. 84.74% for 'from 2025'), providing effect size context even without formal statistical framing.", 162 "source": "haiku" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "The 356-task evaluation subset is described through filtering steps but no power analysis or justification for why this sample size is adequate for the comparisons made across 7 models and multiple conditions.", 168 "source": "haiku" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": false, 173 "justification": "Three responses per task are generated to mitigate sampling variability, but variance or standard deviation across these responses is never reported in the results tables.", 174 "source": "haiku" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "'No description' is used as a baseline for Experiment 1, and 'None, valid library/member' is the baseline for Experiment 2, both consistently reported alongside experimental conditions.", 182 "source": "haiku" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "The 7 evaluated LLMs include state-of-the-art models released in 2024-2025 (GPT-5-mini, DeepSeek-V3.1, Claude-4.5-Haiku), providing competitive and contemporary comparisons.", 188 "source": "haiku" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": true, 193 "justification": "Each experiment systematically varies one factor at a time: description type in Exp. 1, error degree in Exp. 2, and mitigation strategy in Exp. 3, providing ablation-style analysis of each component's effect.", 194 "source": "haiku" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "Two complementary metrics are reported throughout: RHR (Response Hallucination Rate, per-response) and THR (Task Hallucination Rate, per-task), plus separate analysis of library name vs. member hallucinations.", 200 "source": "haiku" 201 }, 202 "human_evaluation": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human evaluation of system outputs; single-author manual review is used only for data quality validation of unmatched library names, not for evaluating system-generated code quality.", 206 "source": "haiku" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": true, 211 "justification": "10% of the 356-task subset is reserved for preliminary experiments, with main analysis conducted on the remaining 90% (Section 3.2).", 212 "source": "haiku" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Appendix D provides full domain-level hallucination rate breakdowns across 7 BigCodeBench domains (computation, visualisation, general, system, time, network, cryptography) for all experiments.", 218 "source": "haiku" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Appendix C provides concrete case studies showing specific hallucination examples per LLM with full prompts and responses; Section 4.3 and Appendix C also show cases where prompt mitigation strategies increased hallucinations.", 224 "source": "haiku" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Fine-tuning results (Appendix E.1) showed 'minimal impact'; chain-of-thought and step-back prompting 'often fail or even increase hallucination rates' (Section 4.3). Both are clearly reported, not buried.", 230 "source": "haiku" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": true, 237 "justification": "Table 4 in Appendix A reports exact model version identifiers (e.g., 'gpt-4o-mini-2024-07-18', 'claude-haiku-4-5-20251001'), platform, release date, knowledge cutoff, and parameter settings for all 7 models.", 238 "source": "haiku" 239 }, 240 "prompts_provided": { 241 "applies": true, 242 "answer": true, 243 "justification": "Appendix B provides the complete prompt template and all exact library directives used for each experiment variant (Sections B.2.2, B.3.2, B.4); mistake generation system prompts are also provided verbatim.", 244 "source": "haiku" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": true, 249 "justification": "Table 4 reports temperature and top-p values for all 7 models; the paper notes default API configurations were used and these were manually set for reproducibility.", 250 "source": "haiku" 251 }, 252 "scaffolding_described": { 253 "applies": false, 254 "answer": false, 255 "justification": "No agentic scaffolding is used; models are queried via direct API calls in fresh sessions, which is explicitly described in Section 3.1.", 256 "source": "haiku" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "Dataset filtering steps are described in Section 3.2 (removing tasks mentioning ground-truth library, requiring accessible documentation); SRSE analysis pipeline is detailed in Appendix B.2.1 including filtering criteria with precision/recall validation.", 262 "source": "haiku" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": true, 269 "justification": "The GitHub repository contains 'prompts, labels, and full outputs' for all experiments per the Reproducibility Statement and Section 1 contribution description.", 270 "source": "haiku" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "SRSE scraping and filtering are described with precision/recall metrics (Section B.2.1); hallucination detection using PyPI comparison, package name normalization, and documentation scraping is described in Section 3.6.", 276 "source": "haiku" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants; standard public benchmarks (BigCodeBench, SRSE) are used without recruitment.", 282 "source": "haiku" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": true, 287 "justification": "The full pipeline from SRSE scraping → filtering → n-gram clustering → prompt generation → LLM querying → AST extraction → PyPI/documentation comparison is documented across Sections 3 and Appendix B.", 288 "source": "haiku" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": true, 295 "justification": "Table 4 lists training data cutoffs for all 7 models where available (e.g., GPT-4o-mini Oct.'23, GPT-5-mini May'24, DeepSeek-V3.1 July'25); some are marked '–' as unavailable.", 296 "source": "haiku" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 3.2 explicitly notes BigCodeBench tasks were 'rephrased with the NL-Augmenter framework, reducing the risk of data leakage'; the threats to validity section also addresses this, noting residual leakage would make findings conservative.", 302 "source": "haiku" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 "answer": true, 307 "justification": "The paper addresses potential contamination by noting BigCodeBench's rephrasing methodology and that any residual leakage 'is likely to only cause fewer hallucinations, making our findings conservative' (Section 6).", 308 "source": "haiku" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants.", 316 "source": "haiku" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants.", 322 "source": "haiku" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants.", 328 "source": "haiku" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants.", 334 "source": "haiku" 335 }, 336 "randomization_described": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human participants.", 340 "source": "haiku" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants.", 346 "source": "haiku" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants.", 352 "source": "haiku" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": false, 359 "justification": "No inference costs or API costs are reported despite querying 7 models across hundreds of tasks × 3 responses × multiple conditions (thousands of API calls total).", 360 "source": "haiku" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": false, 365 "justification": "No total computational budget is stated; the paper uses API-based models but reports no cost estimates.", 366 "source": "haiku" 367 } 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Year-based prompts ('from 2025') trigger library name hallucinations in up to 84% of tasks, while adjective-based descriptions cause near-zero hallucinations.", 374 "evidence": "Table 1: GPT-4o-mini THR=84.74% for 'From 2025'; adjective-based descriptions consistently ≈0% across all LLMs.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "One-character misspellings of library names trigger hallucinations in up to 26% of tasks.", 379 "evidence": "Table 2: GPT-5-mini TUR=25.86% for one-character misspellings; other models range 0.31%–4.52%.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Fake library names are accepted and used by LLMs in up to 99% of tasks.", 384 "evidence": "Table 2: GPT-5-mini TUR=99.22%, GPT-4o-mini TUR=92.52%, Ministral-8b TUR=90.50% for fake library names.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "LLMs are substantially more robust to incorrect library members than incorrect library names.", 389 "evidence": "Table 2: Average task usage rate for fake library members ~34% vs ~68% for fake library names; member misspellings show less clear scaling with error severity.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "Chain-of-thought and step-back prompting frequently worsen library hallucination rates rather than reducing them.", 394 "evidence": "Table 3: CoT increased hallucinations in many configurations (e.g., Qwen-2.5-Coder 'From 2025' +15.16%, fake library +28.66%); step-back showed similar pattern.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "Explicit-check and self-analysis prompt strategies are the most effective mitigations, reducing hallucinations in 30/36 and 29/36 tested instances respectively.", 399 "evidence": "Table 3 and Section 4.3 summarize per-instance success counts; explicit-check reduced GPT-5-mini fake library hallucination by 32.19pp.", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "Rarity-seeking language progressively increases hallucination rates: 'hidden gem' (5.35% avg) > 'not widely used' (3.01%) > 'lesser known' (1.92%).", 404 "evidence": "Table 5 in Appendix B.4 shows this gradient across all 7 LLMs; effect is modest but consistent.", 405 "supported": "moderate" 406 } 407 ], 408 "methodology_tags": [ 409 "benchmark-eval", 410 "observational" 411 ], 412 "key_findings": "This paper provides the first systematic study of how developer-inspired prompt variations affect library hallucination rates across 7 LLMs. Year-based prompts ('from 2025') cause hallucinations in up to 84% of tasks while adjective-based descriptions are largely ignored, and even single-character typos trigger hallucinations in up to 26% of tasks with fake library names accepted in up to 99% of cases. LLMs appear to prioritize user compliance (sycophancy) over factual accuracy when presented with plausible-sounding erroneous library names. Prompt engineering mitigation shows inconsistent results: reasoning-oriented strategies (chain-of-thought, step-back) frequently worsen hallucination rates, while explicit-check and self-analysis strategies provide modest but model-dependent improvements, and fine-tuning on a small dataset yielded largely minimal gains.", 413 "red_flags": [ 414 { 415 "flag": "Single-author manual validation", 416 "detail": "Hallucination detection for unmatched libraries relied on 'a web-based search by a single author' with no inter-rater reliability check, introducing potential for inconsistent labeling." 417 }, 418 { 419 "flag": "No statistical significance testing", 420 "detail": "Extensive comparative claims across models and conditions are made without any significance tests, making it impossible to distinguish meaningful differences from noise in a study with 3 responses per task." 421 }, 422 { 423 "flag": "Variance across responses unreported", 424 "detail": "Three responses per task are generated to mitigate sampling variability, but variance across these responses is never reported, obscuring how stable the reported hallucination rates are." 425 }, 426 { 427 "flag": "Overly broad security claims", 428 "detail": "The paper frames hallucination rates as evidence of typosquatting and slopsquatting risk without measuring actual downstream security exploitation or attacker behavior; the connection is plausible but not empirically validated." 429 }, 430 { 431 "flag": "No sample size justification", 432 "detail": "The 356-task evaluation subset (320 after the 10% holdout) is described through filtering logic but no power analysis justifies this as sufficient for comparing across 7 models and ~30 conditions." 433 } 434 ], 435 "cited_papers": [ 436 { 437 "title": "We Have a Package for You! A Comprehensive Analysis of Package Hallucinations by Code Generating LLMs", 438 "relevance": "Direct prior work on library hallucinations; this paper explicitly extends its aggregate evaluation approach" 439 }, 440 { 441 "title": "Importing Phantoms: Measuring LLM Package Hallucination Vulnerabilities", 442 "relevance": "Key related work on package hallucination measurement that this paper situates against" 443 }, 444 { 445 "title": "BigCodeBench: Benchmarking Code Generation with Diverse Function Calls and Complex Instructions", 446 "relevance": "Primary evaluation dataset used throughout all experiments" 447 }, 448 { 449 "title": "Hallucination by Code Generation LLMs: Taxonomy, Benchmarks, Mitigation, and Challenges", 450 "relevance": "Taxonomy of code hallucination types relevant to this paper's categorization" 451 }, 452 { 453 "title": "LLM Hallucinations in Practical Code Generation: Phenomena, Mechanism, and Mitigation", 454 "relevance": "Related empirical study on code hallucination phenomena" 455 }, 456 { 457 "title": "Beyond Typosquatting: An In-Depth Look at Package Confusion", 458 "relevance": "Foundational work on typosquatting attack patterns that LLM hallucinations may amplify" 459 }, 460 { 461 "title": "Towards Understanding Sycophancy in Language Models", 462 "relevance": "Explains the sycophancy mechanism the authors invoke to explain LLM compliance with fake library names" 463 }, 464 { 465 "title": "A Study of LLMs' Preferences for Libraries and Programming Languages", 466 "relevance": "Prior work by same authors on LLM library preferences, directly informing Experiment 1 design" 467 } 468 ], 469 "engagement_factors": { 470 "practical_relevance": { 471 "score": 3, 472 "justification": "Directly actionable for any developer using LLMs for code generation: identifies specific prompt patterns (year-based, rarity-seeking, typos) that trigger security-relevant failures." 473 }, 474 "surprise_contrarian": { 475 "score": 2, 476 "justification": "Counterintuitive finding that chain-of-thought reasoning worsens hallucinations, and that LLMs are not robust to minor typos in library names despite robustness to NL misspellings." 477 }, 478 "fear_safety": { 479 "score": 3, 480 "justification": "Explicitly demonstrates a pathway from LLM hallucinations to supply chain attacks (slopsquatting, typosquatting) with quantified rates, a current active security concern." 481 }, 482 "drama_conflict": { 483 "score": 2, 484 "justification": "Challenges the assumption (cited in paper) that LLMs are robust to minor errors; findings could be newsworthy given widespread LLM coding tool adoption." 485 }, 486 "demo_ability": { 487 "score": 3, 488 "justification": "Any user can immediately test by asking an LLM to use a library 'from 2025' or specifying a plausible-sounding fake library name and observing compliance." 489 }, 490 "brand_recognition": { 491 "score": 1, 492 "justification": "Authors from King's College London and UCL, respected institutions but no major AI lab brand; paper evaluates branded LLMs from OpenAI, Anthropic, Meta, etc." 493 } 494 }, 495 "hn_data": { 496 "threads": [ 497 { 498 "hn_id": "45417070", 499 "title": "Small Near-Earth Objects in the Taurid Resonant Swarm", 500 "points": 34, 501 "comments": 1, 502 "url": "https://news.ycombinator.com/item?id=45417070", 503 "created_at": "2025-09-29T18:25:10Z" 504 }, 505 { 506 "hn_id": "44862960", 507 "title": "Tribe: TRImodal Brain Encoder for whole-brain fMRI response prediction", 508 "points": 2, 509 "comments": 0, 510 "url": "https://news.ycombinator.com/item?id=44862960", 511 "created_at": "2025-08-11T11:20:29Z" 512 }, 513 { 514 "hn_id": "44736074", 515 "title": "Combolutional Neural Networks [pdf]", 516 "points": 2, 517 "comments": 0, 518 "url": "https://news.ycombinator.com/item?id=44736074", 519 "created_at": "2025-07-30T16:10:46Z" 520 } 521 ], 522 "top_points": 34, 523 "total_points": 38, 524 "total_comments": 1 525 } 526 }