scan-v5.json (21374B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "HalluLens: LLM Hallucination Benchmark", 6 "authors": [ 7 "Yejin Bang", 8 "Ziwei Ji", 9 "Alan Schelten", 10 "Anthony Hartshorn", 11 "Tara Fowler", 12 "Cheng Zhang", 13 "Nicola Cancedda", 14 "Pascale Fung" 15 ], 16 "year": 2025, 17 "venue": "Annual Meeting of the Association for Computational Linguistics", 18 "arxiv_id": "2504.17550", 19 "doi": "10.48550/arXiv.2504.17550" 20 }, 21 "checklist": { 22 "claims_and_evidence": { 23 "abstract_claims_supported": { 24 "applies": true, 25 "answer": true, 26 "justification": "All abstract claims—new extrinsic tasks, taxonomy distinguishing hallucination from factuality, dynamic test set generation, and analysis of existing benchmarks—are substantiated by experimental results, detailed methodology, and error analysis throughout the paper.", 27 "source": "haiku" 28 }, 29 "causal_claims_justified": { 30 "applies": false, 31 "answer": false, 32 "justification": "The paper is primarily descriptive and evaluative; it reports correlational trends (larger models tend to hallucinate less) without causal inference designs such as ablation studies or controlled experiments claiming X causes Y.", 33 "source": "haiku" 34 }, 35 "generalization_bounded": { 36 "applies": true, 37 "answer": true, 38 "justification": "Claims are generally bounded to the 13 tested models and Wikipedia-grounded tasks; the paper explicitly acknowledges Wikipedia as a proxy for training data and notes this assumption may not hold universally.", 39 "source": "haiku" 40 }, 41 "alternative_explanations_discussed": { 42 "applies": true, 43 "answer": false, 44 "justification": "Performance differences across models are reported without discussing alternative explanations for observed trends (e.g., differences in RLHF strategies, instruction-tuning approaches, context window sizes) beyond high-level attribution to model size.", 45 "source": "haiku" 46 }, 47 
"proxy_outcome_distinction": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper explicitly distinguishes between what is measured (consistency with Wikipedia as training data proxy) and what is claimed (hallucination), directly acknowledging that Wikipedia does not cover the full training corpus of evaluated models.", 51 "source": "haiku" 52 } 53 }, 54 "limitations_and_scope": { 55 "limitations_section_present": { 56 "applies": true, 57 "answer": false, 58 "justification": "There is no dedicated limitations or threats-to-validity section; limitations are mentioned inline across appendices and discussion paragraphs but are never consolidated in a named section.", 59 "source": "haiku" 60 }, 61 "threats_to_validity_specific": { 62 "applies": true, 63 "answer": true, 64 "justification": "Specific threats are discussed inline: LLM judge accuracy (96.67% abstention, 95.56% correctness), 76.8% pipeline-human alignment for LongWiki, 15.4% retrieval failure rate, and 5% of model-generated claims unverifiable within Wikipedia pages.", 65 "source": "haiku" 66 }, 67 "scope_boundaries_stated": { 68 "applies": true, 69 "answer": false, 70 "justification": "The paper does not formally state what results do NOT show; it focuses on English LLMs without explicitly bounding what conclusions cannot be drawn, though it mentions multilingual extension as future work.", 71 "source": "haiku" 72 } 73 }, 74 "conflicts_of_interest": { 75 "funding_disclosed": { 76 "applies": true, 77 "answer": false, 78 "justification": "No funding source is disclosed in the paper; the acknowledgment section thanks individuals by name but contains no mention of grants, contracts, or institutional research funding.", 79 "source": "haiku" 80 }, 81 "affiliations_disclosed": { 82 "applies": true, 83 "answer": true, 84 "justification": "Author affiliations are clearly disclosed on the first page: FAIR at Meta, GenAI at Meta, and HKUST.", 85 "source": "haiku" 86 }, 87 
"funder_independent_of_outcome": { 88 "applies": true, 89 "answer": false, 90 "justification": "Authors are Meta employees (FAIR and GenAI divisions) and the benchmark evaluates Meta's own Llama models (Llama 3.1, 3.3); no competing interests statement addresses this potential bias in benchmark design or result interpretation.", 91 "source": "haiku" 92 }, 93 "financial_interests_declared": { 94 "applies": true, 95 "answer": false, 96 "justification": "There is no competing interests statement or declaration of financial interests (patents, equity, consulting) anywhere in the paper.", 97 "source": "haiku" 98 } 99 }, 100 "scope_and_framing": { 101 "key_terms_defined": { 102 "applies": true, 103 "answer": true, 104 "justification": "Key terms are explicitly and carefully defined: hallucination (intrinsic vs extrinsic), factuality, false refusal rate, hallucination rate, false acceptance rate, and the distinction between hallucination and factuality is elaborated across multiple sections with examples.", 105 "source": "haiku" 106 }, 107 "intended_contribution_clear": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper explicitly states a threefold contribution: (1) clear hallucination taxonomy distinguishing from factuality, (2) new extrinsic hallucination tasks with dynamic regeneration, (3) comprehensive analysis of existing benchmarks highlighting their limitations.", 111 "source": "haiku" 112 }, 113 "engagement_with_prior_work": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper extensively engages with prior work—critiquing TruthfulQA's taxonomy and gold answers, comparing with SimpleQA, integrating FaithEval and ANAH 2.0, and situating new tasks relative to FactScore, VeriScore, and SAFE frameworks.", 117 "source": "haiku" 118 } 119 } 120 }, 121 "type_checklist": { 122 "benchmark-creation": { 123 "construct_design": { 124 "construct_validity_argued": { 125 "applies": true, 126 "answer": true, 127 "justification": 
"The paper argues that hallucination (inconsistency with training data) is measurable via Wikipedia-grounded tasks because Wikipedia is assumed to be in training data of most LLMs, providing a valid internal oracle without requiring external factuality verification.", 128 "source": "haiku" 129 }, 130 "difficulty_distribution_characterized": { 131 "applies": true, 132 "answer": true, 133 "justification": "PreciseWikiQA uses harmonic centrality from WikiRank to create 10 difficulty bins (0-9, hardest to easiest), sampling 500 pages per bin; LongWiki selects difficulty levels 5-9 to avoid long-tail knowledge that would cause excessive refusal.", 134 "source": "haiku" 135 }, 136 "ceiling_floor_effects_checked": { 137 "applies": true, 138 "answer": true, 139 "justification": "Results show significant model discrimination—PreciseWikiQA correct answer rates span 8.73% to 52.59%, false acceptance rates span 6.88% to 86.36%—and the paper explicitly notes saturation of HHEM for frontier models as a reason for including additional tasks.", 140 "source": "haiku" 141 }, 142 "human_baseline_included": { 143 "applies": true, 144 "answer": false, 145 "justification": "No human performance baseline is provided for any of the three new benchmark tasks; human evaluation is only used to validate LLM judge accuracy (94.77%, 96.67%), not to establish human-level performance on the tasks themselves.", 146 "source": "haiku" 147 }, 148 "scoring_rubric_justified": { 149 "applies": true, 150 "answer": true, 151 "justification": "The paper justifies including false refusal rate alongside hallucination rate to prevent gaming via abstention, and adopts Recall@K from SAFE to prevent short-response gaming of precision; the specific K=32 is adopted from prior work without independent justification.", 152 "source": "haiku" 153 } 154 }, 155 "robustness": { 156 "contamination_resistance_designed": { 157 "applies": true, 158 "answer": true, 159 "justification": "Dynamic test set generation is the 
central design feature for contamination resistance: questions are generated at evaluation time from Wikipedia, achieving <1.01% standard deviation across three runs; the paper calls this 'ungameable' and discusses it as a primary criterion in Section 2.4.", 160 "source": "haiku" 161 }, 162 "temporal_robustness_discussed": { 163 "applies": true, 164 "answer": true, 165 "justification": "The paper addresses temporal robustness via dynamic generation and explicitly notes that developers can use different Wikipedia versions compatible with model training cutoffs (Appendix B.1); it also acknowledges intrinsic hallucination benchmarks remain static as a limitation.", 166 "source": "haiku" 167 }, 168 "failure_modes_discussed": { 169 "applies": true, 170 "answer": true, 171 "justification": "Failure modes discussed include Wikipedia not covering full training data (5% unverifiable claims), retrieval failures in LongWiki (15.4%), LLM judge limitations (76.8% human alignment), domain-specific model behaviors (Gemma refusing all medicine), and tension between dynamicity and reproducibility.", 172 "source": "haiku" 173 }, 174 "baseline_implementations_provided": { 175 "applies": true, 176 "answer": true, 177 "justification": "Code is available at https://github.com/facebookresearch/HalluLens; all prompts for question generation, evaluation, and inference are provided in Appendix D; models used in the pipeline are documented in Table 6.", 178 "source": "haiku" 179 } 180 }, 181 "documentation": { 182 "dataset_documentation_complete": { 183 "applies": true, 184 "answer": true, 185 "justification": "Data sources (GoodWiki, ITIS database CC0, medicine drug database), collection methodology (two-step LLM generation with filtering), preprocessing steps (chunking, difficulty binning, answerability filtering), and quality validation (97.2% gold answer accuracy on 250 human-annotated samples) are thoroughly documented.", 186 "source": "haiku" 187 }, 188 "licensing_and_access_clear": { 
189 "applies": true, 190 "answer": false, 191 "justification": "The paper provides a GitHub URL but does not state the license for the benchmark code or generated outputs; the underlying ITIS database is cited as CC0 but the benchmark's own licensing terms are not declared in the paper itself.", 192 "source": "haiku" 193 }, 194 "intended_use_specified": { 195 "applies": true, 196 "answer": true, 197 "justification": "The paper explicitly states what the benchmark measures (hallucination, not factuality), provides design criteria for appropriate use (Section 2.4), and warns against conflating hallucination benchmarks with factuality benchmarks as prior work routinely does.", 198 "source": "haiku" 199 } 200 } 201 } 202 }, 203 "claims": [ 204 { 205 "claim": "Dynamic test set generation for PreciseWikiQA achieves <1.01% average standard deviation over three evaluation runs, demonstrating stable and reproducible evaluation.", 206 "evidence": "Three-run stability analysis in Section B.1 reports average SDs of 0.64%, 1.01%, and 0.56% for false refusal rate, hallucination rate when not refused, and correct answer rate respectively.", 207 "supported": "strong" 208 }, 209 { 210 "claim": "GPT-4o achieves the highest correct answer rate (52.59%) on PreciseWikiQA among 13 tested models while maintaining the lowest false refusal rate (4.13%).", 211 "evidence": "Table 2 reports GPT-4o metrics: 4.13% false refusal, 45.15% hallucination when not refused, 52.59% correct answer rate—all best-in-class.", 212 "supported": "strong" 213 }, 214 { 215 "claim": "TruthfulQA contains incorrect gold answers; MC1 evaluation incorrectly judges >200 samples (~25% of the test set) as wrong when responses may be factually correct.", 216 "evidence": "Error analysis on Llama-3.1-405B using full responses in Section 5.1, with six specific examples of flawed ground truth provided.", 217 "supported": "moderate" 218 }, 219 { 220 "claim": "Larger models within the same family generally achieve lower 
hallucination rates, though this trend does not hold consistently across different model families.", 221 "evidence": "Tables 2, 3, 4 show consistent within-family trends for Llama, Qwen, Claude, Gemma; exceptions noted (Llama-3.3-70B vs 3.1-70B anomalies discussed explicitly).", 222 "supported": "moderate" 223 }, 224 { 225 "claim": "Llama-3.1-405B-Instruct achieves the lowest false acceptance rate (11.48% MixedEntities, 2.28% GeneratedEntities) on NonExistentRefusal.", 226 "evidence": "Table 4 reports these figures; round-robin stability analysis (Figure 10) confirms rankings are consistent across different entity generator configurations.", 227 "supported": "strong" 228 }, 229 { 230 "claim": "97.2% of automatically generated gold answers for PreciseWikiQA are correct based on human verification.", 231 "evidence": "Human annotation of 250 sample question-answer pairs reported in Appendix B.1; 2% unverifiable and 2 partially correct.", 232 "supported": "moderate" 233 }, 234 { 235 "claim": "The LongWiki automatic evaluation pipeline aligns with human annotators 76.8% of the time when limited to Wikipedia as reference.", 236 "evidence": "Manual annotation of 500 claims compared to pipeline output reported in Appendix B.2; retrieval failures account for 15.4% of misalignments.", 237 "supported": "moderate" 238 } 239 ], 240 "methodology_tags": [ 241 "benchmark-eval", 242 "observational" 243 ], 244 "key_findings": "HalluLens establishes a principled taxonomy distinguishing hallucination (inconsistency with training data or input context) from factuality (absolute correctness against external oracles), addressing widespread conflation in prior work. Three new dynamically-generated extrinsic hallucination benchmarks evaluated across 13 LLMs reveal GPT-4o generally performs best while Mistral models hallucinate most; there is a fundamental tradeoff between refusal rate and hallucination rate that confounds single-metric comparisons. 
TruthfulQA is found to have substantial methodological flaws—approximately 25% of MC1 judgments may be false positives due to incorrect gold answers and evaluation metric problems—and should be reclassified as a factuality rather than hallucination benchmark. Dynamic test set generation achieves stable evaluation (<1.01% standard deviation) while significantly resisting benchmark contamination.", 245 "red_flags": [ 246 { 247 "flag": "No human baseline on benchmark tasks", 248 "detail": "None of the three new benchmark tasks (PreciseWikiQA, LongWiki, NonExistentRefusal) include human performance baselines; it is unclear how models compare to humans or whether difficulty calibration is appropriate." 249 }, 250 { 251 "flag": "Conflict of interest: Meta evaluating Llama", 252 "detail": "Authors from Meta's FAIR and GenAI groups designed a benchmark that evaluates Meta's own Llama models (3.1-8B through 3.1-405B, 3.3-70B); no competing interests statement addresses this potential bias." 253 }, 254 { 255 "flag": "LongWiki pipeline validity (76.8%)", 256 "detail": "The automatic evaluation pipeline for LongWiki agrees with human annotators only 76.8% of the time, raising questions about whether reported hallucination rates are reliable; this low agreement is buried in the appendix." 257 }, 258 { 259 "flag": "Wikipedia proxy assumption unverified", 260 "detail": "All extrinsic hallucination tasks assume Wikipedia is in the training data of all evaluated models; this is unverified and 5% of model-generated claims are already unverifiable within Wikipedia, indicating boundary failures." 261 }, 262 { 263 "flag": "No funding disclosure", 264 "detail": "Despite institutional affiliations with large commercial AI labs (Meta FAIR, Meta GenAI), no funding sources or financial interests are declared anywhere in the paper." 
265 }, 266 { 267 "flag": "No dedicated limitations section", 268 "detail": "Limitations are scattered across appendices and inline discussion without consolidation, making systematic assessment of the benchmark's weaknesses difficult for readers." 269 } 270 ], 271 "cited_papers": [ 272 { 273 "title": "Survey of Hallucination in Natural Language Generation", 274 "relevance": "Foundational taxonomy paper (Ji et al. 2023) defining intrinsic/extrinsic hallucination that HalluLens directly builds upon and extends to LLM context" 275 }, 276 { 277 "title": "TruthfulQA: Measuring How Models Mimic Human Falsehoods", 278 "relevance": "Major benchmark extensively critiqued; paper argues TruthfulQA measures factuality not hallucination and contains incorrect gold answers" 279 }, 280 { 281 "title": "A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions", 282 "relevance": "Huang et al. 2023 survey whose taxonomy conflates hallucination with factuality; contrasted with HalluLens taxonomy as a key motivation" 283 }, 284 { 285 "title": "The Dawn After the Dark: An Empirical Study on Factuality Hallucination in Large Language Models (HaluEval 2.0)", 286 "relevance": "Existing factuality/hallucination benchmark analyzed and reclassified as factuality evaluation in the HalluLens framework" 287 }, 288 { 289 "title": "FaithEval: Can Your Language Model Stay Faithful to Context", 290 "relevance": "Intrinsic hallucination benchmark incorporated into HalluLens for evaluating faithfulness to noisy or counterfactual input contexts" 291 }, 292 { 293 "title": "Measuring Short-Form Factuality in Large Language Models (SimpleQA)", 294 "relevance": "Factuality benchmark compared with PreciseWikiQA; discussed as adaptable to extrinsic hallucination with metric modifications" 295 }, 296 { 297 "title": "FActScore: Fine-grained Atomic Evaluation of Factual Precision in Long Form Text Generation", 298 "relevance": "Claim-level factoid evaluation 
framework whose pipeline was adapted for LongWiki hallucination evaluation with Wikipedia-only retrieval" 299 }, 300 { 301 "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models", 302 "relevance": "Motivates the dynamic test set generation approach; cited as evidence that MMLU and TruthfulQA are likely contaminated" 303 }, 304 { 305 "title": "ANAH: Analytical Annotation of Hallucinations in Large Language Models", 306 "relevance": "Intrinsic hallucination benchmark (ANAH 2.0 with reference setup) incorporated into HalluLens for evaluating hallucination with accurate input contexts" 307 }, 308 { 309 "title": "VeriScore: Evaluating the Factuality of Verifiable Claims in Long-form Text Generation", 310 "relevance": "Precision+recall framework that influenced LongWiki's F1@K metric design; compared to FactScore's precision-only approach" 311 } 312 ], 313 "engagement_factors": { 314 "practical_relevance": { 315 "score": 3, 316 "justification": "Provides runnable code, clear metrics, and directly evaluates 13 production LLMs; practitioners can immediately benchmark their own models against the dynamic evaluation suite." 317 }, 318 "surprise_contrarian": { 319 "score": 2, 320 "justification": "Finding that TruthfulQA—one of the most widely cited benchmarks—has ~25% false positive judgments and measures factuality rather than hallucination challenges widespread benchmark usage." 321 }, 322 "fear_safety": { 323 "score": 1, 324 "justification": "Hallucination undermines AI trust and safety but the paper frames this as an evaluation methodology problem rather than raising alarm about specific deployment risks." 325 }, 326 "drama_conflict": { 327 "score": 2, 328 "justification": "Explicitly critiques TruthfulQA's correctness and methodology, and challenges the field's conflation of hallucination with factuality in a way likely to generate academic pushback." 
329 }, 330 "demo_ability": { 331 "score": 3, 332 "justification": "Code is publicly available on GitHub (facebookresearch/HalluLens); anyone can run the dynamic test set generation against any LLM immediately with provided prompts." 333 }, 334 "brand_recognition": { 335 "score": 3, 336 "justification": "Published by Meta FAIR and Meta GenAI researchers; evaluates high-profile models including GPT-4o, Claude-3, and the full Llama 3 family; accepted at ACL 2025." 337 } 338 }, 339 "hn_data": { 340 "threads": [ 341 { 342 "hn_id": "40201212", 343 "title": "A manufacturable platform for photonic quantum computing", 344 "points": 2, 345 "comments": 0, 346 "url": "https://news.ycombinator.com/item?id=40201212" 347 } 348 ], 349 "top_points": 2, 350 "total_points": 2, 351 "total_comments": 0 352 } 353 }