scan.json (34290B)
1 { 2 "paper": { 3 "title": "HalluLens: LLM Hallucination Benchmark", 4 "authors": [ 5 "Yejin Bang", 6 "Ziwei Ji", 7 "Alan Schelten", 8 "Anthony Hartshorn", 9 "Tara Fowler", 10 "Cheng Zhang", 11 "Nicola Cancedda", 12 "Pascale Fung" 13 ], 14 "year": 2025, 15 "venue": "Annual Meeting of the Association for Computational Linguistics", 16 "arxiv_id": "2504.17550", 17 "doi": "10.48550/arXiv.2504.17550" 18 }, 19 "scan_version": 3, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "HalluLens introduces a taxonomy distinguishing LLM hallucination (inconsistency with training data or input) from factuality (correctness vs. world knowledge), and proposes three dynamically-generated extrinsic hallucination tasks. Across 13 LLMs, GPT-4o achieves the highest correct answer rate (52.59%) on PreciseWikiQA while Llama-3.1-405B has the lowest hallucination-when-not-refused rate (26.84%) but high false refusal (56.77%), revealing a precision-recall trade-off. The paper demonstrates that TruthfulQA has significant flaws including ~25% potentially incorrect MC1 scores and outdated gold answers. Dynamic test set generation achieves low cross-run variance (average std dev ≤1.01% on PreciseWikiQA and ≤1.85% on LongWiki) while mitigating data leakage.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper provides a GitHub link on the first page: 'Code: https://github.com/facebookresearch/HalluLens'." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The benchmark uses publicly available datasets (GoodWiki, ITIS database, medicine list) and the test sets are dynamically generated via the released code pipeline. The data sources are open-source as noted in Section 3 and Appendix B." 
34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed library version listings are mentioned in the paper." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "While code is released on GitHub, the paper itself contains no step-by-step reproduction instructions or 'Reproducing Results' section." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": true, 50 "justification": "Figure 5 and Figure 7 show error bars from three runs. Section B.1 reports: 'average standard deviation of 0.64%, 1.01%, and 0.56% for false refusal rate, hallucination rate when not refused, and correct answer rate respectively.'" 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "Kendall's τ correlations with statistical significance (p ≤ 0.01) are reported for NonExistentRefusal subtask agreement (Section 3.3.3, Figure 8). However, no significance tests are applied to the main model performance comparisons — claims like 'GPT-4o achieves the highest correct answer scores' are based on comparing point estimates without tests." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Results are reported as absolute percentages with full context (e.g., false refusal rate, hallucination rate, correct answer rate in Table 2), allowing direct assessment of effect magnitudes across models." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "Sample sizes are stated (5,000 for PreciseWikiQA, 250 for LongWiki, 8,000+1,950 for NonExistentRefusal) and the PreciseWikiQA size follows from the 10 difficulty bins × 500 pages structure. However, no formal power analysis or statistical justification for why these sizes are sufficient is provided." 
66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": true, 70 "justification": "Average standard deviations across three trials are reported: 0.64%, 1.01%, and 0.56% (all ≤1.01%) for the PreciseWikiQA metrics (Section B.1), and '1.85%, 0.95%, 1.20%, 0.84%' for LongWiki metrics (Section B.2). Error bars are shown in Figures 5 and 7." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "13 models from multiple families (Llama, Qwen, Gemma, Mistral, Claude, GPT) are evaluated against each other. Existing benchmarks (TruthfulQA, SimpleQA, HHEM, ANAH, FaithEval) are analyzed for comparison in Sections 4-5." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Models evaluated include GPT-4o (2024-08-06), Claude-3-sonnet (2024-03-07), Llama-3.3-70B-Instruct, Qwen2.5, and Gemma-2, which were all contemporary at the time of writing." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": false, 87 "justification": "There is no formal ablation study of the benchmark design components. The round-robin entity generation comparison (Figure 9) and difficulty-level analysis (Figure 4, 6) are sensitivity analyses, not ablations of specific design choices." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Multiple metrics are used per task: PreciseWikiQA uses false refusal rate, hallucination rate when not refused, and correct answer rate (Section 3.1.1). LongWiki adds precision, recall@K, and F1@K (Section 3.2.1). NonExistentRefusal uses false acceptance rate (Section 3.3.1)." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": true, 97 "justification": "Multiple human evaluations validate the pipeline: 250 samples annotated for gold answer quality (97.2% correct, Section B.1), 440 responses for evaluator agreement (94.77%, Section B.3), 500 claims for LongWiki pipeline validation (76.8% agreement, Section B.2)." 
98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Test sets are dynamically generated at evaluation time and not used for any tuning or selection decisions. No model fine-tuning is performed — all models are evaluated in their released form." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down by task, subtask (MixedEntities vs GeneratedEntities in Table 4/7), domain (animal, plant, bacteria, medicine in Table 7), difficulty level (10 bins in Figure 4/6), and place frequency (Figure 11)." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Failure patterns are discussed qualitatively: Claude models expressing uncertainty while still accepting non-existent entities (Section 3.3.3), Gemma blanket-refusing medicine queries (Table 7), retrieval failures in LongWiki pipeline (15.4% failure rate, Section B.2), and specific TruthfulQA failure examples (Section 5.1)." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The LongWiki evaluation pipeline achieves only 76.8% agreement with human annotators (Section B.2). 5% of claims are not verifiable within Wikipedia. The retrieval step fails 15.4% of the time. These limitations of their own approach are reported transparently." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims three contributions: (1) clear hallucination taxonomy — supported by Section 2; (2) new extrinsic hallucination tasks with dynamic generation — supported by Section 3 with results in Tables 2-4; (3) comprehensive analysis of existing benchmarks — supported by Sections 4-5." 
125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper makes causal interpretations without adequate justification: 'places with middle-level frequency are closer to the knowledge boundary, causing the model to be uncertain' (Section B.3.2), 'RLHF can incur an alignment tax' (Section 2.3). These are stated as causal mechanisms but are observational interpretations without experimental manipulation." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title 'LLM Hallucination Benchmark' implies broad applicability, but the benchmark tests only English-language models, uses Wikipedia as the primary knowledge proxy, and covers limited domains. The paper notes 'extending this work to other languages is important' (Section 6) but does not bound the main claims to the tested setting." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not systematically consider alternative explanations for model performance differences. For instance, using Llama-3.1-70B as the judge to evaluate Llama-family models could introduce evaluation bias. The effect of different instruct-tuning strategies on refusal patterns is noted (Section 3.3.3) but not explored as a confound." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "A central contribution is distinguishing the proxy (benchmark performance) from the construct (hallucination). The paper explicitly separates hallucination from factuality (Section 2.1), acknowledges that Wikipedia is a proxy for training data (not the actual training data), and notes the 5% gap where claims are not verifiable in Wikipedia." 
145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "Specific model versions are listed with snapshot dates for commercial models: 'Claude-3-haiku (2024-02-29), Claude-3-sonnet (2024-03-07), and GPT-4o (2024-08-06)' (Section 3). Open-source models are specified with exact names and sizes (e.g., 'Llama-3.1-8B-Instruct', 'Qwen2.5-7B-Instruct')." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full prompt text is provided in Appendix D for all pipeline stages: question generation (D.1), answerability judgment, hallucination judgment, abstention judgment, LongWiki generation (D.2), NonExistentRefusal evaluation (D.3), and the 10 inference prompt templates (D.3.1)." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section B states: 'We evaluate all models under the same decoding setup, using a temperature of zero and top-p of one.' Section 3.2.2 states 'maximum length of 1024 tokens' for LongWiki." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The benchmark directly queries models via standard inference." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "The full pipeline is documented: GoodWiki source selection with harmonic centrality binning (Section 3.1.2), question generation with answerability filtering, answer length filtering (>10 words removed), difficulty control (levels 5-9 for LongWiki), ITIS database for MixedEntities, and Brave Search verification for GeneratedEntities (Section 3.3.2)." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": false, 178 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. 
Limitations are discussed in subsections throughout (B.2 discussion, B.3 discussion) but not in a consolidated section." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Specific threats are discussed: Wikipedia covers only a fraction of training data (5% unverifiable claims, Section B.2), reference retrieval fails 15.4% of the time (Section B.2), LLM-evaluator agreement with humans is 76.8% for LongWiki (Section B.2), and Gemma's blanket medicine refusal biases results (Table 7, Section B.3.1)." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "The paper does not explicitly state what the benchmark does NOT measure or which populations/settings are excluded. The English-only limitation is acknowledged only in passing in the Related Work section, and no explicit scope boundaries are drawn for the main results." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "While the code pipeline is released, the raw model outputs and generated test sets from the reported experiments are not made available for independent verification." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Data collection is thoroughly described: GoodWiki dataset of 44,754 Wikipedia pages (Section 3.1.2), ITIS database with 145,000+ taxonomic records (Section 3.3.2), medicine list covering 250,000 drugs, harmonic centrality-based difficulty binning, and entity generation via round-robin with three LLMs." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants are recruited. Data sources are public databases and benchmarks." 
206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The full pipeline is documented with counts at each stage: 44,754 GoodWiki pages → 5,000 pages sampled (500 per difficulty bin) → 5,000 QA pairs for PreciseWikiQA; 250 prompts for LongWiki (50 per difficulty level 5-9); 8,000 MixedEntities + 1,950 GeneratedEntities. Filtering criteria (answer length ≤10 words, answerability check) are specified." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No explicit funding disclosure appears in the paper. The acknowledgments thank individuals but do not mention grants, corporate sponsorship, or funding agencies." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: 'FAIR at Meta, GenAI at Meta, HKUST' with footnote 'Work done during Internship at FAIR.' Meta employees evaluating Llama models (Meta's product) is evident from the author list." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "Meta researchers evaluate Meta's own Llama models alongside competitors. Meta has a financial interest in Llama models performing well on hallucination benchmarks. The funder is not independent of the outcome." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests statement or financial disclosure appears in the paper, despite the clear conflict of Meta employees evaluating Meta products." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": true, 238 "answer": false, 239 "justification": "Training cutoff dates for the evaluated models are not stated. 
The paper notes GoodWiki uses Wikipedia pages from September 2023 and assumes Wikipedia is in training data, but does not state when each model's training data ends." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": true, 243 "answer": true, 244 "justification": "Train/test overlap is a central concern of the paper. Section 1 discusses how 'static test sets are especially vulnerable to obsolescence' and Section 2.4 identifies 'robustness against unintentional data leakage' as a key benchmark criterion. The dynamic generation approach is specifically designed to prevent overlap." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": true, 248 "answer": true, 249 "justification": "Benchmark contamination is extensively addressed. The paper introduces dynamic test generation to prevent contamination (Section 3), analyzes contamination of existing benchmarks like TruthfulQA (Section 5.1, noting 'it is now saturated due to inclusion in training data'), and cites Deng et al. (2024) on widespread contamination of benchmarks." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in the study. Human annotations are used for pipeline validation, not as a human subjects study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants — the paper evaluates LLM outputs using automated pipelines with human validation of the pipeline itself." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in the study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in the study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in the study." 
277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in the study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in the study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No inference cost, API spend, tokens consumed, or wall-clock time is reported for running the benchmark, despite evaluating 13 models across thousands of test instances with LLM-based evaluation." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No computational budget is stated. The benchmark requires generating dynamic test sets, running inference on 13 models, and running LLM-based evaluation, but the total compute cost is not quantified." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": true, 305 "justification": "Results are reported across three trials with standard deviations. Section B.1: 'average standard deviation of 0.64%, 1.01%, and 0.56%'. Section B.2: 'average standard deviation across the models of 1.85%, 0.95%, 1.20%, 0.84%'. Temperature is set to 0, so variance comes from dynamic test generation." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "Explicitly stated: 'Each experiment consists of three trials' (Section B). Table 2 caption: 'average of three trials of evaluation.'" 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": true, 315 "justification": "No hyperparameter search was performed — fixed standard settings (temperature=0, top-p=1) are used uniformly across all models (Section B), making the search budget transparently zero." 
316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "No configuration selection is performed. All models are evaluated with the same fixed settings (temperature=0, top-p=1), eliminating selection bias." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "13 models are compared across multiple tasks and metrics with no correction for multiple comparisons. Kendall's τ correlations report significance (Figure 8) but without family-wise error rate correction across the many comparisons made." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "Meta researchers evaluate Meta's Llama models alongside competitors but do not acknowledge or address the potential bias of evaluating their own models. Additionally, using Llama-3.1-70B as the evaluation judge for all models including Llama variants is not discussed as a potential bias." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "Models range from 7B to 405B parameters with vastly different compute requirements, but performance is not reported as a function of compute budget. No cost-normalized comparisons are provided." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": true, 340 "justification": "This is a central contribution. Section 2 extensively discusses what hallucination means vs factuality, Section 5 analyzes construct validity of TruthfulQA (showing it measures factuality, not hallucination), and Section 5.2 discusses when factuality benchmarks can/cannot serve as hallucination benchmarks." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No scaffolding is used — models are directly queried via standard inference." 
346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": true, 352 "justification": "Temporal leakage is a central concern. The paper introduces dynamic test generation specifically to prevent it (Section 3): 'To reduce the risk of test sets being memorized or leaked, we dynamically generate new test questions during evaluation.' The GoodWiki source date (September 2023) is disclosed." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the evaluation setup (e.g., the phrasing of prompts, the structure of questions) could leak information about expected answers to the tested models." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of whether generated test questions share structural patterns that could correlate with training data exposure. Questions are drawn from Wikipedia pages that may have overlapping content or similar structures." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": true, 367 "justification": "Dynamic test generation serves as a concrete leakage prevention method. For NonExistentRefusal, entities are verified via Brave Search API to confirm non-existence (Section 3.3.2). The paper also demonstrates leakage in TruthfulQA as evidence for the need for dynamic approaches." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Existing hallucination benchmarks conflate hallucination with factuality, requiring distinct evaluation approaches with separate taxonomies.", 374 "evidence": "Section 2 provides a detailed taxonomy distinguishing extrinsic/intrinsic hallucination from factuality. 
Section 5.1 demonstrates TruthfulQA is primarily a factuality benchmark, with ~200 samples (~25% of test set) scored as incorrect by MC1 that could be factually correct.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Dynamic test set generation produces stable evaluation results with low variance across runs.", 379 "evidence": "Three trials show average std dev of 0.64%/1.01%/0.56% for PreciseWikiQA metrics (Section B.1), 1.85%/0.95%/1.20%/0.84% for LongWiki metrics (Section B.2), and consistent rankings across NonExistentRefusal trials (Figure 7).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "GPT-4o achieves the highest correct answer rate (52.59%) on PreciseWikiQA among tested models.", 384 "evidence": "Table 2 shows GPT-4o at 52.59% correct answer rate, 4.13% false refusal, and 45.15% hallucination when not refused, averaged over three trials.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Larger models within the same family tend to hallucinate less and refuse less frequently.", 389 "evidence": "Tables 2-4 show trends: Llama-3.1 8B→70B→405B shows decreasing hallucination rates (48.37%→37.30%→26.84%). Qwen 7B→14B and Gemma 9B→27B show similar patterns. 
However, Llama-3.3-70B deviates with higher false acceptance than 3.1-405B (40.82% vs 6.88%).", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "TruthfulQA contains inaccurate gold answers and the MC1 log-probability metric produces misleading results.", 394 "evidence": "Section 5.1 provides specific examples: incorrect gold answers (kindergarten importance, cannabis legality in Asia), subjective/ambiguous prompts ('big' oil reserves), and the MC1 tone-matching problem where correct model responses score as incorrect because the false option matches the model's generation tone better.", 395 "supported": "strong" 396 }, 397 { 398 "claim": "Models' false acceptance rate varies with knowledge boundary proximity — middle-frequency places show the highest false acceptance rates.", 399 "evidence": "Figure 11 shows the pattern across N-gram frequency groups (low, middle, high). The explanation is that middle-frequency places are closer to the model's knowledge boundary, making the model uncertain (Section B.3.2).", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "The automatic LLM evaluator achieves high agreement with human judgments across all tasks.", 404 "evidence": "Evaluator accuracy: 96.67% for abstention judgment and 95.56% for correctness judgment in PreciseWikiQA (Section B.1), 94.77% agreement for NonExistentRefusal (Section B.3), but only 76.8% for LongWiki claim verification (Section B.2).", 405 "supported": "moderate" 406 } 407 ], 408 "red_flags": [ 409 { 410 "flag": "Conflict of interest — Meta evaluating Meta models", 411 "detail": "Meta (FAIR/GenAI) researchers evaluate Meta's Llama models alongside competitors. Llama-3.1-405B achieves the best false acceptance rates on NonExistentRefusal (6.88% average), but the conflict is never acknowledged." 412 }, 413 { 414 "flag": "LLM-as-judge circularity", 415 "detail": "Llama-3.1-70B-Instruct is used as the evaluation judge across all tasks (Table 6). 
This same-family model judges other Llama variants, creating potential bias from shared training data or output style preferences. The paper validates judge accuracy on human annotations but does not test for family-specific bias." 416 }, 417 { 418 "flag": "Selective model coverage", 419 "detail": "Only Claude-3-haiku and Claude-3-sonnet are included — not Claude-3-Opus or Claude-3.5-Sonnet (available by April 2025). No Gemini models are tested despite being a major competitor. This selection could favor Meta models by excluding stronger competitors." 420 }, 421 { 422 "flag": "No significance tests for main model comparisons", 423 "detail": "Performance differences between 13 models are presented as rankings based on point estimates (averaged over 3 runs) without any statistical significance tests. Small differences may not be meaningful." 424 }, 425 { 426 "flag": "LongWiki evaluation pipeline has low agreement", 427 "detail": "The LongWiki verification pipeline agrees with human annotators only 76.8% of the time (Section B.2), and the retrieval step fails to find relevant evidence 15.4% of the time. Results from this task should be interpreted with caution." 428 } 429 ], 430 "cited_papers": [ 431 { 432 "title": "Survey of hallucination in natural language generation", 433 "authors": ["Ziwei Ji", "Nayeon Lee", "Rita Frieske"], 434 "year": 2023, 435 "relevance": "Foundational hallucination taxonomy (intrinsic/extrinsic) that HalluLens builds upon and extends to the LLM context." 436 }, 437 { 438 "title": "A survey on hallucination in large language models: Principles, taxonomy, challenges, and open questions", 439 "authors": ["Lei Huang", "Weijiang Yu"], 440 "year": 2023, 441 "arxiv_id": "2311.05232", 442 "relevance": "Major LLM hallucination survey that expanded the definition to include factual errors — HalluLens argues this conflation is problematic." 
443 }, 444 { 445 "title": "Siren's song in the AI ocean: a survey on hallucination in large language models", 446 "authors": ["Yue Zhang", "Yafu Li"], 447 "year": 2023, 448 "arxiv_id": "2309.01219", 449 "relevance": "Another LLM hallucination survey with a different taxonomy that HalluLens critiques for conflating hallucination with factuality." 450 }, 451 { 452 "title": "TruthfulQA: Measuring how models mimic human falsehoods", 453 "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"], 454 "year": 2022, 455 "relevance": "Widely-used benchmark that HalluLens critically analyzes, revealing flawed gold answers and metric limitations." 456 }, 457 { 458 "title": "Measuring short-form factuality in large language models", 459 "authors": ["Jason Wei"], 460 "year": 2024, 461 "arxiv_id": "2411.04368", 462 "relevance": "SimpleQA benchmark for factuality that HalluLens compares against and discusses as a potential hallucination proxy with metric modifications." 463 }, 464 { 465 "title": "The dawn after the dark: An empirical study on factuality hallucination in large language models", 466 "authors": ["Junyi Li", "Jie Chen"], 467 "year": 2024, 468 "relevance": "HaluEval 2.0 benchmark that HalluLens analyzes as a 'factuality hallucination' benchmark overlapping factuality and extrinsic hallucination." 469 }, 470 { 471 "title": "FActScore: Fine-grained atomic evaluation of factual precision in long form text generation", 472 "authors": ["Sewon Min", "Kalpesh Krishna"], 473 "year": 2023, 474 "relevance": "Foundational framework for claim-level factuality evaluation that HalluLens adapts for its LongWiki evaluation pipeline." 475 }, 476 { 477 "title": "VeriScore: Evaluating the factuality of verifiable claims in long-form text generation", 478 "authors": ["Yixiao Song", "Yekyung Kim", "Mohit Iyyer"], 479 "year": 2024, 480 "relevance": "Improved claim extraction and verification method that HalluLens adopts for its LongWiki evaluation pipeline." 
481 }, 482 { 483 "title": "Long-form factuality in large language models", 484 "authors": ["Jerry Wei"], 485 "year": 2024, 486 "arxiv_id": "2403.18802", 487 "relevance": "SAFE evaluator and LongFact benchmark for long-form factuality that HalluLens compares against, noting its reliance on internet search makes it factuality-focused." 488 }, 489 { 490 "title": "FaithEval: Can your language model stay faithful to context", 491 "authors": ["Yifei Ming"], 492 "year": 2024, 493 "relevance": "Intrinsic hallucination benchmark testing faithfulness to noisy or contradictory contexts, included in HalluLens as an existing intrinsic evaluation task." 494 }, 495 { 496 "title": "ANAH-v2: Scaling analytical hallucination annotation of large language models", 497 "authors": ["Yuzhe Gu", "Ziwei Ji"], 498 "year": 2024, 499 "relevance": "Intrinsic hallucination benchmark with reference-based QA evaluation, included as an existing task in HalluLens." 500 }, 501 { 502 "title": "Investigating data contamination in modern benchmarks for large language models", 503 "authors": ["Chunyuan Deng", "Yilun Zhao"], 504 "year": 2024, 505 "relevance": "Demonstrates widespread benchmark contamination in LLM training data, motivating HalluLens's dynamic test generation approach." 506 }, 507 { 508 "title": "Survey on factuality in large language models: Knowledge, retrieval and domain-specificity", 509 "authors": ["Cunxiang Wang"], 510 "year": 2023, 511 "arxiv_id": "2310.07521", 512 "relevance": "Factuality survey that highlights the importance of distinguishing factuality from hallucination, supporting HalluLens's taxonomic contribution." 513 }, 514 { 515 "title": "A multitask, multilingual, multimodal evaluation of ChatGPT on reasoning, hallucination, and interactivity", 516 "authors": ["Yejin Bang"], 517 "year": 2023, 518 "relevance": "Early systematic evaluation of ChatGPT hallucination across tasks, by the same first author, establishing that LLMs generate content from internal knowledge." 
519 } 520 ], 521 "engagement_factors": { 522 "practical_relevance": { 523 "score": 2, 524 "justification": "Practitioners can use the released benchmark suite to evaluate hallucination in their LLM deployments, and the dynamic generation prevents gaming." 525 }, 526 "surprise_contrarian": { 527 "score": 2, 528 "justification": "Challenges the widely-held conflation of hallucination with factuality and demonstrates TruthfulQA has ~25% problematic scores — contrarian to common benchmark usage." 529 }, 530 "fear_safety": { 531 "score": 1, 532 "justification": "Hallucination undermines AI system trust, but this paper measures the problem rather than demonstrating a new attack or existential risk." 533 }, 534 "drama_conflict": { 535 "score": 1, 536 "justification": "The TruthfulQA critique has mild drama ('contains incorrect gold answers'), but the paper maintains a measured academic tone." 537 }, 538 "demo_ability": { 539 "score": 2, 540 "justification": "Code released at github.com/facebookresearch/HalluLens — practitioners can run the benchmark, though setup effort is needed." 541 }, 542 "brand_recognition": { 543 "score": 2, 544 "justification": "From Meta FAIR, a well-known AI lab. Published at ACL, a top NLP venue. Evaluates popular models (GPT-4o, Claude, Llama)." 545 } 546 } 547 }