scan.json (17566B)
1 { 2 "paper": { 3 "title": "A Survey of Hallucination in \"Large\" Foundation Models", 4 "authors": ["Vipula Rawte", "Amit Sheth", "Amitava Das"], 5 "year": 2023, 6 "venue": "arXiv", 7 "arxiv_id": "2309.05922", 8 "doi": "10.48550/arXiv.2309.05922" 9 }, 10 "scan_version": 2, 11 "active_modules": ["survey_methodology"], 12 "methodology_tags": ["meta-analysis"], 13 "key_findings": "This survey provides the first comprehensive taxonomy of hallucination across all major foundation model modalities (text, image, video, audio), not just LLMs. It catalogs detection methods, mitigation strategies, datasets, and evaluation metrics across ~25 papers. The paper classifies hallucination types by modality and covers domain-specific hallucination in medicine and law. Future directions proposed include automated hallucination evaluation, knowledge graph integration, and human-AI collaboration for detection.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper provides a GitHub link for open-source resources: https://github.com/vr25/hallucination-foundation-model-survey (Section 1.3.1)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No structured dataset of surveyed papers, extracted metadata, or analysis data is released. The GitHub repo is mentioned for resources but no analysis dataset is provided." 25 }, 26 "environment_specified": { 27 "applies": false, 28 "answer": false, 29 "justification": "This is a survey paper with no computational experiments requiring an environment specification." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No instructions are provided for reproducing the survey's paper selection process or analysis methodology." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": false, 40 "answer": false, 41 "justification": "Survey paper with no original experiments or statistical analyses." 42 }, 43 "significance_tests": { 44 "applies": false, 45 "answer": false, 46 "justification": "Survey paper with no original experiments or statistical analyses." 47 }, 48 "effect_sizes_reported": { 49 "applies": false, 50 "answer": false, 51 "justification": "Survey paper with no original experiments or statistical analyses." 52 }, 53 "sample_size_justified": { 54 "applies": false, 55 "answer": false, 56 "justification": "Survey paper with no original experiments or statistical analyses." 57 }, 58 "variance_reported": { 59 "applies": false, 60 "answer": false, 61 "justification": "Survey paper with no original experiments or statistical analyses." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": false, 68 "justification": "The survey does not compare itself against prior surveys in a structured way. It mentions Ji et al. (2023) and Zhang et al. (2023c) as prior surveys but does not systematically compare coverage, scope, or quality." 69 }, 70 "baselines_contemporary": { 71 "applies": false, 72 "answer": false, 73 "justification": "No experimental evaluation is conducted." 74 }, 75 "ablation_study": { 76 "applies": false, 77 "answer": false, 78 "justification": "No system with components to ablate; this is a survey." 79 }, 80 "multiple_metrics": { 81 "applies": false, 82 "answer": false, 83 "justification": "No experimental evaluation is conducted." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "No system outputs to evaluate; this is a survey." 89 }, 90 "held_out_test_set": { 91 "applies": false, 92 "answer": false, 93 "justification": "No experimental evaluation is conducted." 
94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 1 provides a detailed per-paper breakdown across multiple dimensions (detection, mitigation, tasks, datasets, evaluation metrics) organized by modality (text, image, video, audio)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "The survey does not discuss where its own approach fails or what types of hallucination research it may have missed." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": false, 108 "justification": "No negative findings or failed approaches are reported. The survey presents all covered work without critical assessment of what doesn't work." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims to provide 'an extensive overview' of hallucination across LFMs, classify types, establish evaluation criteria, and examine mitigation strategies. The paper does cover these topics across Sections 2-7." 116 }, 117 "causal_claims_justified": { 118 "applies": false, 119 "answer": false, 120 "justification": "The paper makes no causal claims; it is a descriptive survey." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims coverage of hallucination in 'Large Foundation Models' broadly, but the actual coverage is limited to ~25 papers mostly from March-September 2023. The paper does not bound its scope to this time period or acknowledge gaps." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": false, 129 "answer": false, 130 "justification": "Pure survey/taxonomy paper presenting no empirical results of its own." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": false, 134 "answer": false, 135 "justification": "No measurements or proxy outcomes; this is a survey paper." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": false, 141 "answer": false, 142 "justification": "Survey paper; no models are used for experiments." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "No prompting is used; this is a survey." 148 }, 149 "hyperparameters_reported": { 150 "applies": false, 151 "answer": false, 152 "justification": "Survey paper; no experiments are conducted." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "No paper selection pipeline or search methodology is documented. It is unclear how the ~25 surveyed papers were identified or whether a systematic search was conducted." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no limitations section. The paper goes directly from Section 6 (hallucination not always harmful) to Section 7 (conclusion and future directions) without discussing limitations of the survey itself." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed for the survey methodology." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what it excludes. The temporal boundary (March-September 2023 per Fig. 2) is shown but not discussed as a limitation. No mention of excluded modalities, languages, or paper types." 
180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data (e.g., complete list of papers considered, search queries, inclusion/exclusion decisions) is available for verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": false, 191 "justification": "The paper does not describe how papers were identified, what databases were searched, or what search terms were used." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants; the data source is published papers, and paper selection is not a recruitment scenario." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "No pipeline is documented. There is no description of how papers were found, screened, or selected for inclusion." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is provided anywhere in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are listed: all from AI Institute, University of South Carolina, USA." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is included." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "Survey paper; does not evaluate any pre-trained model on a benchmark."
231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Survey paper; no model evaluation." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Survey paper; no model evaluation." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "Survey paper; no computational method of its own." 285 }, 286 "compute_budget_stated": { 287 "applies": false, 288 "answer": false, 289 "justification": "Survey paper; no computational experiments." 290 } 291 }, 292 "survey_methodology": { 293 "prisma_or_structured_protocol": { 294 "applies": true, 295 "answer": false, 296 "justification": "No PRISMA diagram, no structured search protocol, no reproducible search queries. The paper appears to be an ad-hoc collection of papers without a systematic review methodology." 
297 }, 298 "quality_assessment_of_sources": { 299 "applies": true, 300 "answer": false, 301 "justification": "The survey treats all included papers equally. Table 1 catalogs features (detection, mitigation, tasks, datasets, metrics) but does not assess the methodological quality or rigor of any source paper." 302 }, 303 "publication_bias_discussed": { 304 "applies": true, 305 "answer": false, 306 "justification": "No discussion of publication bias. The survey does not consider whether its sources are biased toward positive results or whether negative results on hallucination mitigation are underrepresented." 307 } 308 } 309 }, 310 "claims": [ 311 { 312 "claim": "This is the first comprehensive survey of hallucination across all major modalities of foundation models (text, image, video, audio).", 313 "evidence": "Section 1.3 states prior surveys covered only NLG (Ji et al., 2023) or LLMs (Zhang et al., 2023c), while this survey covers text, image, video, and audio.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "57.0% of model-generated sentences in video captioning contain factual errors.", 318 "evidence": "Section 4 cites Liu and Wan (2023) for this statistic based on human evaluation.", 319 "supported": "moderate" 320 }, 321 { 322 "claim": "InstructBLIP exhibits a 30% rate of hallucinatory text including non-existent objects, inaccurate descriptions, and erroneous relationships.", 323 "evidence": "Section 3 cites Gunjal et al. (2023) for this finding.", 324 "supported": "moderate" 325 } 326 ], 327 "red_flags": [ 328 { 329 "flag": "No systematic review methodology", 330 "detail": "The survey has no documented search strategy, no inclusion/exclusion criteria, no PRISMA flow diagram, and no description of how papers were found. It is impossible to assess completeness or reproduce the paper selection." 331 }, 332 { 333 "flag": "No quality assessment of sources", 334 "detail": "All surveyed papers are presented uncritically. 
No assessment of methodological rigor means the survey may be laundering weak results alongside strong ones." 335 }, 336 { 337 "flag": "Very narrow temporal coverage presented as comprehensive", 338 "detail": "Fig. 2 shows papers only from March-September 2023, and the survey covers roughly 25 papers total, yet claims to be a 'comprehensive' survey. Important prior work on hallucination predating 2023 receives minimal coverage." 339 }, 340 { 341 "flag": "Descriptive only, no synthesis", 342 "detail": "The paper summarizes each work individually without comparing approaches, identifying patterns, or drawing synthesized conclusions about which approaches actually work." 343 }, 344 { 345 "flag": "No limitations section", 346 "detail": "The survey does not discuss any limitations of its own approach, scope, or methodology." 347 } 348 ], 349 "cited_papers": [ 350 { 351 "title": "Survey of Hallucination in Natural Language Generation", 352 "authors": ["Ziwei Ji", "Nayeon Lee", "Rita Frieske"], 353 "year": 2023, 354 "relevance": "Prior comprehensive survey on hallucination in NLG, precursor to this multi-modal survey." 355 }, 356 { 357 "title": "Siren's Song in the AI Ocean: A Survey on Hallucination in Large Language Models", 358 "authors": ["Yue Zhang", "Yafu Li", "Leyang Cui"], 359 "year": 2023, 360 "arxiv_id": "2309.01219", 361 "relevance": "Concurrent survey on LLM hallucination, complementary to this multi-modal survey." 362 }, 363 { 364 "title": "HaluEval: A Large-Scale Hallucination Evaluation Benchmark for Large Language Models", 365 "authors": ["Junyi Li"], 366 "year": 2023, 367 "arxiv_id": "2305.11747", 368 "relevance": "Major hallucination evaluation benchmark for LLMs." 369 }, 370 { 371 "title": "SelfCheckGPT: Zero-Resource Black-Box Hallucination Detection for Generative Large Language Models", 372 "authors": ["Potsawee Manakul", "Adian Liusie", "Mark J. F.
Gales"], 373 "year": 2023, 374 "relevance": "Zero-resource hallucination detection method for LLMs, relevant to AI safety and reliability." 375 }, 376 { 377 "title": "Check Your Facts and Try Again: Improving Large Language Models with External Knowledge and Automated Feedback", 378 "authors": ["Baolin Peng", "Michel Galley"], 379 "year": 2023, 380 "arxiv_id": "2302.12813", 381 "relevance": "LLM-AUGMENTER system for mitigating hallucination using external knowledge and automated feedback." 382 }, 383 { 384 "title": "Evaluating Object Hallucination in Large Vision-Language Models", 385 "authors": ["Yifan Li", "Yifan Du", "Kun Zhou"], 386 "year": 2023, 387 "arxiv_id": "2305.10355", 388 "relevance": "Introduces POPE evaluation method for object hallucination in LVLMs." 389 }, 390 { 391 "title": "On the Opportunities and Risks of Foundation Models", 392 "authors": ["Rishi Bommasani"], 393 "year": 2021, 394 "arxiv_id": "2108.07258", 395 "relevance": "Foundational paper defining foundation models and their risks including hallucination." 396 }, 397 { 398 "title": "Med-HALT: Medical Domain Hallucination Test for Large Language Models", 399 "authors": ["Logesh Kumar Umapathi", "Ankit Pal", "Malaikannan Sankarasubbu"], 400 "year": 2023, 401 "arxiv_id": "2307.15343", 402 "relevance": "Domain-specific hallucination benchmark for medical LLMs, relevant to AI safety in high-stakes domains." 403 } 404 ] 405 }