scan.json (18378B)
1 { 2 "paper": { 3 "title": "Cracking the Code of Hallucination in LVLMs with Vision-aware Head Divergence", 4 "authors": ["Jinghan He", "Kuan Zhu", "Haiyun Guo", "Junfeng Fang", "Zhenglin Hua", "Yuheng Jia", "Ming Tang", "Tat-Seng Chua", "Jinqiao Wang"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2412.13949" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub link provided in abstract: https://github.com/jinghan1he/VHR." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Uses publicly available benchmarks: CHAIR (MSCOCO), POPE, and LLaVA-Bench, all standard public datasets." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment specification found in the paper. Only model names are mentioned." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions in the paper. Implementation details section describes hyperparameters but not how to run the code." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Standard deviations are reported for CHAIR results (e.g., 37.76±2.76) but no confidence intervals. The ± notation represents std dev across 5 random splits, not confidence intervals." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "No statistical significance tests are reported. Claims of superiority are based on comparing point estimates across methods without any formal tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Effect sizes are reported with baseline context, e.g., 'reductions of up to 16.36 in CHAIRS and 4.61 in CHAIRI on LLaVA-1.5' with full tables showing baseline values." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "500 images sampled from COCO validation set with no justification for why 500 was chosen. No power analysis." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Standard deviations reported for CHAIR results across 5 random splits (e.g., 37.76±2.76). Table 1 includes ± notation throughout." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Seven baselines compared: Greedy, Beam search, DoLa, VCD, OPERA, CODE, and EAH (Section 4.3)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include recent 2024 methods: CODE (Kim et al., 2024), EAH (Zhang et al., 2024), OPERA (Huang et al., 2024), VCD (Leng et al., 2024)." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 4.6 provides ablation studies on adaptive head selection (fixed VHR), outlier removal, and number of reinforced layers (Table 4, Figure 5)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple metrics used: CHAIRS, CHAIRI, F1 on POPE, and accuracy/detailedness/naturalness on LLaVA-Bench." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of the system's outputs. LLaVA-Bench uses GPT-4V as judge, which is automated, not human evaluation." 
84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Uses established benchmarks (CHAIR on COCO validation set, POPE, LLaVA-Bench) that are separate from any training data." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "POPE results averaged over three splits (random, popular, adversarial). CHAIR reports both sentence-level and object-level metrics. LLaVA-Bench reports accuracy, detailedness, naturalness separately." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": false, 98 "justification": "No systematic failure analysis. Only success cases shown in qualitative examples (Figure 7). No discussion of where VHR fails." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Ablation shows fixed VHR performs worse (Table 4). Figure 5 shows too many reinforced layers degrades quality. LLaVA-Bench shows slight decreases in naturalness for some models." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of 'superior performance compared to state-of-the-art' and 'negligible additional time overhead' are supported by Tables 1-3 and Figure 6." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims about vision-aware heads driving hallucination are supported by ablation studies (Table 4) showing that fixed heads and including outliers degrade performance. The VHD analysis (Figure 3) provides correlational evidence linking low T-VHD to hallucinated tokens." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "Title and abstract make broad claims about 'LVLMs' but results are only on three 7B models (InstructBLIP-7b, LLaVA-1.5-7b, LLaVA-NeXT-7b). No testing on larger models or different architectures." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for why VHR works. The paper assumes language bias is the primary cause without considering other factors like vision encoder limitations or training data distribution." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Specific model versions given: InstructBLIP-7b, LLaVA-1.5-7b, LLaVA-NeXT-7b. These are specific enough to identify the exact models." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Exact prompts provided: 'Please describe this image in detail.' for CHAIR, and POPE uses 'Is there a <object> in the image?' format." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 4.4: α=2, last 14 layers for LLaVA, last 18 for InstructBLIP, max_new_token=512, beams=5 for beam search methods." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding used. VHR is a decoding-time intervention method, not an agentic system." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Data preprocessing described: 500 images randomly sampled from COCO 2014 validation set, experiments repeated 5 times with different random seeds." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Dedicated 'Limitations' section after Section 6 discusses focus on attention mechanism and potential other architectural factors not addressed." 
160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Limitations section identifies specific threats: focus only on multi-head attention mechanism, other components (vision encoder, FFN) may also contribute to hallucinations but were not addressed." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit statement about what the results do NOT show. The limitations section mentions future work but doesn't bound claims to tested models/settings." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw experimental outputs or generated descriptions are made available. Only aggregated metrics reported." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Data collection described: 500 images randomly sampled from COCO 2014 validation set, using established benchmark protocols for CHAIR, POPE, and LLaVA-Bench." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. Uses standard benchmarks with automated evaluation." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Pipeline is straightforward and documented: sample images → generate descriptions with each method → evaluate with CHAIR/POPE/LLaVA-Bench metrics. Evaluation protocols are standard." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgements section lists funding: National Key R&D Program of China, National Natural Science Foundation of China, Beijing Municipal Science and Technology Project, and others." 
199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations clearly listed: Chinese Academy of Sciences, University of Chinese Academy of Sciences, National University of Singapore, Southeast University, Wuhan AI Research." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funders are government research grants (NSFC, National Key R&D Program) with no financial stake in the specific outcome of this hallucination mitigation method." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement found in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No mention of training data cutoff dates for the LVLMs evaluated. The models (LLaVA, InstructBLIP) could have been trained on COCO data." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether COCO validation images or POPE questions were in the training data of the evaluated models. LLaVA models are known to train on COCO-related data." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "CHAIR uses MSCOCO (2014) and the evaluated models were trained after 2014. No discussion of contamination risk despite this being a known concern for COCO-based benchmarks." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 
243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Figure 6 provides detailed inference time comparison between VHR and all baseline methods, showing VHR adds negligible overhead." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No mention of total compute budget, GPU hardware used, or total experimental time." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "VHR achieves reductions of up to 16.36 in CHAIRS and 4.61 in CHAIRI on LLaVA-1.5 compared to greedy decoding", 286 "evidence": "Table 1: LLaVA-1.5 greedy CHAIRS=49.68, VHR CHAIRS=33.32; greedy CHAIRI=14.32, VHR CHAIRI=9.71", 287 "supported": "strong" 288 }, 289 { 290 "claim": "VHR outperforms all existing decoding methods on POPE across all three LVLMs", 291 "evidence": "Table 2: VHR achieves highest F1 scores on all three models (85.52, 85.47, 88.87)", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Hallucinated words correspond to lower T-VHD scores, linking language bias to hallucination", 296 "evidence": "Figure 3 shows distributional separation between hallucinated and correct instances at both sentence and word levels on 500 COCO images", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "VHR introduces negligible additional time overhead compared to baseline methods", 301 "evidence": "Figure 6 shows inference time comparison; VHR requires only one extra forward pass at the first generation step", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Adaptive per-sample head selection is essential; fixing heads degrades performance significantly", 306 "evidence": "Table 4: fixed VHR shows CHAIRS increases from 33.32 to 44.72 on LLaVA-1.5", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "The paper introduces Vision-aware Head Divergence (VHD), a metric quantifying attention head sensitivity to visual context in LVLMs, revealing that only a few heads are vision-sensitive while others rely on language priors. Building on this, Vision-aware Head Reinforcement (VHR) amplifies vision-aware heads during generation, reducing hallucination by up to 16.36 CHAIRS points on LLaVA-1.5 while adding negligible inference overhead. 
VHR consistently outperforms seven baseline decoding methods across three LVLMs on CHAIR, POPE, and LLaVA-Bench benchmarks.", 312 "red_flags": [ 313 { 314 "flag": "Limited model diversity", 315 "detail": "All three evaluated models are 7B parameter LVLMs. No evaluation on larger models (13B, 70B+) or different architectures (e.g., Qwen-VL, GPT-4V), limiting generalizability of claims about 'LVLMs' broadly." 316 }, 317 { 318 "flag": "No significance testing", 319 "detail": "Claims of outperformance are based on comparing means without any statistical significance tests, despite standard deviations being available. Some improvements (e.g., POPE F1 differences of <1 point) may not be statistically significant." 320 }, 321 { 322 "flag": "Potential train-test contamination", 323 "detail": "LLaVA models are trained on COCO-related data, and CHAIR evaluates on COCO validation set. No discussion of whether this overlap affects the evaluation or comparison fairness across methods." 324 } 325 ], 326 "cited_papers": [ 327 { 328 "title": "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language Models", 329 "authors": ["Yung-Sung Chuang", "Yujia Xie", "Hongyin Luo", "Yoon Kim", "James Glass", "Pengcheng He"], 330 "year": 2023, 331 "arxiv_id": "2309.03883", 332 "relevance": "Contrastive decoding baseline for reducing hallucination in LLMs, directly relevant to LLM reliability." 333 }, 334 { 335 "title": "Visual Contrastive Decoding", 336 "authors": ["Sicong Leng"], 337 "year": 2024, 338 "relevance": "Contrastive decoding method for LVLM hallucination mitigation, key baseline in this evaluation." 339 }, 340 { 341 "title": "OPERA: Alleviating Hallucination in Multi-Modal Large Language Models via Over-Trust Penalty and Retrospection-Allocation", 342 "authors": ["Qidong Huang"], 343 "year": 2024, 344 "relevance": "Beam search-based hallucination mitigation for LVLMs, representing decoding strategy approaches." 
345 }, 346 { 347 "title": "Object Hallucination in Image Captioning", 348 "authors": ["Anna Rohrbach"], 349 "year": 2018, 350 "relevance": "Foundational CHAIR benchmark for evaluating object hallucination, widely used in LLM/LVLM evaluation." 351 }, 352 { 353 "title": "POPE: Polling-based Object Probing Evaluation for Object Hallucination", 354 "authors": ["Yifan Li"], 355 "year": 2023, 356 "relevance": "Standard hallucination evaluation benchmark for vision-language models." 357 }, 358 { 359 "title": "LLaVA: Visual Instruction Tuning", 360 "authors": ["Haotian Liu"], 361 "year": 2024, 362 "relevance": "One of the most widely used open-source LVLMs, central to evaluating LLM-based multimodal systems." 363 }, 364 { 365 "title": "Visual Description Grounding Reduces Hallucinations and Boosts Reasoning in LVLMs", 366 "authors": ["Sreyan Ghosh"], 367 "year": 2024, 368 "arxiv_id": "2405.15683", 369 "relevance": "Addresses language bias in LVLMs through grounding, directly relevant to hallucination mitigation research." 370 }, 371 { 372 "title": "InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning", 373 "authors": ["Wenliang Dai"], 374 "year": 2023, 375 "arxiv_id": "2305.06500", 376 "relevance": "Major instruction-tuned vision-language model used as evaluation target." 377 } 378 ] 379 }