scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18378B)
      1 {
      2   "paper": {
      3     "title": "Cracking the Code of Hallucination in LVLMs with Vision-aware Head Divergence",
      4     "authors": ["Jinghan He", "Kuan Zhu", "Haiyun Guo", "Junfeng Fang", "Zhenglin Hua", "Yuheng Jia", "Ming Tang", "Tat-Seng Chua", "Jinqiao Wang"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2412.13949"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub link provided in abstract: https://github.com/jinghan1he/VHR."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Uses publicly available benchmarks: CHAIR (MSCOCO), POPE, and LLaVA-Bench, all standard public datasets."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment specification found in the paper. Only model names are mentioned."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions in the paper. Implementation details section describes hyperparameters but not how to run the code."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Standard deviations are reported for CHAIR results (e.g., 37.76±2.76) but no confidence intervals. The ± notation represents std dev across 5 random splits, not confidence intervals."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No statistical significance tests are reported. Claims of superiority are based on comparing point estimates across methods without any formal tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported with baseline context, e.g., 'reductions of up to 16.36 in CHAIRS and 4.61 in CHAIRI on LLaVA-1.5' with full tables showing baseline values."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "500 images sampled from COCO validation set with no justification for why 500 was chosen. No power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Standard deviations reported for CHAIR results across 5 random splits (e.g., 37.76±2.76). Table 1 includes ± notation throughout."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Seven baselines compared: Greedy, Beam search, DoLa, VCD, OPERA, CODE, and EAH (Section 4.3)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include recent 2024 methods: CODE (Kim et al., 2024), EAH (Zhang et al., 2024), OPERA (Huang et al., 2024), VCD (Leng et al., 2024)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 4.6 provides ablation studies on adaptive head selection (fixed VHR), outlier removal, and number of reinforced layers (Table 4, Figure 5)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple metrics used: CHAIRS, CHAIRI, F1 on POPE, and accuracy/detailedness/naturalness on LLaVA-Bench."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the system's outputs. LLaVA-Bench uses GPT-4V as judge, which is automated, not human evaluation."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Uses established benchmarks (CHAIR on COCO validation set, POPE, LLaVA-Bench) that are separate from any training data."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "POPE results averaged over three splits (random, popular, adversarial). CHAIR reports both sentence-level and object-level metrics. LLaVA-Bench reports accuracy, detailedness, naturalness separately."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No systematic failure analysis. Only success cases shown in qualitative examples (Figure 7). No discussion of where VHR fails."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Ablation shows fixed VHR performs worse (Table 4). Figure 5 shows too many reinforced layers degrades quality. LLaVA-Bench shows slight decreases in naturalness for some models."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of 'superior performance compared to state-of-the-art' and 'negligible additional time overhead' are supported by Tables 1-3 and Figure 6."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about vision-aware heads driving hallucination are supported by ablation studies (Table 4) showing that fixed heads and including outliers degrade performance. The VHD analysis (Figure 3) provides correlational evidence linking low T-VHD to hallucinated tokens."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "Title and abstract make broad claims about 'LVLMs' but results are only on three 7B models (InstructBLIP-7b, LLaVA-1.5-7b, LLaVA-NeXT-7b). No testing on larger models or different architectures."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for why VHR works. The paper assumes language bias is the primary cause without considering other factors like vision encoder limitations or training data distribution."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Specific model versions given: InstructBLIP-7b, LLaVA-1.5-7b, LLaVA-NeXT-7b. These are specific enough to identify the exact models."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Exact prompts provided: 'Please describe this image in detail.' for CHAIR, and POPE uses 'Is there a <object> in the image?' format."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 4.4: α=2, last 14 layers for LLaVA, last 18 for InstructBLIP, max_new_token=512, beams=5 for beam search methods."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding used. VHR is a decoding-time intervention method, not an agentic system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Data preprocessing described: 500 images randomly sampled from COCO 2014 validation set, experiments repeated 5 times with different random seeds."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Dedicated 'Limitations' section after Section 6 discusses focus on attention mechanism and potential other architectural factors not addressed."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Limitations section identifies specific threats: focus only on multi-head attention mechanism, other components (vision encoder, FFN) may also contribute to hallucinations but were not addressed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statement about what the results do NOT show. The limitations section mentions future work but doesn't bound claims to tested models/settings."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental outputs or generated descriptions are made available. Only aggregated metrics reported."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection described: 500 images randomly sampled from COCO 2014 validation set, using established benchmark protocols for CHAIR, POPE, and LLaVA-Bench."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Uses standard benchmarks with automated evaluation."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Pipeline is straightforward and documented: sample images → generate descriptions with each method → evaluate with CHAIR/POPE/LLaVA-Bench metrics. Evaluation protocols are standard."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgements section lists funding: National Key R&D Program of China, National Natural Science Foundation of China, Beijing Municipal Science and Technology Project, and others."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations clearly listed: Chinese Academy of Sciences, University of Chinese Academy of Sciences, National University of Singapore, Southeast University, Wuhan AI Research."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funders are government research grants (NSFC, National Key R&D Program) with no financial stake in the specific outcome of this hallucination mitigation method."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No mention of training data cutoff dates for the LVLMs evaluated. The models (LLaVA, InstructBLIP) could have been trained on COCO data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether COCO validation images or POPE questions were in the training data of the evaluated models. LLaVA models are known to train on COCO-related data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "CHAIR uses MSCOCO (2014) and the evaluated models were trained after 2014. No discussion of contamination risk despite this being a known concern for COCO-based benchmarks."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Figure 6 provides detailed inference time comparison between VHR and all baseline methods, showing VHR adds negligible overhead."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No mention of total compute budget, GPU hardware used, or total experimental time."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "VHR achieves reductions of up to 16.36 in CHAIRS and 4.61 in CHAIRI on LLaVA-1.5 compared to greedy decoding",
    286       "evidence": "Table 1: LLaVA-1.5 greedy CHAIRS=49.68, VHR CHAIRS=33.32; greedy CHAIRI=14.32, VHR CHAIRI=9.71",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "VHR outperforms all existing decoding methods on POPE across all three LVLMs",
    291       "evidence": "Table 2: VHR achieves highest F1 scores on all three models (85.52, 85.47, 88.87)",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Hallucinated words correspond to lower T-VHD scores, linking language bias to hallucination",
    296       "evidence": "Figure 3 shows distributional separation between hallucinated and correct instances at both sentence and word levels on 500 COCO images",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "VHR introduces negligible additional time overhead compared to baseline methods",
    301       "evidence": "Figure 6 shows inference time comparison; VHR requires only one extra forward pass at the first generation step",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Adaptive per-sample head selection is essential; fixing heads degrades performance significantly",
    306       "evidence": "Table 4: fixed VHR shows CHAIRS increases from 33.32 to 44.72 on LLaVA-1.5",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "The paper introduces Vision-aware Head Divergence (VHD), a metric quantifying attention head sensitivity to visual context in LVLMs, revealing that only a few heads are vision-sensitive while others rely on language priors. Building on this, Vision-aware Head Reinforcement (VHR) amplifies vision-aware heads during generation, reducing hallucination by up to 16.36 CHAIRS points on LLaVA-1.5 while adding negligible inference overhead. VHR consistently outperforms seven baseline decoding methods across three LVLMs on CHAIR, POPE, and LLaVA-Bench benchmarks.",
    312   "red_flags": [
    313     {
    314       "flag": "Limited model diversity",
    315       "detail": "All three evaluated models are 7B parameter LVLMs. No evaluation on larger models (13B, 70B+) or different architectures (e.g., Qwen-VL, GPT-4V), limiting generalizability of claims about 'LVLMs' broadly."
    316     },
    317     {
    318       "flag": "No significance testing",
    319       "detail": "Claims of outperformance are based on comparing means without any statistical significance tests, despite standard deviations being available. Some improvements (e.g., POPE F1 differences of <1 point) may not be statistically significant."
    320     },
    321     {
    322       "flag": "Potential train-test contamination",
    323       "detail": "LLaVA models are trained on COCO-related data, and CHAIR evaluates on COCO validation set. No discussion of whether this overlap affects the evaluation or comparison fairness across methods."
    324     }
    325   ],
    326   "cited_papers": [
    327     {
    328       "title": "DoLa: Decoding by Contrasting Layers Improves Factuality in Large Language Models",
    329       "authors": ["Yung-Sung Chuang", "Yujia Xie", "Hongyin Luo", "Yoon Kim", "James Glass", "Pengcheng He"],
    330       "year": 2023,
    331       "arxiv_id": "2309.03883",
    332       "relevance": "Contrastive decoding baseline for reducing hallucination in LLMs, directly relevant to LLM reliability."
    333     },
    334     {
    335       "title": "Visual Contrastive Decoding",
    336       "authors": ["Sicong Leng"],
    337       "year": 2024,
    338       "relevance": "Contrastive decoding method for LVLM hallucination mitigation, key baseline in this evaluation."
    339     },
    340     {
    341       "title": "OPERA: Alleviating Hallucination in Multi-Modal Large Language Models via Over-Trust Penalty and Retrospection-Allocation",
    342       "authors": ["Qidong Huang"],
    343       "year": 2024,
    344       "relevance": "Beam search-based hallucination mitigation for LVLMs, representing decoding strategy approaches."
    345     },
    346     {
    347       "title": "Object Hallucination in Image Captioning",
    348       "authors": ["Anna Rohrbach"],
    349       "year": 2018,
    350       "relevance": "Foundational CHAIR benchmark for evaluating object hallucination, widely used in LLM/LVLM evaluation."
    351     },
    352     {
    353       "title": "POPE: Polling-based Object Probing Evaluation for Object Hallucination",
    354       "authors": ["Yifan Li"],
    355       "year": 2023,
    356       "relevance": "Standard hallucination evaluation benchmark for vision-language models."
    357     },
    358     {
    359       "title": "LLaVA: Visual Instruction Tuning",
    360       "authors": ["Haotian Liu"],
    361       "year": 2024,
    362       "relevance": "One of the most widely used open-source LVLMs, central to evaluating LLM-based multimodal systems."
    363     },
    364     {
    365       "title": "Visual Description Grounding Reduces Hallucinations and Boosts Reasoning in LVLMs",
    366       "authors": ["Sreyan Ghosh"],
    367       "year": 2024,
    368       "arxiv_id": "2405.15683",
    369       "relevance": "Addresses language bias in LVLMs through grounding, directly relevant to hallucination mitigation research."
    370     },
    371     {
    372       "title": "InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning",
    373       "authors": ["Wenliang Dai"],
    374       "year": 2023,
    375       "arxiv_id": "2305.06500",
    376       "relevance": "Major instruction-tuned vision-language model used as evaluation target."
    377     }
    378   ]
    379 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs