scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34133B)
      1 {
      2   "paper": {
      3     "title": "MM-RLHF: The Next Step Forward in Multimodal LLM Alignment",
      4     "authors": [
      5       "Yi-Fan Zhang",
      6       "Tao Yu",
      7       "Haochen Tian",
      8       "Chaoyou Fu",
      9       "Peiyan Li",
     10       "Jianshu Zeng",
     11       "Wulin Xie",
     12       "Yang Shi",
     13       "Huanyu Zhang",
     14       "Junkang Wu",
     15       "Xue Wang",
     16       "Yibo Hu",
     17       "Bin Wen",
     18       "Fan Yang",
     19       "Zhang Zhang",
     20       "Tingting Gao",
     21       "Di Zhang",
     22       "Liang Wang",
     23       "Rong Jin",
     24       "Tieniu Tan"
     25     ],
     26     "year": 2025,
     27     "venue": "arXiv",
     28     "arxiv_id": "2502.10391",
     29     "doi": "10.48550/arXiv.2502.10391"
     30   },
     31   "scan_version": 3,
     32   "active_modules": ["experimental_rigor", "data_leakage"],
     33   "methodology_tags": ["benchmark-eval"],
     34   "key_findings": "MM-RLHF introduces a 120k human-annotated multimodal preference dataset and a Critique-Based Reward Model that generates critiques before scoring, achieving SOTA among open-source 7B reward models and surpassing several 72B models. Combined with Dynamic Reward Scaling (MM-DPO), alignment training on LLaVA-OV-7B yields substantial gains in conversational benchmarks (~19.5%) and safety (~60% reduction in unsafe behavior) while also improving hallucination, math reasoning, and video understanding. The paper also finds that self-improvement of small-scale MLLMs (<7B) is currently unrealistic due to model capacity constraints and limitations in existing reward signal quality.",
     35   "checklist": {
     36     "artifacts": {
     37       "code_released": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper provides a project page URL (https://mm-rlhf.github.io/) but no explicit source code repository link (e.g., GitHub). No working code URL is provided in the paper text."
     41       },
     42       "data_released": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper describes the MM-RLHF dataset (120k pairs) in detail and provides a project page URL, but does not include an explicit dataset download link or statement that the dataset is publicly available for download."
     46       },
     47       "environment_specified": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper mentions '32×H800 (80G) GPUs' for hardware but provides no requirements.txt, Dockerfile, conda environment, or detailed library version listing."
     51       },
     52       "reproduction_instructions": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No step-by-step reproduction instructions, README commands, or replication scripts are provided in the paper or appendix."
     56       }
     57     },
     58     "statistical_methodology": {
     59       "confidence_intervals_or_error_bars": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Tables 2, 3, 4, and 5 report only point estimates. No confidence intervals, error bars, or ± notation is present for any experimental result."
     63       },
     64       "significance_tests": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper makes many comparative claims (e.g., '19.5% increase', 'surpassing several 72B models') based solely on comparing numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests)."
     68       },
     69       "effect_sizes_reported": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table 2 reports both baseline and aligned model scores with absolute differences (e.g., '+7.20', '+4.60'), providing sufficient context to assess magnitude. Table 3 similarly shows before/after differences for safety metrics."
     73       },
     74       "sample_size_justified": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No justification is provided for key sample size decisions: why 30k queries were sampled from 10M, why the 4:5:1 sampling ratio was chosen (beyond 'diversity goals'), or why 120k pairs are sufficient for alignment."
     78       },
     79       "variance_reported": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No standard deviation, variance, or spread measure is reported across experimental runs. Results appear to be single-run numbers with no indication of result stability."
     83       }
     84     },
     85     "evaluation_design": {
     86       "baselines_included": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Table 2 compares aligned models against their unaligned baselines (LLaVA-OV-7B, LLaVA-OV-0.5B, InternVL2-1B). Table 4 compares the reward model against LLaVA-OV-7B, LLaVA-Critic, and GPT-4o. Table 5 compares against 10+ open and closed-source models."
     90       },
     91       "baselines_contemporary": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Baselines include GPT-4o (2024-08-06), Claude-3.5-Sonnet (2024-06-22), Gemini-1.5-Pro (2024-09-24), Qwen2-VL-72B, InternVL2-26B — all contemporary state-of-the-art models."
     95       },
     96       "ablation_study": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Figure 12(a) ablates MM-RLHF dataset, implicit reward, and MM-DPO components. Table 4 ablates reward model components (w/o Task 1, w/o enhanced annotations). Figure 12(b) studies sensitivity to hyperparameters w and k."
    100       },
    101       "multiple_metrics": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper evaluates across 27 benchmarks spanning 10 dimensions: conversation, general knowledge, chart/document, OCR, real-world, math, hallucination, video, multi-image, and safety."
    105       },
    106       "human_evaluation": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "All system evaluation is automated (standard benchmarks + GPT-4o as judge). The human annotation is used for dataset creation, not for evaluating the aligned model's outputs. Section 5.1 states 'For all benchmarks requiring GPT-assisted evaluation, we consistently employ GPT-4o as the evaluation model.'"
    110       },
    111       "held_out_test_set": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Evaluation uses established benchmarks (POPE, MME, MathVista, etc.) with standard test splits. MM-RLHF-RewardBench is separately sampled from the dataset (Section 3.2: 'we randomly sample 10 examples from each category'). Training data is distinct from evaluation benchmarks."
    115       },
    116       "per_category_breakdown": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Table 2 provides per-benchmark results across 8 evaluation dimensions. Table 3 provides per-task safety breakdowns. Table 4 provides per-category reward model accuracy (MCQ, Long, Short, Safety, Video)."
    120       },
    121       "failure_cases_discussed": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Section 5.2 discusses 'Limited gains in high-resolution benchmarks' with specific analysis of why. Section 5.4 discusses why self-improvement fails for small MLLMs. Appendix D shows failure examples from annotation (Figures 8-11)."
    125       },
    126       "negative_results_reported": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Multiple negative results: high-resolution benchmarks show no improvement or decreases (Table 2, e.g., MME-RealWorld cn -0.62%); self-improvement is 'currently unrealistic' (Section 5.4); multiple sampling of critiques 'does not yield significant performance gains' (Section 5.3); implicit reward strategy doesn't work for MLLMs (Appendix E)."
    130       }
    131     },
    132     "claims_and_evidence": {
    133       "abstract_claims_supported": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "The abstract's claim of '19.5% increase in conversational abilities' for LLaVA-OV-7B is supported by Table 2's conversation metrics showing substantial gains. The '60% improvement in safety' aligns with Table 3, which shows the unsafety rate dropping from 40.2% to 13.9% for LLaVA-OV-7B (a relative reduction of roughly 65%). The body's average figures (11%, 57%) are averages across all evaluated models, which is a different but consistent scope."
    137       },
    138       "causal_claims_justified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper makes causal claims ('alignment training leads to improvements', 'Dynamic Reward Scaling enhances MM-DPO') supported by ablation studies: Figure 12(a) shows incremental contributions of dataset, reward model, and DPO algorithm via controlled single-variable manipulation."
    142       },
    143       "generalization_bounded": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The title 'The Next Step Forward in Multimodal LLM Alignment' implies broad generality, but experiments cover only 3 base models (LLaVA-OV-7B, LLaVA-OV-0.5B, InternVL2-1B), all ≤7B parameters. No explicit bounding to these specific model families or scales. The paper does not discuss whether results generalize to larger models or different architectures."
    147       },
    148       "alternative_explanations_discussed": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No systematic consideration of alternative explanations for the main results. Could improvements stem from the additional SFT loss component? From exposure to more diverse data during alignment rather than the alignment algorithm itself? Section 5.2 notes model-specific preferences but doesn't discuss confounds."
    152       },
    153       "proxy_outcome_distinction": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "The paper measures benchmark scores and frames them as 'alignment with human preferences', 'conversational abilities', and 'safety improvement.' No discussion of whether benchmark performance on automated metrics (many GPT-4o-judged) is a faithful proxy for actual human preference alignment. The gap between automated benchmark scores and genuine human satisfaction is not acknowledged."
    157       }
    158     },
    159     "setup_transparency": {
    160       "model_versions_specified": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Table 5 specifies model versions with dates: 'GPT-4o (2024-08-06)', 'Claude-3.5-Sonnet (2024-06-22)', 'GPT-4o-mini (2024-07-18)', 'Gemini-1.5-Pro (2024-09-24)'. Open-source models are specified by name and size (e.g., 'Qwen2-VL-72B-Instruct', 'LLaVA-OV-7B')."
    164       },
    165       "prompts_provided": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "Table 7 shows one prompt (for augmenting human annotations). Appendix B provides annotation guidelines. However, the prompts used for model response generation, GPT-4o evaluation, and safety data construction are not provided. The reader cannot reconstruct the full prompting setup."
    169       },
    170       "hyperparameters_reported": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5.1 reports: SFT loss weight search over {0, 0.1, 0.25, 0.5, 1.0}, learning rate search over {1e-7, 5e-7, 1e-6, 5e-6, 1e-5}, β_ori = 0.1, default w = 0.5, k = 0.5. Vision encoder frozen. Maximum 8 samples for self-improvement experiments."
    174       },
    175       "scaffolding_described": {
    176         "applies": false,
    177         "answer": false,
    178         "justification": "The paper does not use agentic scaffolding. It focuses on dataset construction, reward modeling, and alignment training, none of which involve agent-based workflows."
    179       },
    180       "data_preprocessing_documented": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 2.2 documents data filtering: predefined sampling weights across 3 categories (MCQ 4.14%, Long 12.17%, Short 83.68% → adjusted ratio 4:5:1), CLIP-based KNN clustering with 100 cluster centers, and deduplication strategy. Table 1 shows final composition. Figure 1 illustrates the full pipeline."
    184       }
    185     },
    186     "limitations_and_scope": {
    187       "limitations_section_present": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Section 6 ('Conclusion and Future Work') briefly mentions underutilized annotation granularity and high-resolution data limitations, but this is primarily forward-looking rather than a substantive limitations discussion."
    191       },
    192       "threats_to_validity_specific": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Several study-specific threats are discussed, though scattered: 'our dataset contains relatively few ultra-high-resolution images' (Section 5.2), 'our filtering strategy is based on image similarity rather than resolution' (Section 5.2), 'the cost of human annotation poses scalability challenges' (Section 2.3.2), and the finding that implicit reward doesn't transfer from LLMs to MLLMs (Appendix E)."
    196       },
    197       "scope_boundaries_stated": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The paper does not explicitly state what the results do NOT show. There is no systematic discussion of excluded populations/settings, untested model scales, or claims the authors are NOT making. The broad title suggests general applicability without explicit bounding."
    201       }
    202     },
    203     "data_integrity": {
    204       "raw_data_available": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The project page (https://mm-rlhf.github.io/) is referenced but no explicit raw data download link is provided in the paper. The 120k annotated pairs, raw annotations, and scoring details are not confirmed as publicly downloadable."
    208       },
    209       "data_collection_described": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Section 2.1 describes data sources (LLaVA-OV, VLfeedback, LLaVA-RLHF, lrv-instruction, Unimm-Chat, SharedGPT-4 video, VLGuard). Section 2.2 describes filtering. Section 2.3 describes annotation procedures including timing ('average of over 8 minutes' per question) and quality control."
    213       },
    214       "recruitment_methods_described": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Section 2.3.2 states 'over 50 annotators, supported by 8 multimodal research experts with strong English proficiency and academic backgrounds' but does not describe how these annotators were recruited — via what platform, institution, or selection process. The recruitment channel and potential selection bias are not discussed."
    218       },
    219       "data_pipeline_documented": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "The full pipeline is documented in Figure 1 and Sections 2.1-2.3: 10M instruction samples → clustering/deduplication/sampling → ~30k queries → response generation with 4 SOTA models → human annotation → 120k comparison pairs. Table 1 provides final composition counts. Appendix C details safety data construction."
    223       }
    224     },
    225     "conflicts_of_interest": {
    226       "funding_disclosed": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No funding acknowledgments section is present in the paper. The note mentions 'Work done during an internship at KuaiShou Group' but no grants or funding sources are disclosed."
    230       },
    231       "affiliations_disclosed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "Author affiliations are clearly listed: KuaiShou, CASIA, NJU, USTC, PKU, Alibaba, Meta AI. The note states 'Work done during an internship at KuaiShou Group.'"
    235       },
    236       "funder_independent_of_outcome": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "KuaiShou (a Chinese tech company that develops short-video platforms and apps) has a potential commercial interest in improved MLLM alignment for its products. The work was done at KuaiShou (4 authors affiliated), and no statement of funder independence is provided."
    240       },
    241       "financial_interests_declared": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No competing interests or financial interests statement is present in the paper. Authors from KuaiShou, Alibaba, and Meta AI may have commercial interests related to the findings."
    245       }
    246     },
    247     "contamination": {
    248       "training_cutoff_stated": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No training data cutoff dates are stated for the base models (LLaVA-OV-7B, InternVL2-1B) or for the models used in response generation (GPT-4o, Claude 3.5 Sonnet, Qwen2-VL-72B). Without these cutoff dates, it is not possible to assess whether benchmark examples could have been in pre-training data."
    252       },
    253       "train_test_overlap_discussed": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "No analysis of whether benchmark test examples (POPE, MME, MathVista, etc.) appeared in the pre-training data of the evaluated models. No overlap analysis or decontamination is mentioned."
    257       },
    258       "benchmark_contamination_addressed": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "Many benchmarks used (POPE 2023, MME 2023, TextVQA 2019, VQAv2 2017) predate the models' likely training cutoffs. No discussion of contamination risk for these established benchmarks."
    262       }
    263     },
    264     "human_studies": {
    265       "pre_registered": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "The paper has no human participants as research subjects. Human annotators are workers creating the dataset, not study participants."
    269       },
    270       "irb_or_ethics_approval": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human subjects research is conducted. Annotators create training data but are not research participants."
    274       },
    275       "demographics_reported": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human subjects study. The annotators are described as having 'strong English proficiency and academic backgrounds' but this is workforce description, not participant demographics."
    279       },
    280       "inclusion_exclusion_criteria": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human subjects study is conducted."
    284       },
    285       "randomization_described": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human subjects study is conducted."
    289       },
    290       "blinding_described": {
    291         "applies": false,
    292         "answer": false,
    293         "justification": "No human subjects study is conducted."
    294       },
    295       "attrition_reported": {
    296         "applies": false,
    297         "answer": false,
    298         "justification": "No human subjects study is conducted."
    299       }
    300     },
    301     "cost_and_practicality": {
    302       "inference_cost_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No inference cost, API costs, tokens consumed, or per-example latency is reported for the alignment training, reward model, or the aligned models."
    306       },
    307       "compute_budget_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper mentions '32×H800 (80G) GPUs' as hardware but does not state GPU hours, wall-clock training time, or total compute budget. The annotation cost ('over 8 minutes per question' × 30k queries) is partially described but total cost is not quantified."
    311       }
    312     },
    313     "experimental_rigor": {
    314       "seed_sensitivity_reported": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No results across multiple random seeds are reported. All results appear to be from single runs with no seed sensitivity analysis."
    318       },
    319       "number_of_runs_stated": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The number of experimental runs is never explicitly stated. Results are presented without indicating how many runs produced them."
    323       },
    324       "hyperparameter_search_budget": {
    325         "applies": true,
    326         "answer": true,
    327         "justification": "Section 5.1 specifies the search space: grid search over SFT loss weight {0, 0.1, 0.25, 0.5, 1.0} and learning rate {1e-7, 5e-7, 1e-6, 5e-6, 1e-5}. Figure 12(b) shows the w × k search grid. The method (grid search) and space (25 configurations for main search) are clear."
    328       },
    329       "best_config_selection_justified": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "The paper mentions selecting the 'best-performing configuration' from the grid search but does not specify which metric or validation set was used for selection. The final hyperparameter choices are not justified beyond 'default values of w = 0.5 and k = 0.5 work well.'"
    333       },
    334       "multiple_comparison_correction": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper makes comparisons across 27 benchmarks, 10 dimensions, and multiple models without any correction for multiple comparisons (no Bonferroni, Holm, or similar corrections)."
    338       },
    339       "self_comparison_bias_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The authors evaluate their own MM-RLHF dataset, MM-RLHF-Reward model, and MM-DPO algorithm without acknowledging self-evaluation bias. Their reward model is used to score pairs that then train their own alignment method."
    343       },
    344       "compute_budget_vs_performance": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No performance-vs-compute analysis is provided. The paper does not discuss whether the improvements justify the compute cost, or compare methods at matched compute budgets."
    348       },
    349       "benchmark_construct_validity": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The paper uses 27 benchmarks without questioning whether they measure what is claimed. For example, GPT-4o-judged conversation benchmarks may not reflect actual human preference alignment, but this validity gap is not discussed."
    353       },
    354       "scaffold_confound_addressed": {
    355         "applies": false,
    356         "answer": false,
    357         "justification": "No scaffolding comparisons are made. The paper compares alignment methods applied to the same base models, not different scaffolds."
    358       }
    359     },
    360     "data_leakage": {
    361       "temporal_leakage_addressed": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No discussion of temporal leakage. Many benchmarks (VQAv2 from 2017, POPE from 2023, etc.) predate the models' training, but the paper does not address whether solutions appeared in training data."
    365       },
    366       "feature_leakage_addressed": {
    367         "applies": true,
    368         "answer": false,
    369         "justification": "No discussion of whether the evaluation setup leaks information. For instance, whether the alignment training data overlaps with any evaluation benchmarks is not analyzed."
    370       },
    371       "non_independence_addressed": {
    372         "applies": true,
    373         "answer": false,
    374         "justification": "No discussion of independence between training and test data. The MM-RLHF training data sources overlap with some evaluation benchmark sources (both draw from LLaVA and similar multimodal datasets), but this is not addressed."
    375       },
    376       "leakage_detection_method": {
    377         "applies": true,
    378         "answer": false,
    379         "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines)."
    380       }
    381     }
    382   },
    383   "claims": [
    384     {
    385       "claim": "Fine-tuning LLaVA-OV-7B with MM-RLHF leads to a 19.5% increase in conversational abilities and a 60% improvement in safety.",
    386       "evidence": "Table 2 shows substantial conversation improvements (e.g., WildVision win rate +22.0, LLaVA-Wild detail +12.20). Table 3 shows unsafety rate dropping from 40.2% to 13.9% for LLaVA-OV-7B. However, the exact '19.5%' calculation methodology is not specified.",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "MM-RLHF-Reward-7B achieves SOTA performance among open-source reward models, surpassing several 72B-scale models.",
    391       "evidence": "Table 5 shows MM-RLHF-Reward-7B (50.15 avg) outperforming Qwen2-VL-72B-Instruct (42.97), NVLM-D-72B (44.17), and InternVL2-26B (45.67) on VLRewardBench. Table 4 shows 85% ACC on MM-RLHF-RewardBench.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "Critique-based training significantly enhances reward model quality: ACC+ improves from 50% to 67%.",
    396       "evidence": "Table 4 shows progressive improvement: w/o Task 1 (50% ACC+) → w/o enhanced annotations (57% ACC+) → full model (67% ACC+) → with GT annotations (87% ACC+). Controlled ablation on the same dataset and base model.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Dynamic Reward Scaling (MM-DPO) further improves alignment performance beyond standard DPO.",
    401       "evidence": "Figure 12(a) shows average score increasing from 74.9 (MM-RLHF + standard DPO) to 76.5 (MM-DPO) on the 4-benchmark evaluation. However, ablation uses 1/5 of data, and the improvement margin is modest.",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "Self-improvement of small-scale MLLMs (<7B parameters) is currently unrealistic for comprehensive performance gains.",
    406       "evidence": "Figure 6 shows self-sampled data with the best available reward model (73.9 avg) significantly underperforms human-annotated data (75.0 avg). Section 5.4 discusses model capacity constraints and reward signal quality limitations.",
    407       "supported": "moderate"
    408     },
    409     {
    410       "claim": "Alignment training improves multi-image and video understanding despite no dedicated multi-image data in the training set.",
    411       "evidence": "Table 2 shows improvements for LLaVA-OV-7B: MMMU-Pro +1.33%, LLaVA-NeXT-Interleave +0.27%, VideoChatGPT +0.35. Improvements are modest and lack error bars to confirm they exceed noise.",
    412       "supported": "weak"
    413     },
    414     {
    415       "claim": "Existing multimodal reward models (e.g., LLaVA-Critic) exhibit significant overfitting to their training data domains.",
    416       "evidence": "Table 4 shows LLaVA-Critic achieving only 45% ACC (pointwise) and 35% ACC (pairwise) vs. GPT-4o's 74% on MM-RLHF-RewardBench. Section 5.3 attributes this to training data limited to 'conversational datasets and real-world images.'",
    417       "supported": "moderate"
    418     }
    419   ],
    420   "red_flags": [
    421     {
    422       "flag": "No error bars or uncertainty quantification",
    423       "detail": "All results across 27 benchmarks, 3 models, and multiple ablations are point estimates. No variance, standard deviation, confidence intervals, or significance tests are reported, making it impossible to assess whether observed differences are statistically meaningful."
    424     },
    425     {
    426       "flag": "Self-evaluation bias",
    427       "detail": "The authors evaluate their own dataset (MM-RLHF), reward model (MM-RLHF-Reward), alignment algorithm (MM-DPO), and benchmarks (MM-RLHF-RewardBench, MM-RLHF-SafetyBench). The reward model used to score training pairs for MM-DPO is trained on the same dataset. No independent evaluation is conducted."
    428     },
    429     {
    430       "flag": "No contamination analysis",
    431       "detail": "Many evaluation benchmarks (VQAv2, POPE, TextVQA, etc.) substantially predate the models used. No training cutoff dates are stated for any model, and no contamination analysis is performed despite evaluating on 27 established benchmarks."
    432     },
    433     {
    434       "flag": "Ablation on reduced data",
    435       "detail": "Section 5.1 states that, for ablation studies, the authors 'uniformly sample 1/5 of the data, which may result in minor performance discrepancies compared to the full dataset.' This weakens the ablation conclusions as they may not reflect the full-data setting."
    436     },
    437     {
    438       "flag": "Circular evaluation methodology",
    439       "detail": "MM-RLHF-Reward is trained on MM-RLHF data, then used to provide reward signals for MM-DPO training on MM-RLHF data. MM-RLHF-RewardBench is sampled from MM-RLHF data. This circular dependency could inflate performance metrics — the system is partially evaluated on data it was designed around."
    440     },
    441     {
    442       "flag": "Undisclosed conflicts of interest",
    443       "detail": "Four authors are from KuaiShou (a large Chinese tech company), and the work was done during an internship there. No funding disclosure, competing interests statement, or acknowledgment of commercial interest in improved MLLM alignment is provided."
    444     }
    445   ],
    446   "cited_papers": [
    447     {
    448       "title": "Aligning large multimodal models with factually augmented rlhf",
    449       "authors": ["Zhiqing Sun", "Sheng Shen", "Shengcao Cao", "Haotian Liu"],
    450       "year": 2023,
    451       "arxiv_id": "2309.14525",
    452       "relevance": "First multimodal RLHF algorithm (Fact-RLHF), pioneering work on aligning MLLMs with human feedback that this paper directly builds upon."
    453     },
    454     {
    455       "title": "LLaVA-Critic: Learning to Evaluate Multimodal Models",
    456       "authors": ["Tianyi Xiong", "Xiyao Wang", "Dong Guo"],
    457       "year": 2024,
    458       "arxiv_id": "2410.02712",
    459       "relevance": "Key baseline for MLLM self-evaluation and reward modeling, demonstrating iterative DPO strategy for MLLM alignment."
    460     },
    461     {
    462       "title": "RLHF-V: Towards Trustworthy MLLMs via Behavior Alignment from Fine-grained Correctional Human Feedback",
    463       "authors": ["Tianyu Yu", "Yuan Yao", "Haoye Zhang"],
    464       "year": 2024,
    465       "relevance": "Prior work on fine-grained human feedback for MLLM alignment, addressing hallucination through behavioral alignment."
    466     },
    467     {
    468       "title": "RLAIF-V: Aligning MLLMs through Open-Source AI Feedback for Super GPT-4V Trustworthiness",
    469       "authors": ["Tianyu Yu", "Haoye Zhang", "Yuan Yao"],
    470       "year": 2024,
    471       "arxiv_id": "2405.17220",
    472       "relevance": "Explores AI feedback as an alternative to human feedback for MLLM alignment, directly relevant to the human vs. machine annotation debate."
    473     },
    474     {
    475       "title": "Silkie: Preference Distillation for Large Visual Language Models",
    476       "authors": ["Lei Li", "Zhihui Xie", "Mukai Li"],
    477       "year": 2023,
    478       "arxiv_id": "2312.10665",
    479       "relevance": "VLFeedback dataset used as a data source in MM-RLHF construction; explores AI-generated preference data for VLM alignment."
    480     },
    481     {
    482       "title": "Direct Preference Optimization with an Offset",
    483       "authors": ["Afra Amini", "Tim Vieira", "Ryan Cotterell"],
    484       "year": 2024,
    485       "arxiv_id": "2402.10571",
    486       "relevance": "DPO variant that adjusts optimization based on sample quality — related to MM-DPO's Dynamic Reward Scaling approach."
    487     },
    488     {
    489       "title": "Beta-DPO: Direct Preference Optimization with Dynamic Beta",
    490       "authors": ["Junkang Wu", "Yuexiang Xie", "Zhengyi Yang"],
    491       "year": 2024,
    492       "arxiv_id": "2407.08639",
    493       "relevance": "Prior work on dynamic beta adjustment in DPO for LLMs, which the paper shows does not directly transfer to MLLMs."
    494     },
    495     {
    496       "title": "Self-generated critiques boost reward modeling for language models",
    497       "authors": ["Yue Yu", "Zhengxing Chen", "Aston Zhang"],
    498       "year": 2024,
    499       "arxiv_id": "2411.16646",
    500       "relevance": "Explores critique-based reward models in LLMs; this paper extends the concept to multimodal settings with distinct design choices."
    501     },
    502     {
    503       "title": "Safety fine-tuning at (almost) no cost: A baseline for vision large language models",
    504       "authors": ["Yongshuo Zong", "Ondrej Bohdal", "Tingyang Yu"],
    505       "year": 2024,
    506       "arxiv_id": "2402.02207",
    507       "relevance": "VLGuard dataset used for safety data construction in MM-RLHF; establishes safety fine-tuning baselines for VLMs."
    508     },
    509     {
    510       "title": "VLRewardBench: A Challenging Benchmark for Vision-Language Generative Reward Models",
    511       "authors": ["Lei Li", "Yuancheng Wei", "Zhihui Xie"],
    512       "year": 2024,
    513       "arxiv_id": "2411.17451",
    514       "relevance": "Benchmark for evaluating multimodal reward models, used as one of the evaluation settings for MM-RLHF-Reward."
    515     },
    516     {
    517       "title": "Provably Robust DPO: Aligning Language Models with Noisy Feedback",
    518       "authors": ["Sayak Ray Chowdhury", "Anush Kini", "Nagarajan Natarajan"],
    519       "year": 2024,
    520       "arxiv_id": "2403.00409",
    521       "relevance": "Addresses robustness of DPO to noisy preference data, related to MM-DPO's approach of weighting samples by quality."
    522     },
    523     {
    524       "title": "Red Teaming Visual Language Models",
    525       "authors": ["Mukai Li", "Lei Li", "Yuwei Yin"],
    526       "year": 2024,
    527       "arxiv_id": "2401.12915",
    528       "relevance": "Red teaming approach for VLMs, relevant to safety evaluation and adversarial robustness of multimodal models."
    529     }
    530   ],
    531   "engagement_factors": {
    532     "practical_relevance": {
    533       "score": 2,
    534       "justification": "Provides a concrete dataset and training recipe for MLLM alignment that practitioners could adopt, though implementation requires significant compute resources."
    535     },
    536     "surprise_contrarian": {
    537       "score": 1,
    538       "justification": "Challenges the view that alignment only helps specific tasks and shows self-improvement doesn't work for small MLLMs, but the main results (RLHF helps) are largely expected."
    539     },
    540     "fear_safety": {
    541       "score": 1,
    542       "justification": "Includes a safety alignment component and safety benchmark (jailbreaking, adversarial attacks), but the focus is on improving safety rather than demonstrating novel attacks."
    543     },
    544     "drama_conflict": {
    545       "score": 0,
    546       "justification": "No controversy or conflict angle; presents straightforward methodology improvement results."
    547     },
    548     "demo_ability": {
    549       "score": 1,
    550       "justification": "Project page exists (https://mm-rlhf.github.io/) but no pip-installable tool or live demo is available; reproducing requires substantial infrastructure."
    551     },
    552     "brand_recognition": {
    553       "score": 1,
    554       "justification": "Authors from KuaiShou, CASIA, and Meta AI have some recognition but are not among the most prominent Western AI labs; MM-RLHF itself is new."
    555     }
    556   }
    557 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs