scan.json (18174B)
1 { 2 "paper": { 3 "title": "CuMo: Scaling Multimodal LLM with Co-Upcycled Mixture-of-Experts", 4 "authors": ["Jiachen Li", "Xinyao Wang", "Sijie Zhu", "Chia-Wen Kuo", "Lu Xu", "Fan Chen", "Jitesh Jain", "Humphrey Shi", "Longyin Wen"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2405.05949" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub repository URL provided: https://github.com/SHI-Labs/CuMo. Abstract and introduction both state code and model weights are open-sourced." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states all training data are publicly accessible open-sourced datasets (LLaVA-558K, ALLaVA, ShareGPT4V, etc.). Evaluation benchmarks are all public." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section found in the paper. Appendix B mentions hyperparameters but not software dependencies or library versions." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are provided in the paper itself. The GitHub repo is referenced but the paper does not include a reproduction guide." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results in Tables 1-7 are reported as point estimates without confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims CuMo 'outperforms' baselines based solely on comparing numbers in tables without any statistical significance tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Results are reported with baseline context across multiple benchmarks (e.g., CuMo Mistral-7B scores 73.9 on SQA vs LLaVA-NeXT Mistral-7B at 72.8), allowing readers to assess magnitude of improvements." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for benchmark sizes or discussion of whether sample sizes are adequate for the claimed differences." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Table 1 notes some numbers are 'averaged by three inference runs of querying GPT API' (marked with †) but no standard deviation or variance is reported. Most results appear to be single-run." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Extensive baselines in Table 1 including InstructBLIP, Qwen-VL, LLaVA-v1.5, LLaVA-NeXT, Mini-Gemini, MM1, SPHINX, and others." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include LLaVA-NeXT (2024), Mini-Gemini (2024), MM1 (2024), and other recent models contemporary to the submission." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Extensive ablation studies in Tables 3-7 covering MLP-MoE configurations, CLIP-MoE, LLM-MoE, multi-resolution features, and pre-finetuning stage." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Evaluated on 12 benchmarks: SQA-IMG, TextVQA, GQA, POPE, MME, MMBench, MMVet, VQAv2, LLaVA-Wild, SEED-IMG, MMMU, MathVista." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is a model architecture paper evaluated on established automated benchmarks. Human evaluation is not standard or necessary for the claims made." 
84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "All evaluation benchmarks (VQAv2, GQA, MME, etc.) have standard test/val splits. The paper follows LLaVA evaluation settings on these established benchmarks." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down across 12 individual benchmarks and model size groups (7B, 13B, 7B MoE) in Table 1." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 4.4 qualitative analysis shows hallucination examples (e.g., '2 characters standing on the table') and acknowledges 'the need for further investigation to mitigate hallucinations.'" 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 3 shows training MoE from scratch leads to 'clear performance drop.' Table 5 shows upcycled Mistral MoE is significantly worse than pre-trained Mixtral. Top 2-in-8 showed slight decline vs Top 2-in-4." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of outperforming state-of-the-art within model size groups are supported by Table 1 results, though 'outperforms' is sometimes marginal (< 1 point on some benchmarks)." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims about MoE components are supported by controlled ablation studies (Tables 3-7) that systematically add/remove components while keeping other factors fixed." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Claims are bounded to 'within each model size group' and specific benchmarks tested. The paper does not overclaim generality beyond the evaluated settings." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for performance gains. For example, the additional training data in the final model (1.65M vs 665K for baselines) could partly explain improvements, but this confound is not discussed." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Specific models specified: CLIP ViT-L, Mistral-7B, Mixtral 8x7B. GPT evaluation uses gpt-4-0613 and gpt-3.5-turbo with specific version identifiers." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "The paper trains and fine-tunes models rather than prompting LLMs. Evaluation follows LLaVA settings which convert data into visual instructions, but this is standard benchmark protocol, not prompt engineering." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Learning rates specified (1e-3, 2e-5, 2e-6), auxiliary loss coefficients (αb=0.1, αz=0.01), greedy decoding, and Appendix B referenced for additional hyperparameters." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. This is a model architecture and training paper." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.1 documents training datasets for each stage, with total data sizes (558K pre-training, ~1.65M instruction tuning). Appendix A provides detailed dataset breakdown." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "No dedicated limitations or threats-to-validity section. The conclusion mentions hallucinations but does not substantively discuss limitations." 
160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity discussed." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit statements about what the results do not show or what settings are excluded from the claims." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "All training datasets are publicly available open-source datasets. Evaluation benchmarks are public with standard evaluation protocols." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4.1 describes the training datasets used at each stage with references to their original papers." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. Data sources are standard public benchmarks and datasets." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The three-stage training pipeline is documented (pre-training → pre-finetuning → visual instruction tuning) with datasets specified for each stage." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding disclosure or acknowledgments section listing grants. The acknowledgments only thank individuals for discussions." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations clearly listed: SHI Labs @ Georgia Tech & UIUC, and ByteDance Inc." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "ByteDance (a major tech company) employs most authors and has commercial interest in multimodal AI. 
No discussion of whether funding is independent of outcomes." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement found in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper uses Mistral-7B and Mixtral 8x7B but does not state their training data cutoff dates." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether evaluation benchmark data could have appeared in the LLM or CLIP pre-training data." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "Benchmarks like VQAv2 (2017), GQA (2019) predate the models' training. No discussion of contamination risk." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The paper claims 'neglectable additional activated parameters during inference' but does not report actual inference latency, throughput, or cost measurements." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No mention of GPU hours, hardware used for training, or total computational budget despite training multiple large models across three stages." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CuMo Mistral-7B outperforms state-of-the-art 7B multimodal LLMs across multiple benchmarks.", 286 "evidence": "Table 1 shows CuMo Mistral-7B achieves top or near-top scores among 7B models on SQA (73.9), TextVQA (67.0), MMBench-EN (73.0), MMVet (51.0), MMMU (39.1).", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Co-upcycling is essential for stable MoE training in the vision encoder and MLP connector.", 291 "evidence": "Table 3 shows training MoE from scratch drops performance (SQA 68.1 vs baseline 72.8), while upcycling recovers it (73.7). 
Table 4 confirms CLIP-MoE from scratch fails to converge.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "Pre-trained MoE LLMs (Mixtral) significantly outperform upcycled LLMs.", 296 "evidence": "Table 5 shows Mixtral 8x7B outperforms upcycled Mistral 8x7B across all benchmarks (e.g., TextVQA 60.6 vs 56.4, MMVet 40.0 vs 35.7).", 297 "supported": "strong" 298 }, 299 { 300 "claim": "CuMo enhances multimodal LLMs with negligible additional activated parameters during inference.", 301 "evidence": "Table 1 shows CuMo Mistral-7B has 7.8B activated parameters vs 7.6B for LLaVA-NeXT Mistral-7B, but no actual inference cost measurements are provided.", 302 "supported": "moderate" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval"], 306 "key_findings": "CuMo introduces co-upcycled sparse Mixture-of-Experts blocks into the vision encoder and MLP connector of multimodal LLMs, achieving performance competitive with or superior to that of larger models while adding minimal activated parameters. The co-upcycling initialization strategy is critical for training stability, as MoE blocks trained from scratch fail to converge. Pre-trained MoE LLMs (Mixtral) significantly outperform upcycled dense LLMs, so CuMo applies MoE only to the vision side. Ablation studies across six configurations validate each design choice.", 307 "red_flags": [ 308 { 309 "flag": "No variance or uncertainty quantification", 310 "detail": "Results are reported as single-run point estimates (except three GPT-evaluated benchmarks averaged over 3 runs without std dev). Performance differences between models are often within 1-2 points with no way to assess statistical significance." 311 }, 312 { 313 "flag": "Training data confound in main results", 314 "detail": "The final CuMo models in Table 1 use ~1.65M training examples while some baselines (LLaVA-v1.5) use only 665K. 
Table 2 partially addresses this with a limited-data comparison, but the headline results conflate architecture improvements with additional training data." 315 }, 316 { 317 "flag": "Company evaluating its own product", 318 "detail": "Most authors are from ByteDance, which has commercial interests in multimodal AI. No conflict of interest statement is provided." 319 }, 320 { 321 "flag": "No compute budget reported", 322 "detail": "Training multiple large models across three stages likely required substantial compute, but no GPU hours or hardware details are provided, making cost-benefit assessment impossible." 323 } 324 ], 325 "cited_papers": [ 326 { 327 "title": "LLaVA: Visual Instruction Tuning", 328 "authors": ["Haotian Liu", "Chunyuan Li", "Qingyang Wu", "Yong Jae Lee"], 329 "year": 2023, 330 "relevance": "Foundational multimodal LLM approach that CuMo builds upon; demonstrates visual instruction tuning methodology." 331 }, 332 { 333 "title": "Improved Baselines with Visual Instruction Tuning (LLaVA-v1.5)", 334 "authors": ["Haotian Liu", "Chunyuan Li", "Yuheng Li", "Yong Jae Lee"], 335 "year": 2023, 336 "relevance": "Primary baseline architecture for CuMo; establishes MLP connector and training recipe." 337 }, 338 { 339 "title": "LLaVA-NeXT: Improved reasoning, OCR, and world knowledge", 340 "authors": ["Haotian Liu", "Chunyuan Li", "Yuheng Li", "Bo Li", "Yuanhan Zhang", "Sheng Shen", "Yong Jae Lee"], 341 "year": 2024, 342 "relevance": "Key contemporary baseline for multimodal LLM benchmarking." 343 }, 344 { 345 "title": "MoE-LLaVA: Mixture of Experts for Large Vision-Language Models", 346 "authors": ["Bin Lin"], 347 "year": 2024, 348 "relevance": "Related work applying MoE to multimodal LLMs in the LLM component rather than vision side." 
349 }, 350 { 351 "title": "MM1: Methods, Analysis & Insights from Multimodal LLM Pre-training", 352 "authors": ["Brandon McKinzie"], 353 "year": 2024, 354 "relevance": "Strong private baseline that CuMo compares against; summarizes essential steps for building multimodal LLMs." 355 }, 356 { 357 "title": "Sparse Upcycling: Training Mixture-of-Experts from Dense Checkpoints", 358 "authors": ["Aran Komatsuzaki"], 359 "year": 2022, 360 "relevance": "Core technique that CuMo adapts for vision encoder and MLP connector MoE initialization." 361 }, 362 { 363 "title": "Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity", 364 "authors": ["William Fedus", "Barret Zoph", "Noam Shazeer"], 365 "year": 2022, 366 "relevance": "Foundational MoE scaling work relevant to understanding efficient LLM scaling approaches." 367 } 368 ] 369 }