scan.json (21947B)
1 { 2 "paper": { 3 "title": "MoSE: Mixture of Slimmable Experts for Efficient and Adaptive Language Models", 4 "authors": ["Nurbek Tastan", "Stefanos Laskaridis", "Karthik Nandakumar", "Samuel Horváth"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.06154" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "MoSE extends Mixture-of-Experts architectures by making each expert slimmable, enabling a continuous accuracy-compute trade-off at inference time. Across GPT2-Small, Standard, and Medium models trained on OpenWebText, MoSE with test-time training consistently shifts the Pareto frontier, achieving 20-36% FLOP savings at comparable perplexity to standard MoE. The learned sharpness parameter γ transfers across datasets (OpenWebText to LAMBADA) without re-calibration.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "All datasets used (OpenWebText, WikiText-103, LAMBADA, WSC) are publicly available standard benchmarks." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper states 'NVIDIA A100-SXM4-40GB GPU machine' and '4 GPUs' with DDP but provides no software environment details (library versions, requirements.txt, etc.)." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions, README, or scripts are provided." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results in Table 1 and Figures 4-10 are reported as point estimates with no confidence intervals or error bars." 
41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims MoSE 'matches or improves upon' MoE but provides no statistical significance tests for any comparison." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "FLOP savings are reported as percentages with baseline context (e.g., '20.3% FLOPs', '35.9% FLOPs') and absolute perplexity values with baselines are given in Table 1." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification for the choice of model sizes, training token budgets, or number of benchmark evaluation examples." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Standard MoE is used as the baseline throughout, with direct comparisons in Table 1 and all Pareto frontier figures." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": false, 72 "justification": "The only baseline is standard MoE. Recent elastic MoE works (RoE, EMoE) are discussed in Related Work but not compared experimentally." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Figure 10 ablates inference modes (uniform-width vs. normalized-probability vs. TTT shared vs. TTT layer-wise). Figures 4-8 systematically vary model size, routing configuration, and token budget." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Perplexity on OpenWebText and WikiText-103, accuracy and perplexity on LAMBADA, accuracy on WSC, and MFLOPs per token are all reported." 
83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "Human evaluation is irrelevant for this architecture/efficiency paper evaluating language modeling perplexity and zero-shot benchmarks." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "Results are on held-out test splits: 'a held-out split of OpenWebText', standard evaluation splits of WikiText-103, LAMBADA (5153 entries), and WSC (273 entries)." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by model scale (Small/Standard/Medium), routing configuration (E8A2/E8A4/E16A4), training budget (3B/15B tokens), and inference mode." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "No failure cases or scenarios where MoSE underperforms MoE are discussed." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": false, 107 "justification": "Every experiment shows MoSE matching or outperforming MoE. No negative results or failed approaches are reported." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims MoSE 'matches or improves upon standard MoE at full width' and 'achieving comparable performance with significantly fewer FLOPs', both supported by Table 1 and Figures 4-10." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "Causal claims like 'width identification improves performance' are supported by controlled ablations (Figure 10) comparing inference modes with all other variables held constant." 
120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims 'Efficient and Adaptive Language Models' broadly, but experiments are limited to GPT2-scale models (55M-1B parameters) trained on OpenWebText. Section 5 acknowledges 'limited to small-scale LLMs' but the title and abstract do not bound this." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No alternative explanations are discussed for why MoSE outperforms MoE. For example, whether the multi-width training acts as a regularizer is not explored." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper measures perplexity and zero-shot accuracy and presents them as such, without overclaiming these as measures of broader capabilities. FLOPs per token is used as the compute proxy, which is well-matched to the efficiency claims." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "All models are trained from scratch with exact architectures specified in Table 2 (layers, heads, hidden dimensions, parameter counts). No pre-trained API models are used." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "The paper does not use prompting. Models are evaluated on standard language modeling (perplexity) and zero-shot benchmarks." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Appendix A.2 provides detailed hyperparameters: AdamW lr=6e-4, weight decay=0.1, β=(0.9, 0.95), gradient clipping=1.0, warmup=200 iterations, load balancing loss=0.01, router z-loss=0.001, w_min=0.25, w_max=1.0, step size=0.05, TTT lr=0.01 with SGD." 
152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used in this architecture paper." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": false, 161 "justification": "No data preprocessing details are provided. The paper simply states models are 'pre-trained on the OpenWebText corpus' without describing tokenization, filtering, or data preparation steps." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 5 'Discussion & Limitations' discusses limitations including scale constraints and open questions about post-training width adaptation." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 5 raises a specific threat: 'Our analysis is limited to small-scale LLMs, which we are able to train from scratch' and notes that post-training slimmability 'remains an open question.'" 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 5 explicitly states the scope limitation: experiments are on small-scale LLMs trained from scratch, and whether multi-width operation can be instilled at post-training is left for future work." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw experimental data (model checkpoints, training logs, per-example predictions) is made available." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Training data (OpenWebText) and evaluation datasets (WikiText-103, LAMBADA, WSC) are standard public benchmarks with citations and dataset sizes provided." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. 
Data sources are standard public benchmarks." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "No documentation of the data pipeline from raw corpus to training batches (tokenization, sequence packing, etc.)." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: MBZUAI, Amazon Science (with note 'Work done independently of Amazon'), and Michigan State University." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": true, 229 "justification": "Models are trained from scratch on OpenWebText with specified token budgets (3B-15B tokens), so the training data is fully known and controlled." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether OpenWebText training data overlaps with WikiText-103, LAMBADA, or WSC evaluation sets." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "LAMBADA and WSC are public benchmarks that could appear in OpenWebText (web-sourced). No contamination analysis is performed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 
247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "Inference cost is reported throughout as MFLOPs per token, with detailed breakdowns in Figure 12 and Pareto frontiers showing compute-quality trade-offs." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Appendix A.2 states 'NVIDIA A100-SXM4-40GB GPU machine' with '4 GPUs' using DDP. Training token budgets (3B-15B) and iteration counts are in Table 2." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No multi-seed results reported. All experiments appear to be single-run." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs per configuration is never stated." 
301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search budget is disclosed despite many hyperparameters being set (learning rate, loss coefficients, width range, TTT calibration details)." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "The selection of configurations (e.g., w_min=0.25, step size=0.05, TTT calibration of 50 batches) is neither justified nor shown to have been chosen from among alternatives." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "Authors implement both MoSE and the MoE baseline. No acknowledgment of potential bias from implementing their own baselines." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": true, 325 "justification": "The entire paper is organized around compute-vs-performance trade-offs, with Pareto frontiers (Figures 4-10) explicitly showing performance as a function of MFLOPs per token." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether perplexity and zero-shot accuracy on small benchmarks (WSC has only 273 examples) actually measure the capabilities claimed." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "No scaffolding is involved. Models are evaluated directly on language modeling and zero-shot tasks." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "Models are trained on OpenWebText, which is web-sourced. 
LAMBADA (2016) and WSC (2012) predate OpenWebText's collection, so their contents could have been included in the web-sourced corpus. The paper does not discuss whether these benchmarks' passages or answers appear in the training data." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether evaluation setups leak information." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Possible overlap between the OpenWebText validation split and the external benchmarks is not discussed." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention method is applied." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "MoSE at full width (w=1.0) matches or improves upon standard MoE across all model scales and training budgets.", 364 "evidence": "Table 1 shows MoSE (w=1.0) matching or beating MoE on OpenWebText, WikiText-103, LAMBADA, and WSC across GPT2-Small and GPT2-Standard at 3B and 15B tokens.", 365 "supported": "moderate" 366 }, 367 { 368 "claim": "MoSE with test-time training achieves comparable performance to MoE with 20-36% fewer FLOPs.", 369 "evidence": "Figures 4-5 report FLOP savings: 20.3% (GPT2-Small 3B), 35.9% (GPT2-Standard 3B), 30.6% (GPT2-Medium 7.5B), 29.1% (GPT2-Small 15B), 24.9% (GPT2-Standard 15B).", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "The learned sharpness parameter γ transfers across datasets without re-calibration.", 374 "evidence": "Figure 9 shows γ calibrated on OpenWebText transfers to LAMBADA, with TTT variants outperforming uniform-width on both accuracy and perplexity.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "MoSE training does not destabilize MoE pre-training.", 379 "evidence": "Figure 3 shows MoSE closely tracking MoE training loss curves across iterations for both GPT2-Small and GPT2-Standard.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "MoSE degrades gracefully when inference-time routing differs from 
training-time routing.", 384 "evidence": "Figure 8 shows a single E16A4-trained checkpoint evaluated at E16A2, E16A3, and E16A4, with smooth Pareto frontier degradation.", 385 "supported": "moderate" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "No variance or error bars", 391 "detail": "All results appear to be single-run without any uncertainty quantification. For small benchmarks like WSC (273 examples), point estimate differences could easily be within noise." 392 }, 393 { 394 "flag": "No comparison with contemporary elastic MoE methods", 395 "detail": "RoE and EMoE are discussed in Related Work as direct competitors but are not compared experimentally, despite being concurrent/recent work addressing the same problem." 396 }, 397 { 398 "flag": "Small-scale experiments only", 399 "detail": "All models are 55M-1B parameters, far below modern LLM scale. Whether MoSE works at scale is unknown, acknowledged by the authors but not reflected in the broad claims of the title/abstract." 400 }, 401 { 402 "flag": "Every experiment shows improvement", 403 "detail": "MoSE matches or outperforms MoE in every single reported comparison. No negative results or failure modes are shown, which is suspicious for a method applied across many configurations." 404 } 405 ], 406 "cited_papers": [ 407 { 408 "title": "Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity", 409 "authors": ["W. Fedus", "B. Zoph", "N. Shazeer"], 410 "year": 2022, 411 "relevance": "Foundational MoE architecture for scaling language models with sparse expert selection." 412 }, 413 { 414 "title": "Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer", 415 "authors": ["N. Shazeer", "A. Mirhoseini", "K. Maziarz"], 416 "year": 2017, 417 "relevance": "Introduced sparsely-gated MoE layers enabling conditional computation in large models." 418 }, 419 { 420 "title": "Slimmable Neural Networks", 421 "authors": ["J. Yu", "L. Yang", "N. Xu", "J. 
Yang", "T. Huang"], 422 "year": 2019, 423 "relevance": "Pioneered slimmable networks enabling runtime width adjustment, core technique extended by MoSE." 424 }, 425 { 426 "title": "Matryoshka representation learning", 427 "authors": ["A. Kusupati", "G. Bhatt", "A. Rege"], 428 "year": 2022, 429 "relevance": "Nested/ordered representation learning relevant to MoSE's slimmable expert structure." 430 }, 431 { 432 "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations", 433 "authors": ["Q. Wu", "G. Bansal", "J. Zhang"], 434 "year": 2024, 435 "relevance": "Referenced for agentic settings where MoSE could adaptively select model width based on task difficulty." 436 }, 437 { 438 "title": "Are emergent abilities of large language models a mirage?", 439 "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"], 440 "year": 2023, 441 "relevance": "Cited regarding accuracy metric sensitivity and non-monotonic behavior in discrete evaluations." 442 }, 443 { 444 "title": "Fast Inference from Transformers via Speculative Decoding", 445 "authors": ["Y. Leviathan", "M. Kalman", "Y. Matias"], 446 "year": 2023, 447 "relevance": "MoSE could enable self-speculation using reduced-width execution as a draft model." 448 }, 449 { 450 "title": "Elastic MoE: Unlocking the inference-time scalability of mixture-of-experts", 451 "authors": ["N. Gu", "Z. Zhang", "Y. Feng"], 452 "year": 2025, 453 "relevance": "Concurrent elastic MoE work addressing the same rigidity of top-k routing at inference time." 454 } 455 ] 456 }