scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21947B)
      1 {
      2   "paper": {
      3     "title": "MoSE: Mixture of Slimmable Experts for Efficient and Adaptive Language Models",
      4     "authors": ["Nurbek Tastan", "Stefanos Laskaridis", "Karthik Nandakumar", "Samuel Horváth"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.06154"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "MoSE extends Mixture-of-Experts architectures by making each expert slimmable, enabling a continuous accuracy-compute trade-off at inference time. Across GPT2-Small, GPT2-Standard, and GPT2-Medium models trained on OpenWebText, MoSE with test-time training consistently shifts the Pareto frontier, achieving 20-36% FLOP savings at comparable perplexity to standard MoE. The learned sharpness parameter γ transfers across datasets (OpenWebText to LAMBADA) without re-calibration.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All datasets used (OpenWebText, WikiText-103, LAMBADA, WSC) are publicly available standard benchmarks."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper states 'NVIDIA A100-SXM4-40GB GPU machine' and '4 GPUs' with DDP but provides no software environment details (library versions, requirements.txt, etc.)."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "All results in Table 1 and Figures 4-10 are reported as point estimates with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims MoSE 'matches or improves upon' MoE but provides no statistical significance tests for any comparison."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "FLOP savings are reported as percentages with baseline context (e.g., '20.3% FLOPs', '35.9% FLOPs') and absolute perplexity values with baselines are given in Table 1."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification for the choice of model sizes, training token budgets, or number of benchmark evaluation examples."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be single-run."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Standard MoE is used as the baseline throughout, with direct comparisons in Table 1 and all Pareto frontier figures."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "The only baseline is standard MoE. Recent elastic MoE works (RoE, EMoE) are discussed in Related Work but not compared experimentally."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Figure 10 ablates inference modes (uniform-width vs. normalized-probability vs. TTT shared vs. TTT layer-wise). Figures 4-8 systematically vary model size, routing configuration, and token budget."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Perplexity on OpenWebText and WikiText-103, accuracy and perplexity on LAMBADA, accuracy on WSC, and MFLOPs per token are all reported."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is irrelevant for this architecture/efficiency paper evaluating language modeling perplexity and zero-shot benchmarks."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Results are on held-out test splits: 'a held-out split of OpenWebText', standard evaluation splits of WikiText-103, LAMBADA (5153 entries), and WSC (273 entries)."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by model scale (Small/Standard/Medium), routing configuration (E8A2/E8A4/E16A4), training budget (3B/15B tokens), and inference mode."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No failure cases or scenarios where MoSE underperforms MoE are discussed."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "Every experiment shows MoSE matching or outperforming MoE. No negative results or failed approaches are reported."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims MoSE 'matches or improves upon standard MoE at full width' and 'achieving comparable performance with significantly fewer FLOPs', both supported by Table 1 and Figures 4-10."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims like 'width identification improves performance' are supported by controlled ablations (Figure 10) comparing inference modes with all other variables held constant."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'Efficient and Adaptive Language Models' broadly, but experiments are limited to GPT2-scale models (55M-1B parameters) trained on OpenWebText. Section 5 acknowledges 'limited to small-scale LLMs' but the title and abstract do not bound this."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No alternative explanations are discussed for why MoSE outperforms MoE. For example, whether the multi-width training acts as a regularizer is not explored."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper measures perplexity and zero-shot accuracy and presents them as such, without overclaiming these as measures of broader capabilities. FLOPs per token is used as the compute proxy, which is well-matched to the efficiency claims."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "All models are trained from scratch with exact architectures specified in Table 2 (layers, heads, hidden dimensions, parameter counts). No pre-trained API models are used."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The paper does not use prompting. Models are evaluated on standard language modeling (perplexity) and zero-shot benchmarks."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix A.2 provides detailed hyperparameters: AdamW lr=6e-4, weight decay=0.1, β=(0.9,0.95), gradient clipping=1.0, warmup=200 iterations, load balancing loss=0.01, router z-loss=0.001, wmin=0.25, wmax=1.0, step size=0.05, TTT lr=0.01 with SGD."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used in this architecture paper."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No data preprocessing details are provided. The paper simply states models are 'pre-trained on the OpenWebText corpus' without describing tokenization, filtering, or data preparation steps."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5 'Discussion & Limitations' discusses limitations including scale constraints and open questions about post-training width adaptation."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5 raises a specific threat: 'Our analysis is limited to small-scale LLMs, which we are able to train from scratch' and notes that post-training slimmability 'remains an open question.'"
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5 explicitly states the scope limitation: experiments are on small-scale LLMs trained from scratch, and whether multi-width operation can be instilled at post-training is left for future work."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw experimental data (model checkpoints, training logs, per-example predictions) is made available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Training data (OpenWebText) and evaluation datasets (WikiText-103, LAMBADA, WSC) are standard public benchmarks with citations and dataset sizes provided."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data sources are standard public benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No documentation of the data pipeline from raw corpus to training batches (tokenization, sequence packing, etc.)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: MBZUAI, Amazon Science (with note 'Work done independently of Amazon'), and Michigan State University."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "Models are trained from scratch on OpenWebText with specified token budgets (3B-15B tokens), so the training data is fully known and controlled."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether OpenWebText training data overlaps with WikiText-103, LAMBADA, or WSC evaluation sets."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "LAMBADA and WSC are public benchmarks that could appear in OpenWebText (web-sourced). No contamination analysis is performed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Inference cost is reported throughout as MFLOPs per token, with detailed breakdowns in Figure 12 and Pareto frontiers showing compute-quality trade-offs."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Appendix A.2 states 'NVIDIA A100-SXM4-40GB GPU machine' with '4 GPUs' using DDP. Training token budgets (3B-15B) and iteration counts are in Table 2."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No multi-seed results reported. All experiments appear to be single-run."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs per configuration is never stated."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search budget is disclosed despite many hyperparameters being set (learning rate, loss coefficients, width range, TTT calibration details)."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The selection of configurations (e.g., wmin=0.25, step size=0.05, TTT calibration of 50 batches) is not justified or explained as selected from alternatives."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Authors implement both MoSE and the MoE baseline. No acknowledgment of potential bias from implementing their own baselines."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "The entire paper is organized around compute-vs-performance trade-offs, with Pareto frontiers (Figures 4-10) explicitly showing performance as a function of MFLOPs per token."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether perplexity and zero-shot accuracy on small benchmarks (WSC has only 273 examples) actually measure the capabilities claimed."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved. Models are evaluated directly on language modeling and zero-shot tasks."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "Models are trained on OpenWebText, which is web-sourced. LAMBADA (2016) and WSC (2012) predate OpenWebText's collection, so their contents could have been included in the crawl. The paper does not discuss whether benchmark solutions appeared in the training data."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether evaluation setups leak information."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Possible overlap between the OpenWebText validation split and the external benchmarks is not discussed."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "MoSE at full width (w=1.0) matches or improves upon standard MoE across all model scales and training budgets.",
    364       "evidence": "Table 1 shows MoSE (w=1.0) matching or beating MoE on OpenWebText, WikiText-103, LAMBADA, and WSC across GPT2-Small and GPT2-Standard at 3B and 15B tokens.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "MoSE with test-time training achieves comparable performance to MoE with 20-36% fewer FLOPs.",
    369       "evidence": "Figures 4-5 report FLOP savings: 20.3% (GPT2-Small 3B), 35.9% (GPT2-Standard 3B), 30.6% (GPT2-Medium 7.5B), 29.1% (GPT2-Small 15B), 24.9% (GPT2-Standard 15B).",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "The learned sharpness parameter γ transfers across datasets without re-calibration.",
    374       "evidence": "Figure 9 shows γ calibrated on OpenWebText transfers to LAMBADA, with TTT variants outperforming uniform-width on both accuracy and perplexity.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "MoSE training does not destabilize MoE pre-training.",
    379       "evidence": "Figure 3 shows MoSE closely tracking MoE training loss curves across iterations for both GPT2-Small and GPT2-Standard.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "MoSE degrades gracefully when inference-time routing differs from training-time routing.",
    384       "evidence": "Figure 8 shows a single E16A4-trained checkpoint evaluated at E16A2, E16A3, and E16A4, with smooth Pareto frontier degradation.",
    385       "supported": "moderate"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "No variance or error bars",
    391       "detail": "All results appear to be single-run without any uncertainty quantification. For small benchmarks like WSC (273 examples), point estimate differences could easily be within noise."
    392     },
    393     {
    394       "flag": "No comparison with contemporary elastic MoE methods",
    395       "detail": "RoE and EMoE are discussed in Related Work as direct competitors but are not compared experimentally, despite being concurrent/recent work addressing the same problem."
    396     },
    397     {
    398       "flag": "Small-scale experiments only",
    399       "detail": "All models are 55M-1B parameters, far below modern LLM scale. Whether MoSE works at scale is unknown, acknowledged by the authors but not reflected in the broad claims of the title/abstract."
    400     },
    401     {
    402       "flag": "Every experiment shows improvement",
    403       "detail": "MoSE matches or outperforms MoE in every single reported comparison. No negative results or failure modes are shown, which is suspicious for a method applied across many configurations."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "Switch transformers: Scaling to trillion parameter models with simple and efficient sparsity",
    409       "authors": ["W. Fedus", "B. Zoph", "N. Shazeer"],
    410       "year": 2022,
    411       "relevance": "Foundational MoE architecture for scaling language models with sparse expert selection."
    412     },
    413     {
    414       "title": "Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer",
    415       "authors": ["N. Shazeer", "A. Mirhoseini", "K. Maziarz"],
    416       "year": 2017,
    417       "relevance": "Introduced sparsely-gated MoE layers enabling conditional computation in large models."
    418     },
    419     {
    420       "title": "Slimmable Neural Networks",
    421       "authors": ["J. Yu", "L. Yang", "N. Xu", "J. Yang", "T. Huang"],
    422       "year": 2019,
    423       "relevance": "Pioneered slimmable networks enabling runtime width adjustment, core technique extended by MoSE."
    424     },
    425     {
    426       "title": "Matryoshka representation learning",
    427       "authors": ["A. Kusupati", "G. Bhatt", "A. Rege"],
    428       "year": 2022,
    429       "relevance": "Nested/ordered representation learning relevant to MoSE's slimmable expert structure."
    430     },
    431     {
    432       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversations",
    433       "authors": ["Q. Wu", "G. Bansal", "J. Zhang"],
    434       "year": 2024,
    435       "relevance": "Referenced for agentic settings where MoSE could adaptively select model width based on task difficulty."
    436     },
    437     {
    438       "title": "Are emergent abilities of large language models a mirage?",
    439       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    440       "year": 2023,
    441       "relevance": "Cited regarding accuracy metric sensitivity and non-monotonic behavior in discrete evaluations."
    442     },
    443     {
    444       "title": "Fast Inference from Transformers via Speculative Decoding",
    445       "authors": ["Y. Leviathan", "M. Kalman", "Y. Matias"],
    446       "year": 2023,
    447       "relevance": "MoSE could enable self-speculation using reduced-width execution as a draft model."
    448     },
    449     {
    450       "title": "Elastic MoE: Unlocking the inference-time scalability of mixture-of-experts",
    451       "authors": ["N. Gu", "Z. Zhang", "Y. Feng"],
    452       "year": 2025,
    453       "relevance": "Concurrent elastic MoE work addressing the same rigidity of top-k routing at inference time."
    454     }
    455   ]
    456 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs