scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25099B)
      1 {
      2   "paper": {
      3     "title": "Random Scaling of Emergent Capabilities",
      4     "authors": ["Rosie Zhao", "Tian Qin", "David Alvarez-Melis", "Sham Kakade", "Naomi Saphra"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2502.17356"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Different random seeds produce either smooth or emergent scaling trends, even with identical data and hyperparameters. Breakthroughs are driven by bimodal performance distributions across seeds that shift continuously with scale, not by deterministic capacity thresholds. This bimodality persists under continuous loss metrics, challenging both the 'emergence' and 'mirage' positions. The minimum capacity for a capability appears at scales well before most individual seeds exhibit breakthrough.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Synthetic tasks are procedurally generated from described algorithms. MMLU is a standard public benchmark (Hendrycks et al., 2021). The question formation data uses McCoy et al. (2018) grammars. All data sources are publicly available or reproducible from descriptions."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions 'single 40GB A100 GPU' (Appx. A) but provides no requirements.txt, Dockerfile, or detailed library version specifications."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Hyperparameters are listed in Appendix A but there is no guide to reproduce the full experimental pipeline."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Fig. 3 reports '95% confidence intervals with 1000 bootstrapped samples' for the mean of successful runs."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Hartigan's Dip Test used to confirm bimodality is statistically significant (p < 0.001) for MMLU (Sec. 3.3) and question formation (Sec. 4.2)."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Results are reported with full distributional context: percentage accuracies at each scale, fraction of successful runs, and mean of successful vs. all runs (Fig. 3, Fig. 6). The magnitude of effects is clear from the histograms and summary statistics."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "250 seeds for count, 200 for addition, and 80 for LM experiments are used, but no justification or power analysis is provided for why these numbers were chosen."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The entire paper characterizes variance across seeds. Full distributions are plotted as histograms, bootstrap CIs are reported, and Wasserstein distances track distributional changes across scales."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper systematically compares across model scales (width, depth), data compositions, and random seeds. It also compares against the random guess baseline (25%) for MMLU and base model performance."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Uses Qwen2.5 models (2024), references and engages with contemporary work including Schaeffer et al. (2024) and Madaan et al. (2024)."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Width and depth are independently varied while holding the other fixed (Fig. 2, Fig. 3). Data composition is varied systematically (Fig. 5, Fig. 6). These controlled variations function as ablations."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Multiple metrics used: exact match accuracy, continuous loss metric (Eq. 1), NLL loss, minimum token probability (Eq. 4), Wasserstein-L2 distance, breakthroughness, and linearity."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is irrelevant to claims about distributional properties of model training outcomes across random seeds."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Evaluation uses out-of-distribution test sets: length 40 for addition (trained on up to 35), length 60 for count (trained on up to 30), and OOD question formation test set requiring hierarchical rule."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results broken down by model width, depth, data composition ratio, and task (addition, count, MMLU, question formation). Distributions shown at each scale."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Failure modes extensively discussed: models that fail to generalize at depth=1 (Fig. 12), U-shaped scaling in count task (Appx. C.2), failed runs in bimodal distributions, and MMLU format failure cluster near 0%."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "U-shaped (inverse) scaling reported for count task (Appx. C.2). Models at certain scales consistently fail. Even at 100% hierarchical data, some seeds fail to learn the hierarchical rule (Fig. 6)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims about bimodal distributions, seed-dependent emergence vs. smooth scaling, persistence under continuous metrics, and capacity thresholds preceding breakthrough are all supported by experimental results in Sections 2-4."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims ('breakthroughs are driven by continuous changes in the probability distribution') are supported by controlled experiments that vary single variables (width, depth, data composition, seed) while holding others fixed."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper explicitly states: 'Since training numerous seeds is prohibitively expensive at large scales, we study partially reinitialized LLMs and toy models.' Claims are framed around the tested settings."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper explicitly engages with the 'mirage' explanation (Schaeffer et al., 2024) that breakthroughs are metric thresholding artifacts, tests this with continuous metrics, and discusses how their findings challenge both the emergence and mirage camps."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper explicitly distinguishes between exact match accuracy (thresholded proxy) and continuous metrics (NLL, continuous error score in Eq. 1), and demonstrates that bimodality is not an artifact of the proxy metric choice."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Qwen2.5-0.5B and Qwen2.5-1.5B are specified by name and size. Synthetic models are fully described architecturally (RoPE, decoder-only Transformer, specific hidden dimensions and layer counts)."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompting is used. Models are trained from scratch or via continued pretraining, not prompted."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix A provides learning rates, weight decay, batch sizes, context lengths, training steps, optimizers, and learning rate schedules for all experiments."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used in any experiment."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Synthetic data generation is described (Sec. 2.1, Appx. A). MMLU/C4 mixing ratios specified (Sec. 3.1). Question formation data generation from context-free grammars described (Sec. 4.1, Appx. E)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No dedicated limitations section. The Discussion (Sec. 5) contextualizes results but does not systematically enumerate limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No dedicated threats-to-validity discussion. The paper acknowledges that 'training numerous seeds is prohibitively expensive at large scales' and uses reinitialization as a proxy, but does not systematically discuss specific threats."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper states 'Since training numerous seeds is prohibitively expensive at large scales, we study partially reinitialized LLMs and toy models' and 'our continued training dataset... is insufficient to recover the base model's full MMLU performance.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No data downloads, model checkpoints, or supplementary data files are provided for independent verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Synthetic data generation procedures fully described (counting, reverse-order addition with index hints). MMLU and C4 are standard datasets with citations. Question formation grammar described in Appendix E."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. All data is synthetic or from standard benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline from data generation to model training to evaluation is documented: data sampling (i.i.d., in-context learning examples), training procedure, and evaluation on OOD test sets."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Acknowledgments list funding from Chan Zuckerberg Initiative, Office of Naval Research (N00014-22-1-2377), NSF (#IIS 2229881, DMS-2134157), DARPA (W911NF2010021), DOE (DE-SC0022199), Kempner Institute, Aramont Fellowship, and FAS Dean's Fund."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All authors listed as Harvard University and/or Kempner Institute. They are not evaluating their own product."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Funders are government agencies (ONR, NSF, DARPA, DOE) and foundations (CZI) with no financial stake in whether emergence is real or a mirage."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is included in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "Qwen2.5 models are used but their training data cutoff date is not stated. The paper does not discuss when Qwen2.5's training data was collected."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether Qwen2.5's pretraining data includes MMLU examples, despite MMLU being a widely known public benchmark."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "MMLU was published in 2021 and Qwen2.5 was trained after that. No contamination analysis or discussion is provided."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in any experiment."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in any experiment."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in any experiment."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in any experiment."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in any experiment."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in any experiment."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in any experiment."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "Training wall-clock times are mentioned (2 hours for count, 6 hours for addition) but no inference cost or latency is reported."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Appendix A states: 'all model scales can be run on a single 40GB A100 GPU with gradient accumulation; for count, runs can finish within 2 hours and for addition, runs can finish within 6 hours.' With 250 and 200 seeds across multiple scales, total compute is estimable."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "The entire paper is a study of seed sensitivity. Results reported across 200-250 seeds for synthetic tasks and 80 seeds for LM experiments."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Explicitly stated: 250 seeds for count (Appx. A), 200 seeds for reverse-order addition (Appx. A), 80 seeds for each LM experiment (Sec. 3.1, Sec. 4.1)."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Hyperparameters are stated as 'largely adapted from (Zhou et al., 2024a)' but no search budget or justification for the chosen values is provided."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "The paper does not select a 'best' configuration; it characterizes full distributions across all configurations. Hyperparameters are adopted from prior work (Zhou et al., 2024a)."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "Statistical tests (Hartigan's Dip Test) are applied to specific, pre-planned comparisons rather than a battery of exploratory tests requiring correction."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "The paper is a characterization study, not a system comparison. There is no 'own system' being compared against baselines."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Performance is plotted as a function of parameter count (Figs. 1-7), and width/depth are independently varied to study performance-compute relationships."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper discusses what MMLU measures: 'Strong MMLU performance requires (1) natural language reasoning with domain knowledge and (2) producing answers in the required format. The latter drives emergent trends' (Sec. 3.1), citing Hu & Frank (2024)."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved in any experiment."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of temporal leakage for Qwen2.5 models evaluated on MMLU, despite MMLU predating Qwen2.5 training."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information. The MMLU training data is deliberately mixed into continued pretraining, which is by design, but potential feature leakage from pretraining is not addressed."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether MMLU examples share structural similarities with Qwen2.5's pretraining data."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Different random seeds can produce either smooth or emergent scaling trends for the same task with identical data and hyperparameters.",
    364       "evidence": "Fig. 1 shows seed 93 exhibits emergent scaling while seed 205 shows linear scaling on the count task. Breakthroughness and linearity metrics computed across seeds (Appx. B, Fig. 10).",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Breakthroughs result from bimodal performance distributions across random seeds, where the distribution shifts continuously with scale.",
    369       "evidence": "Fig. 2 shows bimodal histograms of exact match accuracy across 200 seeds at each scale for addition. Fig. 3 shows that while the mode jumps sharply, the mean, success probability, and mean of successful runs all change continuously.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Bimodal variation persists under continuous metrics (not just thresholded accuracy).",
    374       "evidence": "Fig. 4 shows continuous loss metric (Eq. 1) remains bimodal. Fig. 8 shows NLL is bimodal for MMLU and question formation. Hartigan's Dip Test confirms statistical significance (p < 0.001) in Sec. 3.3 and 4.2.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "The transition from unimodal to bimodal distributions is abrupt and occurs at a minimum capacity threshold, which appears well before most seeds exhibit breakthrough.",
    379       "evidence": "Fig. 4c shows Wasserstein-L2 distance drops sharply when depth reaches 3 layers or width reaches 2 heads, marking sudden bimodality emergence. This occurs before the mode breakthrough scale.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Bimodality can produce nonmonotonic (U-shaped) scaling trends, where inverse scaling is an artifact of success probability changes.",
    384       "evidence": "Fig. 14 shows U-shaped mean accuracy for count task when scaling width. The mean of successful runs remains monotonic, indicating the U-shape comes from changing success probability (Appx. C.2).",
    385       "supported": "moderate"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "Proxy for independent training runs",
    391       "detail": "For LM experiments, the paper reinitializes only the final attention layer and LM head of pretrained Qwen2.5 models rather than training from scratch. This is a proxy for independent training runs motivated by cost, but may not capture the full distribution of outcomes from truly independent pretraining. The paper acknowledges this but the generalization gap is substantial."
    392     },
    393     {
    394       "flag": "No contamination analysis for MMLU experiments",
    395       "detail": "Qwen2.5 was likely pretrained on MMLU data. While the paper reinitializes the top layer, the lower layers retain learned representations that may include MMLU knowledge. No contamination analysis is performed."
    396     },
    397     {
    398       "flag": "Small model scales only",
    399       "detail": "Claims about emergence are tested only on 0.5B and 1.5B parameter models and small synthetic Transformers. The paper's conclusions about emergence 'at scale' are extrapolations from small-scale observations."
    400     }
    401   ],
    402   "cited_papers": [
    403     {
    404       "title": "Are emergent abilities of large language models a mirage?",
    405       "authors": ["R. Schaeffer", "B. Miranda", "S. Koyejo"],
    406       "year": 2024,
    407       "relevance": "Core reference in the emergence debate; argues breakthroughs are metric thresholding artifacts, which this paper partially refutes."
    408     },
    409     {
    410       "title": "Emergent abilities of large language models",
    411       "authors": ["J. Wei", "Y. Tay", "R. Bommasani"],
    412       "year": 2022,
    413       "relevance": "Foundational paper on emergence in LLMs; this paper challenges its assumption that emergence is deterministic at specific scales."
    414     },
    415     {
    416       "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models",
    417       "authors": ["A. Srivastava"],
    418       "year": 2023,
    419       "relevance": "BIG-Bench benchmark; defines breakthroughness and linearity metrics used in this paper's analysis."
    420     },
    421     {
    422       "title": "Scaling laws for neural language models",
    423       "authors": ["J. Kaplan", "S. McCandlish", "T. Henighan"],
    424       "year": 2020,
    425       "arxiv_id": "2001.08361",
    426       "relevance": "Foundational scaling laws paper; this work shows scaling laws based on single runs miss important distributional variation."
    427     },
    428     {
    429       "title": "Quantifying variance in evaluation benchmarks",
    430       "authors": ["L. Madaan", "A. K. Singh", "R. Schaeffer"],
    431       "year": 2024,
    432       "relevance": "Documents that out-of-distribution performance varies widely across runs even at larger scales."
    433     },
    434     {
    435       "title": "Measuring massive multitask language understanding",
    436       "authors": ["D. Hendrycks", "C. Burns", "S. Basart"],
    437       "year": 2021,
    438       "arxiv_id": "2009.03300",
    439       "relevance": "MMLU benchmark used as the primary LM evaluation task in this paper."
    440     },
    441     {
    442       "title": "Fine-tuning pretrained language models: Weight initializations, data orders, and early stopping",
    443       "authors": ["J. Dodge", "G. Ilharco", "R. Schwartz"],
    444       "year": 2020,
    445       "relevance": "Shows weight initialization and data ordering contribute equally to performance variation."
    446     },
    447     {
    448       "title": "Predicting emergent capabilities by finetuning",
    449       "authors": ["C. Snell", "E. Wallace", "D. Klein", "S. Levine"],
    450       "year": 2024,
    451       "arxiv_id": "2411.16035",
    452       "relevance": "Shows smaller models may have capacity for emergent tasks but are limited by data scarcity; complementary to this paper's findings."
    453     },
    454     {
    455       "title": "Underspecification presents challenges for credibility in modern machine learning",
    456       "authors": ["A. D'Amour", "K. Heller", "D. Moldovan"],
    457       "year": 2022,
    458       "relevance": "Documents performance variation across stress tests, supporting the claim that random variation matters for model evaluation."
    459     },
    460     {
    461       "title": "Deep learning scaling is predictable, empirically",
    462       "authors": ["J. Hestness", "S. Narang", "N. Ardalani"],
    463       "year": 2017,
    464       "arxiv_id": "1712.00409",
    465       "relevance": "Early empirical scaling laws paper; this work shows those smooth laws miss distributional variation across seeds."
    466     }
    467   ]
    468 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs