scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24857B)
      1 {
      2   "paper": {
      3     "title": "Pretraining Scaling Laws for Generative Evaluations of Language Models",
      4     "authors": ["Rylan Schaeffer", "Noam Levi", "Brando Miranda", "Sanmi Koyejo"],
      5     "year": 2025,
      6     "venue": "ICLR 2026",
      7     "arxiv_id": "2509.24012",
      8     "doi": "10.48550/arXiv.2509.24012"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "theoretical"],
     13   "key_findings": "The paper proposes three pretraining scaling laws for pass-at-k on generative evaluations (GSM8K, MATH) using (1) compute, (2) parameters+tokens, and (3) gold reference likelihoods. The number of attempts k acts as a hyperparameter that modulates scaling behavior, eliminating irreducible error and steepening power laws. The gold reference likelihood law is uniquely stable across ~5 orders of magnitude of compute, compared to ~1.5-2.5 for the other laws. The compute scaling law is proven to be the compute-optimal envelope of the parameters+tokens law.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository link is provided in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses the publicly available Pythia model family, GSM8K, and MATH benchmarks. The sampling data (~500M GSM8K, ~400M MATH samples) is not explicitly released, but the underlying benchmarks and models are public."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions using vLLM for sampling (Appendix C) but provides no environment specification, requirements file, or dependency versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the approach but not how to replicate the experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No confidence intervals or error bars are reported on the scaling law fits or predictions. Figures show point estimates of fitted curves without uncertainty bands."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper compares three scaling laws' predictive performance but does not apply statistical significance tests to determine whether differences are meaningful."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports concrete effect sizes: e.g., compute exponent α(k) rising from 0.121 to 0.375, parameter stability within ~1.5-2.5 vs ~5 orders of magnitude, and relative prediction errors (Figs 3, 5, 7)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 2 justifies using 128 problems from each benchmark and explains the compute-aware sampling strategy (Appendix B), noting 2^14 minimum samples per problem to ensure resolution for k up to 10^4."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance or standard deviation is reported across experimental runs. Results appear to be from single fits without reporting stability across different random subsets of problems."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The three scaling laws serve as mutual baselines, compared against each other for fit quality and predictive performance."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The scaling law formulations are grounded in the most relevant prior work: GPT-4 Technical Report (2024), Hoffmann et al. (2022), and Schaeffer et al. (2024b)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper systematically varies k across five orders of magnitude and examines how each scaling law parameter changes, effectively ablating the role of the attempt hyperparameter."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper evaluates fit quality, backtesting relative error, parameter stability across compute horizons, and compares across two benchmarks (GSM8K, MATH)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant for a scaling law study on automated benchmarks."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The backtesting methodology (Sec 3.2) explicitly holds out the most expensive model (Pythia 12B at 300B tokens) as the prediction target, fitting only on cheaper models."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by k value (1, 10, 100, 1000, 10000) and by benchmark (GSM8K vs MATH in Section 7 and Appendix F)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses where scaling laws fail: unreliable extrapolation beyond ~2 orders of magnitude (Sec 3.2), the parameters+tokens law's large error on the most expensive checkpoint (Sec 4.1), and instability of MATH parameters (Sec 7)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that the parameters+tokens law, despite a better in-range fit, does not improve extrapolation (Sec 4.2), and that the gold reference law worsens for large k (Sec 5.2)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "All abstract claims are supported: k modulates scaling behavior (Figs 1-2), parameter stability differences (Figs 3,5,7), comparable predictive performance (Figs 3,5,7), and the theoretical envelope connection (Sec 6, Appendix E)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper's causal claims are primarily mathematical (the compute law IS the envelope of the N+D law, proved in Sec 6/Appendix E) or descriptive (k shapes scaling behavior). The theoretical derivation provides adequate justification."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The Limitations section (Sec 8) explicitly states the primary limitation is 'its empirical focus on a single model family (Pythia)' and that they 'focus exclusively on pass-at-k metrics.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 7 discusses how benchmark hardness affects scaling behavior. Section 8 discusses that other sampling algorithms and temperatures could affect results. The paper acknowledges it's unclear what determines stability differences across benchmarks."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper carefully distinguishes pass-at-k from accuracy (Appendix A, three explicit differences listed) and frames its claims specifically in terms of pass-at-k, not broader 'model capability.'"
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper uses Pythia model family (Biderman et al., 2023), specifying 8 models from 14M to 12B parameters, non-deduplicated variants, with specific checkpoint counts."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper evaluates models via generative sampling on math problems, not through prompting with system instructions. The problems themselves are from public benchmarks."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Temperature τ=1.0 is stated (Sec 2). Sampling strategy (temperature-only, no top-p/top-k) is specified. Minimum 2^14 samples per problem, maximum 2^15 (Appendix B)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper evaluates raw model sampling."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 2 documents: randomly selected 128 problems from each benchmark, excluded overtrained checkpoints per Godey et al. (2024), compute approximated as C≈6ND, and sampling strategy detailed in Appendix B."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8 (Discussion) contains a dedicated 'Limitations' paragraph discussing single model family and single metric focus."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The paper identifies specific threats: single model family constraint (Sec 8), using intermediate checkpoints in lieu of fully trained models (Sec 2), potential impact of the 128-problem subset (Sec 2), and excluded overtrained checkpoints (Sec 2)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states scope: 'focusing on a particular setting: benchmarks with verifiable binary rewards, with multiple attempts per problem, with performance scored using the pass-at-k metric' (Sec 1). Limitations in Sec 8 state focus on 'a single model family' and 'pass-at-k metrics.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The ~500M GSM8K and ~400M MATH samples are not released for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 2 and Appendix B describe the sampling procedure in detail: model family, number of samples per problem (min 2^14, max 2^15, early stopping at 10 successes), vLLM with prefix caching (Appendix C)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard benchmarks (GSM8K, MATH) and a public model family (Pythia)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: select 128 problems → sample from each Pythia checkpoint → apply unbiased pass@k estimator (Eq. 1-2) → fit scaling laws → backtest. Appendix B shows exact sample counts per model per problem."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section lists: Stanford Data Science, OpenAI Superalignment Fast Grant (RS), EPFL AI4science/AI Center (NL), Schmidt Sciences, Stanford EDGE Scholar Fellowship (BM), NSF, MacArthur Foundation, Stanford HAI, OpenAI, Google (SK)."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Stanford (RS, BM, SK) and EPFL (NL). No product of these institutions is being evaluated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The paper evaluates the open-source Pythia model family, not products of any funder. While OpenAI and Google fund some authors, the evaluated models are from EleutherAI, making the funders independent of the outcome."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "Pythia was trained on The Pile (Gao et al., 2020), and the paper specifies up to 300B tokens. The Pile's composition is public, establishing an implicit cutoff."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper does not discuss whether GSM8K (2021) or MATH (2021) problems appeared in The Pile training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "GSM8K and MATH were published in 2021. The Pile was assembled around the same time. No discussion of whether benchmark solutions leaked into training data."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Appendix C provides detailed cost estimates: A100-seconds per 128 samples by model size (Tables 1-2), total lower bound of 81K A100-hours for GSM8K, and financial cost >$112K at current cloud pricing."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Appendix C states the total compute: lower bound of 81K A100-hours for GSM8K, with per-model breakdowns in Tables 1-2. Total samples: ~500M for GSM8K, ~400M for MATH."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No sensitivity analysis across random seeds is reported. The 128-problem subset is randomly selected, but no analysis is provided of how different subsets would affect results."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Appendix B details exact sample counts per model per problem (Figs 9-10), with minimum 2^14 and maximum 2^15 samples per problem."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": false,
    305         "answer": false,
    306         "justification": "The scaling law fits use standard nonlinear regression; no hyperparameter search is involved in the methodology."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The three functional forms are motivated by prior work (GPT-4 report, Hoffmann et al., Schaeffer et al.) and all three are reported rather than selecting only the best."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors propose all three scaling laws and evaluate them without acknowledging self-comparison bias or having independent evaluation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "The entire paper is about performance as a function of compute budget. Figures 1, 4, 6 and the backtesting analysis explicitly show performance vs compute."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses GSM8K and MATH without discussing whether pass-at-k on these specific benchmarks validly measures the generative capabilities implied by the 'Generative Evaluations' framing in the title."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is used. Models are evaluated via direct sampling."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "GSM8K and MATH were published in 2021, around the same time as The Pile. No discussion of whether benchmark solutions appeared in training data."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup introduces information leakage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether train and test data share structural similarities."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "The number of attempts per problem k acts as a hyperparameter that modulates scaling law parameters: irreducible error E0(k) vanishes exponentially, compute exponent α(k) increases from ~0.121 to ~0.375 as k goes from 1 to 10^4.",
    365       "evidence": "Figures 1-2 and Section 3.1 show fitted scaling law parameters as functions of k across GSM8K.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "The gold reference likelihood scaling law has uniquely stable parameters, converging across ~5 orders of magnitude of compute, compared to ~1.5-2.5 for the other two laws.",
    370       "evidence": "Figure 7 (the three panels other than the top-left) shows parameter stability across backtesting horizons, compared to Figures 3 and 5 for the other laws.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "All three scaling laws perform comparably in predicting the most expensive model's performance, though the compute law is slightly worse for small k and the gold reference law slightly worse for large k.",
    375       "evidence": "Top-left panels of Figures 3, 5, and 7 show relative prediction errors across k values.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The compute scaling law is the compute-optimal envelope of the parameters+tokens scaling law, with α(k) = (1/β(k) + 1/γ(k))^{-1}.",
    380       "evidence": "Section 6 and Appendix E provide the full mathematical derivation with proof.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "MATH retains significant irreducible error even at k=10^4 (E0≈0.45), quantifying that it is a harder benchmark than GSM8K, where E0 vanishes by k≈100.",
    385       "evidence": "Section 7 and Appendix F (Figure 14) compare GSM8K and MATH scaling behaviors.",
    386       "supported": "strong"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Single model family",
    392       "detail": "All empirical results are from Pythia (14M-12B parameters). The authors acknowledge this but it severely limits generalizability — scaling laws may not hold for differently architected or differently trained model families."
    393     },
    394     {
    395       "flag": "No contamination analysis",
    396       "detail": "GSM8K and MATH were published around the same time The Pile was assembled. If benchmark solutions leaked into training data, the scaling law fits could be distorted, particularly for larger models that memorize more."
    397     },
    398     {
    399       "flag": "Small benchmark subset",
    400       "detail": "Only 128 problems from each benchmark were used (out of 8,500 GSM8K and 12,500 MATH). The random subset could affect scaling law parameters, and no sensitivity analysis is performed."
    401     }
    402   ],
    403   "cited_papers": [
    404     {
    405       "title": "Evaluating large language models trained on code",
    406       "authors": ["Mark Chen", "Jerry Tworek"],
    407       "year": 2021,
    408       "arxiv_id": "2107.03374",
    409       "relevance": "Introduced pass-at-k metric and the unbiased estimator used in this paper; foundational for code generation evaluation."
    410     },
    411     {
    412       "title": "Training compute-optimal large language models",
    413       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud"],
    414       "year": 2022,
    415       "arxiv_id": "2203.15556",
    416       "relevance": "Chinchilla scaling laws for compute-optimal training; the parameters+tokens law in this paper extends Chinchilla to generative evaluation."
    417     },
    418     {
    419       "title": "Scaling laws for neural language models",
    420       "authors": ["Jared Kaplan", "Sam McCandlish"],
    421       "year": 2020,
    422       "arxiv_id": "2001.08361",
    423       "relevance": "Foundational neural scaling laws paper; this work extends scaling laws from pretraining loss to generative benchmarks."
    424     },
    425     {
    426       "title": "Are emergent abilities of large language models a mirage?",
    427       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    428       "year": 2023,
    429       "relevance": "Challenged emergent abilities claims by showing they arise from metric choice; directly relevant to understanding benchmark evaluation methodology."
    430     },
    431     {
    432       "title": "Language models scale reliably with over-training and on downstream tasks",
    433       "authors": ["Samir Yitzhak Gadre"],
    434       "year": 2024,
    435       "arxiv_id": "2403.08540",
    436       "relevance": "Studied scaling on 46 downstream tasks (none generative); this paper fills the generative evaluation gap."
    437     },
    438     {
    439       "title": "Large language monkeys: Scaling inference compute with repeated sampling",
    440       "authors": ["Bradley Brown", "Jordan Juravsky"],
    441       "year": 2024,
    442       "arxiv_id": "2407.21787",
    443       "relevance": "Studied repeated sampling as inference-time compute scaling strategy; directly related to the pass-at-k scaling studied here."
    444     },
    445     {
    446       "title": "Scaling llm test-time compute optimally can be more effective than scaling model parameters",
    447       "authors": ["Charlie Snell", "Jaehoon Lee"],
    448       "year": 2024,
    449       "arxiv_id": "2408.03314",
    450       "relevance": "Inference-time compute scaling; complementary to pretraining scaling laws for generative tasks."
    451     },
    452     {
    453       "title": "Codemonkeys: Scaling test-time compute for software engineering",
    454       "authors": ["Ryan Ehrlich", "Bradley Brown"],
    455       "year": 2025,
    456       "arxiv_id": "2501.14723",
    457       "relevance": "Applied repeated sampling scaling to software engineering tasks; extends pass-at-k to practical code generation."
    458     },
    459     {
    460       "title": "GPT-4 technical report",
    461       "authors": ["OpenAI"],
    462       "year": 2024,
    463       "arxiv_id": "2303.08774",
    464       "relevance": "Pioneered scaling law prediction for generative evaluations (HumanEval); the compute law in this paper adopts their functional form."
    465     },
    466     {
    467       "title": "Predicting emergent abilities with infinite resolution evaluation",
    468       "authors": ["Shengding Hu", "Xin Liu"],
    469       "year": 2024,
    470       "arxiv_id": "2310.03262",
    471       "relevance": "Infinite resolution sampling for emergent abilities; philosophically related approach to understanding scaling via finer-grained evaluation."
    472     },
    473     {
    474       "title": "Pythia: A suite for analyzing large language models across training and scaling",
    475       "authors": ["Stella Biderman"],
    476       "year": 2023,
    477       "relevance": "The model family used for all experiments in this paper; enables dense checkpoint analysis across model sizes."
    478     }
    479   ]
    480 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs