scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (19404B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Evaluating the Robustness of Chinchilla Compute-Optimal Scaling",
      6     "authors": [
      7       "Rylan Schaeffer",
      8       "Noam Levi",
      9       "Andreas Kirsch",
     10       "Theo Guenais",
     11       "Brando Miranda",
     12       "Elyas Obbad",
     13       "Sanmi Koyejo"
     14     ],
     15     "year": 2025,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2509.23963",
     18     "doi": "10.48550/arXiv.2509.23963"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "All abstract claims are verified in the paper body: the 15.2% parameter discrepancy is demonstrated in Fig 1, robustness of scaling law fits across three interpretations is shown in Fig 2, and differential sensitivity to additive versus multiplicative perturbations is demonstrated in Figs 4-5.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claims about how each perturbation type affects fitted parameters are supported by controlled parameter perturbation experiments plus closed-form analytical derivations in Appendix C, which together provide adequate basis for causal inference in this mathematical/fitting context.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The analysis is explicitly based on re-fitting the original 50-model Chinchilla dataset, and the Future Directions paragraph notes extending to more recent scaling results as open work, implicitly bounding current conclusions to the original setup.",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Appendix C provides analytical explanations for why each perturbation type has its observed effect (e.g., why additive offsets break the power-law slope while multiplicative ones do not), offering theoretical grounding rather than leaving results unexplained.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper directly measures the scaling law fit parameters and compute-optimal tokens-per-parameter ratio, which are exactly the quantities the claims concern—no proxy gap exists.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "There is no dedicated limitations or threats-to-validity section; the Discussion only summarizes findings and lists future directions without acknowledging methodological limitations of the analysis.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No specific threats are discussed, such as restriction to the original 44M-16B parameter range, the assumption that the four perturbation types are representative of real-world errors, or the small number of training runs (50 models) used for fitting.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The Discussion frames conclusions broadly as guidance 'for the field' and 'for practitioners' without explicitly qualifying that they apply only within Chinchilla's original model range and training conditions.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No funding acknowledgment appears anywhere in the paper; only LLM usage is disclosed in Appendix A.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Author affiliations (Stanford University, EPFL) are clearly listed on the title page.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No funding source is disclosed, making funder independence unverifiable and this question inapplicable.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears in the paper.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms are precisely defined: the scaling law formula (Eq. 4), all three interpretations of model parameters (reported, standard formula, best-fit formula with explicit equations), the four perturbation types, and the compute-optimal tokens-per-parameter ratio are all formally specified.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper explicitly frames its contribution as answering whether 'practitioners can still confidently rely on Chinchilla's prescriptions' through a robustness and sensitivity analysis of the original Chinchilla methodology.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper substantively engages with Besiroglu et al. (2024), Porian et al. (2024), Pearce & Song (2024), and Zhang (2023), explicitly situating each prior critique and showing how this work relates to or extends their findings (e.g., comparing additive perturbation results to Porian et al.'s ˆα increase of 0.080).",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "theoretical": {
    122       "formal_quality": {
    123         "assumptions_stated_explicitly": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Appendix C explicitly states all modeling assumptions: the specific loss form, the compute approximation C ≈ cND, and the assumption ˆB ≈ B, ˆβ ≈ β for most perturbations with an explicit note that this is relaxed when necessary.",
    127           "source": "haiku"
    128         },
    129         "proofs_complete_or_sketched": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Appendix C.1 provides a complete baseline derivation, and C.2.1–C.2.3 provide full algebraic derivations for the multiplicative, additive, and systematic bias cases, with each step shown explicitly.",
    133           "source": "haiku"
    134         },
    135         "bounds_tight_or_discussed": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "For the systematic bias case, the derived relationship ˆα ∝ s^{-1} is empirically verified with R² > 0.999 (p ≈ 5.9 × 10^{-90}); other analytical predictions are confirmed by close agreement with bootstrapped empirical results.",
    139           "source": "haiku"
    140         },
    141         "counterexamples_explored": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "The paper explores edge cases including extreme multiplicative constants (0.001, 0.004) that cause NaN instabilities, large additive offsets approaching the smallest model size, and high log-normal noise levels where parameters become nearly unidentifiable.",
    145           "source": "haiku"
    146         },
    147         "notation_consistent": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Notation is consistent throughout: E, A, α, B, β for scaling law parameters; N/D/C for parameters/tokens/compute; tilde (˜) for perturbed quantities; hat (ˆ) for fitted estimates—no symbol overloading is observed.",
    151           "source": "haiku"
    152         },
    153         "constructive_vs_existence_noted": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "All theoretical results are constructive: explicit closed-form formulas are derived for how ˆα and ˆA change under each perturbation (e.g., ˆα = α/s for systematic bias, ˆA ≈ A·c_m^α for multiplicative), rather than mere existence claims.",
    157           "source": "haiku"
    158         }
    159       },
    160       "connections": {
    161         "connection_to_practice_discussed": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "The entire paper is motivated by practical guidance: the abstract and Discussion explicitly address whether the Chinchilla 20-to-1 prescription should be trusted when training large language models, providing direct practitioner-facing conclusions.",
    165           "source": "haiku"
    166         },
    167         "relationship_to_prior_work_clear": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Section 4 and Appendix D explicitly position this work relative to Besiroglu et al., Porian et al., and Pearce & Song, and the additive perturbation analysis is quantitatively compared to their empirical findings (e.g., ˆα increases of 0.080 and 0.231).",
    171           "source": "haiku"
    172         },
    173         "computational_complexity_discussed": {
    174           "applies": false,
    175           "answer": false,
    176           "justification": "The paper analyzes a parameter-fitting procedure for scaling laws, not an algorithm; computational complexity is not relevant to its theoretical contributions.",
    177           "source": "haiku"
    178         },
    179         "limitations_of_formal_model_stated": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "The formal model assumes C ≈ cND, a specific two-term power law loss form, and that the four perturbation types are representative—none of these modeling assumptions' limitations are explicitly discussed.",
    183           "source": "haiku"
    184         }
    185       }
    186     }
    187   },
    188   "claims": [
    189     {
    190       "claim": "Three interpretations of Chinchilla's model parameters are possible, with relative differences as high as 15.2%",
    191       "evidence": "Fig 1 shows all 50 reported model parameters disagree with the standard formula (avg 7.4%, max 15.2%); Table 1 gives a per-model comparison; a best-fit formula reduces but does not eliminate discrepancies.",
    192       "supported": "strong"
    193     },
    194     {
    195       "claim": "Key Chinchilla results (scaling law fit parameters and 20-to-1 tokens-per-parameter ratio) do not meaningfully change across all three parameter interpretations",
    196       "evidence": "Fig 2 shows overlapping bootstrapped confidence intervals for all five fit parameters and near-identical compute-optimal ratio curves across all three interpretations.",
    197       "supported": "strong"
    198     },
    199     {
    200       "claim": "Multiplicative parameter perturbations shift the compute-optimal ratio constant but preserve its flatness with respect to compute budget",
    201       "evidence": "Fig 5 top left shows flat lines at shifted levels across all multiplicative constants; Appendix C.2.1 derives analytically that the exponent on C is unchanged under multiplicative perturbation.",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "Additive constant perturbations linearly increase ˆα and exponentially increase ˆA, making the optimal tokens-per-parameter ratio non-flat",
    206       "evidence": "Fig 4 row 2 shows the empirical trend; Appendix C.2.2 derives the mechanism via the effective local slope N/(N+c_a), which is consistent with the additive embedding parameter findings of Porian et al. and Pearce & Song.",
    207       "supported": "strong"
    208     },
    209     {
    210       "claim": "Systematic bias perturbations cause ˆα to decay as ˆα ∝ s^{-1} with R² > 0.999",
    211       "evidence": "Section 3.3 reports the power-law fit with R² > 0.999 and p ≈ 5.9 × 10^{-90}; Appendix C.2.3 derives the exact relationship ˆα = α/s analytically.",
    212       "supported": "strong"
    213     },
    214     {
    215       "claim": "Chinchilla's compute-optimal prescription remains robust overall and can be relied upon by practitioners",
    216       "evidence": "All four perturbation analyses show the original results withstand sizable errors; the most plausible real-world error types (multiplicative, noise) have the least effect on the key results.",
    217       "supported": "moderate"
    218     }
    219   ],
    220   "methodology_tags": [
    221     "theoretical",
    222     "observational",
    223     "meta-analysis"
    224   ],
    225   "key_findings": "The paper reveals that Chinchilla's original analysis was ambiguous in its model parameter definitions, with three interpretations differing by up to 15.2%, yet all three produce virtually identical scaling law fits and the 20-to-1 tokens-per-parameter heuristic—with one interpretation yielding an even flatter relationship. A controlled sensitivity analysis across four perturbation types shows that multiplicative errors and random noise leave key results intact, while additive constants or systematic biases can qualitatively tilt the flat trend of the compute-optimal ratio. Appendix C provides closed-form analytical explanations for all observed behaviors, showing that additive perturbations break the power-law structure while multiplicative ones merely shift its constant. The authors conclude that Chinchilla's compute-optimal guidance remains a robust and trustworthy practical blueprint for scaling language models.",
    226   "red_flags": [
    227     {
    228       "flag": "No limitations section",
    229       "detail": "There is no dedicated limitations or threats-to-validity section; the Discussion only summarizes findings and lists future directions without acknowledging what the analysis cannot show."
    230     },
    231     {
    232       "flag": "No funding disclosure",
    233       "detail": "The paper discloses only LLM usage assistance (Appendix A) but never discloses funding sources, grants, or institutional support."
    234     },
    235     {
    236       "flag": "Scope limited to original Chinchilla dataset",
    237       "detail": "All analyses use the original 50-model Chinchilla training runs (44M-16B parameters); robustness conclusions may not generalize to modern training scales (100B+ parameters), different architectures, or non-Transformer models."
    238     },
    239     {
    240       "flag": "Broad conclusions from narrow scope",
    241       "detail": "The Discussion frames conclusions as guidance 'for the field' without qualifying that they apply only within the specific model range, loss function form, and perturbation types studied."
    242     }
    243   ],
    244   "cited_papers": [
    245     {
    246       "title": "An empirical analysis of compute-optimal large language model training (Hoffmann et al., 2022)",
    247       "relevance": "The foundational Chinchilla paper being analyzed; introduced the compute-optimal scaling principle and 20-to-1 tokens-per-parameter heuristic that this paper evaluates for robustness."
    248     },
    249     {
    250       "title": "Chinchilla scaling: A replication attempt (Besiroglu et al., 2024)",
    251       "relevance": "Prior replication identifying Chinchilla's three inconsistent approaches; this paper uses their fitting code to analyze the three parameter interpretations."
    252     },
    253     {
    254       "title": "Resolving discrepancies in compute-optimal scaling of language models (Porian et al., 2024)",
    255       "relevance": "Found that head parameters, warmup, and optimizer tuning explain Kaplan-Chinchilla differences; their ˆα increase of 0.080 from head parameters is quantitatively compared to the additive perturbation analysis here."
    256     },
    257     {
    258       "title": "Reconciling Kaplan and Chinchilla scaling laws (Pearce & Song, 2024)",
    259       "relevance": "Found embedding parameter inclusion increases ˆα by 0.231; their finding is directly compared to this paper's additive constant perturbation results as independent validation."
    260     },
    261     {
    262       "title": "Scaling laws for neural language models (Kaplan et al., 2020)",
    263       "relevance": "Established power-law scaling before Chinchilla and predicted different compute-optimal tradeoffs; understanding its divergence from Chinchilla motivates the robustness inquiry."
    264     },
    265     {
    266       "title": "Beyond Chinchilla-optimal: Accounting for inference in language model scaling laws (Sardana et al., 2024)",
    267       "relevance": "Extension of Chinchilla accounting for inference costs; cited as a natural target for future robustness analysis following this paper's methodology."
    268     },
    269     {
    270       "title": "Scaling data-constrained language models (Muennighoff et al., 2023)",
    271       "relevance": "Addresses scaling under data repetition constraints; cited as another extension direction where this paper's robustness methodology could be applied."
    272     },
    273     {
    274       "title": "Language models scale reliably with over-training and on downstream tasks (Gadre et al., 2024)",
    275       "relevance": "Studies over-training beyond Chinchilla-optimal compute; cited in Future Directions as a scaling result whose robustness warrants similar analysis."
    276     }
    277   ],
    278   "engagement_factors": {
    279     "practical_relevance": {
    280       "score": 3,
    281       "justification": "Directly answers whether practitioners should trust the widely-used Chinchilla 20-to-1 training recipe, with actionable guidance on which error types matter and which don't."
    282     },
    283     "surprise_contrarian": {
    284       "score": 2,
    285       "justification": "The finding that parameter discrepancies up to 15.2% don't affect Chinchilla's conclusions is counterintuitive and responds to widespread community doubts about the analysis."
    286     },
    287     "fear_safety": {
    288       "score": 0,
    289       "justification": "No AI safety or risk concerns; this is a purely technical analysis of scaling law robustness."
    290     },
    291     "drama_conflict": {
    292       "score": 1,
    293       "justification": "Responds to existing community controversy about Chinchilla's reliability raised by several prior papers, but is ultimately a confirmatory rather than controversial result."
    294     },
    295     "demo_ability": {
    296       "score": 1,
    297       "justification": "The analysis uses publicly available Besiroglu et al. fitting code and Chinchilla's public architectural hyperparameters, making replication feasible but not immediately interactive."
    298     },
    299     "brand_recognition": {
    300       "score": 2,
    301       "justification": "Stanford affiliation and Chinchilla scaling laws are both high-recognition signals in the LLM community; Rylan Schaeffer is known for prior scaling laws and emergent abilities work."
    302     }
    303   },
    304   "hn_data": {
    305     "threads": [
    306       {
    307         "hn_id": "45417771",
    308         "title": "What the F*ck Is Artificial General Intelligence?",
    309         "points": 59,
    310         "comments": 45,
    311         "url": "https://news.ycombinator.com/item?id=45417771",
    312         "created_at": "2025-09-29T19:31:22Z"
    313       },
    314       {
    315         "hn_id": "43622263",
    316         "title": "GIScience in the Era of Artificial Intelligence",
    317         "points": 1,
    318         "comments": 0,
    319         "url": "https://news.ycombinator.com/item?id=43622263",
    320         "created_at": "2025-04-08T14:33:08Z"
    321       },
    322       {
    323         "hn_id": "43548425",
    324         "title": "What the Fuck Is Artificial General Intelligence?",
    325         "points": 1,
    326         "comments": 0,
    327         "url": "https://news.ycombinator.com/item?id=43548425",
    328         "created_at": "2025-04-01T16:05:49Z"
    329       }
    330     ],
    331     "top_points": 59,
    332     "total_points": 61,
    333     "total_comments": 45
    334   }
    335 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs