scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (18062B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Evaluating the Robustness of Chinchilla Compute-Optimal Scaling",
      6     "authors": [
      7       "Rylan Schaeffer",
      8       "Noam Levi",
      9       "Andreas Kirsch",
     10       "Theo Guenais",
     11       "Brando Miranda"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2509.23963",
     16     "doi": "10.48550/arXiv.2509.23963"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims that key results are robust to parameter interpretation and structured perturbations, which is supported by Figs. 2, 4, 5. The abstract also notes sensitivity to additive/systematic errors, which is shown in Sections 3.2-3.3.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper makes causal claims about how perturbations affect scaling law fits. These are justified by controlled single-variable manipulation (each perturbation type varied independently) and mathematical derivations in Appendix C.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The paper bounds its claims to the specific Chinchilla dataset and scaling law formulation. The Future Directions section explicitly notes that extending to 'more recent scaling results with additional considerations' is future work.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper discusses alternative interpretations: the 'best fit formula' with factor 5 instead of 4 in attention parameters (Eqn. 3), and connects findings to Porian et al. and Pearce & Song's explanations for discrepancies between Chinchilla and Kaplan scaling laws.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper's claims match the granularity of its measurements. It measures scaling law fit parameters and tokens-per-parameter ratios under perturbations, and claims robustness of these specific quantities — no broader framing gap exists.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations section. The Discussion (Section 5) briefly mentions future directions but does not discuss limitations of the current analysis.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity are discussed. The paper does not address, e.g., whether the four perturbation types are exhaustive, whether the bootstrap methodology is appropriate for this data structure, or whether using Besiroglu et al.'s fitting code introduces assumptions.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The Future Directions paragraph explicitly states scope boundaries: 'One obvious next step is to evaluate the robustness of more recent scaling results with additional considerations such as inference constraints, data constraints and overtraining.' This acknowledges the analysis is limited to the original Chinchilla formulation.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source is disclosed. Authors are from Stanford University and EPFL but no grants or sponsorship are mentioned.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly listed: Stanford University and EPFL. The paper evaluates Chinchilla (DeepMind), and no authors are affiliated with DeepMind.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so independence cannot be assessed.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is provided.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are precisely defined: 'compute-optimal', 'reported model parameters', 'standard formula model parameters', and 'best fit formula model parameters' are each given explicit mathematical definitions (Eqns. 1–3).",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contribution is clearly stated in the introduction: to determine whether Chinchilla's prescriptions remain valid given parameter ambiguity, via both a three-interpretation comparison and a four-perturbation sensitivity analysis.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 4 explicitly distinguishes this work from Besiroglu et al. (2024), Porian et al. (2024), Pearce & Song (2024), and Zhang (2023), explaining how a robustness focus differs from prior replication/reconciliation approaches.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "theoretical": {
    120       "formal_quality": {
    121         "assumptions_stated_explicitly": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Assumptions are stated at the start of Appendix C.2: 'since the data D is unaffected, we assume B̂ ≈ B and β̂ ≈ β. We will break this assumption when necessary.' Each perturbation model is also explicitly defined.",
    125           "source": "haiku"
    126         },
    127         "proofs_complete_or_sketched": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Appendix C provides complete analytical derivations for all four perturbation types with full algebraic steps; no proofs are deferred or left to the reader.",
    131           "source": "haiku"
    132         },
    133         "bounds_tight_or_discussed": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "Where approximations are made (e.g., N ≫ |c_a| in C.2.2), the conditions are stated; the regime limitation is acknowledged as 'for the larger models in the study.'",
    137           "source": "haiku"
    138         },
    139         "counterexamples_explored": {
    140           "applies": true,
    141           "answer": true,
    142           "justification": "Edge cases are explicitly noted: multiplicative constants of 0.001 and 0.004 produced NaN fits, and log-normal noise with high σ made parameters 'nearly unidentifiable'—both identified and discussed in the text.",
    143           "source": "haiku"
    144         },
    145         "notation_consistent": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Notation is consistent throughout: N for model parameters, D for training tokens, C for compute budget, (E, A, α, B, β) for scaling law parameters, and tilde notation for perturbed quantities.",
    149           "source": "haiku"
    150         },
    151         "constructive_vs_existence_noted": {
    152           "applies": true,
    153           "answer": true,
    154           "justification": "All results are constructive—explicit formulas are derived for how fitted parameters change under each perturbation type (e.g., α̂ = α/s for systematic bias, Â ≈ A·c_m^α for multiplicative), not mere existence claims.",
    155           "source": "haiku"
    156         }
    157       },
    158       "connections": {
    159         "connection_to_practice_discussed": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper explicitly frames all results in terms of practitioner implications throughout, concluding that 'Chinchilla's compute-optimal prescription remains robust, further justifying its widespread use as a practical scaling blueprint.'",
    163           "source": "haiku"
    164         },
    165         "relationship_to_prior_work_clear": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Section 4 explicitly states: 'Like Besiroglu et al. and Porian et al., our work scrutinizes Chinchilla. However, our analyses focuses specifically on how robust the original methodology is to different perturbations.'",
    169           "source": "haiku"
    170         },
    171         "computational_complexity_discussed": {
    172           "applies": false,
    173           "answer": false,
    174           "justification": "Not applicable: all computations are simple nonlinear regression fits on 50 data points with 4000 bootstrap resamples; no novel algorithms with tractability concerns are introduced.",
    175           "source": "haiku"
    176         },
    177         "limitations_of_formal_model_stated": {
    178           "applies": true,
    179           "answer": false,
    180           "justification": "The paper does not discuss limitations of the L(N,D) = E + A/N^α + B/D^β formal model itself—e.g., that it ignores architecture differences, data quality, or that the 50-model sample may not span all relevant regimes.",
    181           "source": "haiku"
    182         }
    183       }
    184     }
    185   },
    186   "claims": [
    187     {
    188       "claim": "Three different interpretations of Chinchilla model parameters are possible, with relative differences as high as 15.2%.",
    189       "evidence": "Table 1 and Fig. 1 show 50/50 models disagree between standard formula and reported parameters (avg 7.4% error, max 15.2%); 44/50 match under the best-fit formula.",
    190       "supported": "strong"
    191     },
    192     {
    193       "claim": "Key Chinchilla results (scaling law parameters and 20-to-1 tokens-per-parameter ratio) do not meaningfully change across all three interpretations.",
    194       "evidence": "Fig. 2 shows overlapping bootstrap confidence intervals for all five fit parameters and near-constant D/N ≈ 20 across all three interpretations.",
    195       "supported": "strong"
    196     },
    197     {
    198       "claim": "Multiplicative constant perturbations preserve the flat trend of the compute-optimal tokens-per-parameter ratio while shifting only the overall constant.",
    199       "evidence": "Fig. 5 (top left) and Appendix C.2.1 show analytically and empirically that multiplicative scaling shifts the ratio by c_m^α but does not change its slope with compute.",
    200       "supported": "strong"
    201     },
    202     {
    203       "claim": "Additive constant and systematic bias perturbations can qualitatively alter the trend of the optimal tokens-per-parameter ratio.",
    204       "evidence": "Fig. 5 (top right and bottom left) show the ratio acquires a non-zero slope with compute; Appendix C.2.2–3 derive this analytically from the resulting change in α̂.",
    205       "supported": "strong"
    206     },
    207     {
    208       "claim": "Log-normal noise perturbations primarily increase uncertainty without systematically altering the flat trend of the compute-optimal ratio.",
    209       "evidence": "Fig. 5 (bottom right) shows substantially widened 80% confidence intervals and a higher-magnitude but approximately constant trend, with NaNs only at the highest noise levels.",
    210       "supported": "strong"
    211     },
    212     {
    213       "claim": "The standard formula model parameters yield a flatter compute-optimal trend (slope -0.572 per decade) than the reported parameters (slope -1.248).",
    214       "evidence": "Fig. 2 bottom row reports these slopes directly, but the authors explicitly caution 'uncertainty makes drawing strong conclusions difficult.'",
    215       "supported": "moderate"
    216     }
    217   ],
    218   "methodology_tags": [
    219     "theoretical"
    220   ],
    221   "key_findings": "Three different interpretations of the parameter counts of Chinchilla's 50 trained models—differing by up to 15.2%—produce essentially identical scaling law estimates and compute-optimal tokens-per-parameter ratios (~20), validating the '20-to-1' heuristic as robust to this source of ambiguity. A systematic sensitivity analysis of four structured perturbation types reveals that multiplicative and log-normal noise perturbations are largely benign (preserving the flat relationship between optimal ratio and compute budget), while additive constant and systematic bias perturbations can qualitatively alter the trend—but only under substantial distortions. Complete analytical derivations in the appendix explain the mechanism for each perturbation type, providing both practical reassurance and theoretical understanding of Chinchilla's robustness.",
    222   "red_flags": [
    223     {
    224       "flag": "No limitations section",
    225       "detail": "The paper lacks any discussion of limitations of its own analysis—only 50 Chinchilla models are used, perturbations are tested in isolation rather than in combination, and the four perturbation types are not argued to be exhaustive of real-world error modes."
    226     },
    227     {
    228       "flag": "No funding disclosure",
    229       "detail": "No funding source is disclosed anywhere in the paper, preventing assessment of potential conflicts of interest for work at Stanford and EPFL."
    230     },
    231     {
    232       "flag": "Approximation domain not verified",
    233       "detail": "The additive perturbation analysis assumes N ≫ |c_a|, but the smallest Chinchilla model has only 42M parameters while the perturbation sweep includes |c_a| up to 40M—putting several models near or below the approximation's validity boundary."
    234     }
    235   ],
    236   "cited_papers": [
    237     {
    238       "title": "An empirical analysis of compute-optimal large language model training (Chinchilla)",
    239       "relevance": "Primary paper being evaluated; introduced compute-optimal scaling and the 20-to-1 tokens-per-parameter heuristic that this work validates."
    240     },
    241     {
    242       "title": "Chinchilla scaling: A replication attempt (Besiroglu et al., 2024)",
    243       "relevance": "Investigated inconsistencies in Chinchilla's third fitting approach; provides the fitting code used in this paper's analyses."
    244     },
    245     {
    246       "title": "Resolving discrepancies in compute-optimal scaling of language models (Porian et al., 2024)",
    247       "relevance": "Identified three sources of discrepancy between Chinchilla and Kaplan scaling laws; found embedding parameters affected α̂ by 0.080, directly compared to this paper's additive perturbation results."
    248     },
    249     {
    250       "title": "Reconciling Kaplan and Chinchilla scaling laws (Pearce & Song, 2024)",
    251       "relevance": "Found that embedding parameter inclusion increased α̂ by 0.231, providing a real-world reference point for the additive perturbation analysis."
    252     },
    253     {
    254       "title": "Scaling laws for neural language models (Kaplan et al., 2020)",
    255       "relevance": "Established the foundational scaling law framework that Chinchilla built upon; the discrepancy between Kaplan and Chinchilla estimates motivates the perturbation analysis."
    256     },
    257     {
    258       "title": "Beyond Chinchilla-optimal: Accounting for inference in language model scaling laws (Sardana et al., 2024)",
    259       "relevance": "Extends Chinchilla with inference constraints; cited as a future direction where robustness analysis should be applied."
    260     },
    261     {
    262       "title": "Language models scale reliably with over-training and on downstream tasks (Gadre et al., 2024)",
    263       "relevance": "Examines overtraining regimes beyond Chinchilla-optimal; cited as another setting where the robustness analysis should be extended."
    264     }
    265   ],
    266   "engagement_factors": {
    267     "practical_relevance": {
    268       "score": 2,
    269       "justification": "Directly relevant to anyone training large language models, confirming that the Chinchilla 20-to-1 rule remains a reliable guide despite parameter measurement ambiguities."
    270     },
    271     "surprise_contrarian": {
    272       "score": 2,
    273       "justification": "The finding that up to 15.2% parameter disagreement leaves scaling law estimates essentially unchanged is counterintuitive and provides genuine field-wide reassurance."
    274     },
    275     "fear_safety": {
    276       "score": 0,
    277       "justification": "No AI safety or risk implications; purely a methodological validation of a compute-efficiency heuristic."
    278     },
    279     "drama_conflict": {
    280       "score": 1,
    281       "justification": "Responds to existing public criticism of Chinchilla (Zhang 2023 Twitter thread, Besiroglu 2024) but resolves rather than escalates the controversy."
    282     },
    283     "demo_ability": {
    284       "score": 0,
    285       "justification": "No interactive demo possible; requires access to Chinchilla's model parameter data and regression fitting code."
    286     },
    287     "brand_recognition": {
    288       "score": 1,
    289       "justification": "Stanford affiliation, and the lead author has prior high-visibility scaling-law work (the 'emergent abilities mirage' paper), but there is no major lab product association."
    290     }
    291   },
    292   "hn_data": {
    293     "threads": [
    294       {
    295         "hn_id": "45417771",
    296         "title": "What the F*ck Is Artificial General Intelligence?",
    297         "points": 59,
    298         "comments": 45,
    299         "url": "https://news.ycombinator.com/item?id=45417771",
    300         "created_at": "2025-09-29T19:31:22Z"
    301       },
    302       {
    303         "hn_id": "43622263",
    304         "title": "GIScience in the Era of Artificial Intelligence",
    305         "points": 1,
    306         "comments": 0,
    307         "url": "https://news.ycombinator.com/item?id=43622263",
    308         "created_at": "2025-04-08T14:33:08Z"
    309       },
    310       {
    311         "hn_id": "43548425",
    312         "title": "What the Fuck Is Artificial General Intelligence?",
    313         "points": 1,
    314         "comments": 0,
    315         "url": "https://news.ycombinator.com/item?id=43548425",
    316         "created_at": "2025-04-01T16:05:49Z"
    317       }
    318     ],
    319     "top_points": 59,
    320     "total_points": 61,
    321     "total_comments": 45
    322   }
    323 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs