scan.json (21225B)
1 { 2 "paper": { 3 "title": "Evaluating the Robustness of Chinchilla Compute-Optimal Scaling", 4 "authors": ["Rylan Schaeffer", "Noam Levi", "Andreas Kirsch", "Theo Guenais", "Brando Miranda"], 5 "year": 2025, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2509.23963", 8 "doi": "10.48550/arXiv.2509.23963" 9 }, 10 "scan_version": 2, 11 "active_modules": [], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": false, 17 "justification": "No code repository or archive URL is provided in the paper. The paper mentions using 'Besiroglu et al. (2024)'s Chinchilla fitting code' but does not release their own perturbation analysis code." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The paper uses Chinchilla's publicly available Table A9 data (model parameters and architectural hyperparameters), which is fully reproduced in Appendix B (Table 2). All perturbation parameters are specified with exact sweep ranges." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "No environment specifications, dependency files, or library versions are provided." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No step-by-step reproduction instructions are provided. The methods are described at a mathematical level but there are no runnable scripts or README." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": true, 39 "justification": "Error bars from 4000 bootstrapped samples are reported for scaling law parameters (Fig. 2, Fig. 4). 80% confidence intervals are shown for compute-optimal tokens-per-parameter ratios (Fig. 2, Fig. 5)." 
40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "No formal significance tests are used to assess whether differences between parameter interpretations or perturbation effects are statistically significant. Comparisons rely on visual inspection of overlapping error bars." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper reports specific effect sizes: relative errors between parameter interpretations up to 15.2%, slopes of tokens-per-parameter ratio (-0.572, -1.049, -1.248 per decade), R² > 0.999 for power-law fit of α̂ under systematic bias." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "The analysis uses 50 models from Chinchilla's Table A9. No justification is given for whether this is sufficient for the sensitivity analyses performed." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Standard errors from 4000 bootstrapped samples are reported throughout (Figs. 2, 4). For log-normal noise perturbation, variance across random noise realizations is shown." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "The unperturbed Chinchilla results (reported model parameters, no perturbation) serve as the baseline against which all perturbations are compared." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "The paper engages with contemporary replication/re-evaluation studies: Besiroglu et al. (2024), Porian et al. (2024), Pearce & Song (2024), all recent work scrutinizing Chinchilla." 72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": true, 76 "justification": "The four perturbation types (multiplicative, additive, systematic bias, log-normal noise) function as an ablation/sensitivity analysis, systematically varying one factor at a time." 
77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "Results are evaluated on multiple metrics: five scaling law parameters (Ê, Â, α̂, B̂, β̂) and the compute-optimal tokens-per-parameter ratio." 82 }, 83 "human_evaluation": { 84 "applies": false, 85 "answer": false, 86 "justification": "This is a mathematical/analytical study of scaling laws. Human evaluation is not relevant." 87 }, 88 "held_out_test_set": { 89 "applies": false, 90 "answer": false, 91 "justification": "This is not a prediction task. The paper refits scaling laws to existing data under perturbations; there is no train/test split." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results are broken down per perturbation type (Sections 3.1-3.4), per scaling parameter (Fig. 4), and per perturbation strength (Figs. 3-5)." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper identifies cases where robustness breaks down: additive perturbations and systematic bias can 'qualitatively change the compute-optimal scaling strategy' (Section 3.2, 3.3). NaN results for extreme multiplicative constants are also noted." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper reports that additive and systematic bias perturbations DO affect the trend of the optimal ratio (making it less constant), which is a negative finding for robustness. NaN results at extremes are also reported." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The abstract claims that key results are robust to parameter interpretation and structured perturbations, which is supported by Figs. 2, 4, 5. The abstract also notes sensitivity to additive/systematic errors, which is shown in Sections 3.2-3.3." 
114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper makes causal claims about how perturbations affect scaling law fits. These are justified by controlled single-variable manipulation (each perturbation type varied independently) and mathematical derivations in Appendix C." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper bounds its claims to the specific Chinchilla dataset and scaling law formulation. The Future Directions section explicitly notes that extending to 'more recent scaling results with additional considerations' is future work." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper discusses alternative interpretations: the 'best fit formula' with factor 5 instead of 4 in attention parameters (Eqn. 3), and connects findings to Porian et al. and Pearce & Song's explanations for discrepancies between Chinchilla and Kaplan scaling laws." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper's claims match the granularity of its measurements. It measures scaling law fit parameters and tokens-per-parameter ratios under perturbations, and claims robustness of these specific quantities — no broader framing gap exists." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": false, 139 "answer": false, 140 "justification": "This paper does not use any LLM for experiments. It refits scaling laws using mathematical optimization on existing data." 141 }, 142 "prompts_provided": { 143 "applies": false, 144 "answer": false, 145 "justification": "No prompting is used. This is a mathematical/analytical study." 
146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "All perturbation sweep parameters are specified: multiplicative cm in logspace(-3, 3, num=11), additive ca in specified ranges, systematic bias s in logspace(-0.5, 0.5, 11), noise σ from 1e-2 to 1e2. Bootstrap uses 4000 samples." 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding is used." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "The data source is clearly documented: Chinchilla's Table A9 with 50 models. Three interpretations of model parameters are derived from architectural hyperparameters using Equations 1 and 3. The full table is reproduced in Appendix B." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": false, 167 "justification": "There is no dedicated limitations section. The Discussion (Section 5) briefly mentions future directions but does not discuss limitations of the current analysis." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": false, 172 "justification": "No specific threats to validity are discussed. The paper does not address, e.g., whether the four perturbation types are exhaustive, whether the bootstrap methodology is appropriate for this data structure, or whether using Besiroglu et al.'s fitting code introduces assumptions." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "The Future Directions paragraph explicitly states scope boundaries: 'One obvious next step is to evaluate the robustness of more recent scaling results with additional considerations such as inference constraints, data constraints and overtraining.' This acknowledges the analysis is limited to the original Chinchilla formulation." 
178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "The raw data (Chinchilla's Table A9) is fully reproduced in Appendix B (Table 2), including all 50 models' architectural hyperparameters and three interpretations of model parameters." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "The data comes from a single, clearly identified source: Hoffmann et al. (2022)'s Table A9. No new data was collected." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants. Data is from a published paper's model specifications." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "The pipeline is transparent: take Table A9 data → compute three parameter interpretations using Eqns. 1 and 3 → apply perturbations using Eqns. 6-9 → refit using Besiroglu et al.'s code. Each step is documented with equations." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding source is disclosed. Authors are from Stanford University and EPFL but no grants or sponsorship are mentioned." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are clearly listed: Stanford University and EPFL. The paper evaluates Chinchilla (DeepMind), and no authors are affiliated with DeepMind." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding is disclosed, so independence cannot be assessed." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is provided." 
222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": false, 227 "answer": false, 228 "justification": "This paper does not evaluate a pre-trained model on any benchmark. It refits scaling laws to published data." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": false, 232 "answer": false, 233 "justification": "No pre-trained model evaluation on benchmarks is performed." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": false, 237 "answer": false, 238 "justification": "No pre-trained model evaluation on benchmarks is performed." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "This is a theoretical/analytical paper. No inference costs are relevant." 283 }, 284 "compute_budget_stated": { 285 "applies": false, 286 "answer": false, 287 "justification": "This is a theoretical/analytical paper refitting scaling laws. Compute is negligible." 
288 } 289 } 290 }, 291 "claims": [ 292 { 293 "claim": "Three interpretations of Chinchilla's model parameters are possible, with relative differences as high as 15.2%.", 294 "evidence": "Table 1 and Fig. 1 show that the standard formula disagrees with reported parameters for 50/50 models, with average relative error of 7.4% and max 15.2%. The 'best fit' formula reduces discrepancies to 6/50 models.", 295 "supported": "strong" 296 }, 297 { 298 "claim": "Key Chinchilla results (scaling law parameters and ~20:1 tokens-per-parameter ratio) are robust to all three parameter interpretations.", 299 "evidence": "Fig. 2 shows the five fit parameters do not meaningfully differ across interpretations, and the compute-optimal ratio remains ~20 tokens per parameter. Standard formula yields the flattest slope (-0.572 per decade).", 300 "supported": "strong" 301 }, 302 { 303 "claim": "Multiplicative perturbations shift the tokens-per-parameter ratio but preserve the flat trend with compute budget.", 304 "evidence": "Fig. 4 Row 1 and Fig. 5 Top Left show Â scales as A·cm^α while α̂ remains stable. Theoretical derivation in Appendix C.2.1 confirms this.", 305 "supported": "strong" 306 }, 307 { 308 "claim": "Additive perturbations and systematic bias can qualitatively change the compute-optimal scaling strategy.", 309 "evidence": "Fig. 5 Top Right and Bottom Left show the tokens-per-parameter ratio becomes less constant, with positive/negative slopes depending on perturbation direction. Mathematical derivations in Appendix C.2.2-C.2.3 explain the mechanism.", 310 "supported": "strong" 311 }, 312 { 313 "claim": "Overall, Chinchilla's key results withstand sizable perturbations, providing renewed confidence in compute-optimal scaling.", 314 "evidence": "Across all four perturbation types, the 20:1 ratio and scaling law form are preserved for moderate perturbations. 
Only extreme additive/systematic perturbations alter the qualitative picture.", 315 "supported": "moderate" 316 } 317 ], 318 "methodology_tags": ["theoretical"], 319 "key_findings": "The paper discovers that three possible interpretations of Chinchilla's model parameters exist with discrepancies up to 15.2%, but none meaningfully affect the scaling law estimates or the ~20:1 compute-optimal tokens-per-parameter ratio. Through four structured perturbation analyses (multiplicative, additive, systematic bias, log-normal noise), they show that multiplicative errors and random noise preserve robustness while additive constants and systematic biases can alter the trend of the optimal ratio with compute budget. The work provides mathematical derivations explaining each perturbation's effects and concludes that Chinchilla's compute-optimal prescription remains a durable guide.", 320 "red_flags": [ 321 { 322 "flag": "Confirmatory framing", 323 "detail": "The paper's stated goal is to provide 'renewed confidence' in Chinchilla, and the framing consistently emphasizes robustness. The finding that additive and systematic perturbations DO qualitatively change results is somewhat downplayed relative to the robustness narrative. The abstract says 'the answer is yes' to whether practitioners can rely on Chinchilla, but the sensitivity to additive perturbations (which correspond to the realistic scenario of including/excluding embedding parameters) suggests the answer is more nuanced." 324 }, 325 { 326 "flag": "No limitations section", 327 "detail": "The paper lacks any discussion of limitations: whether the four perturbation types cover the space of realistic errors, whether the 50 Chinchilla models provide sufficient statistical power, or whether the bootstrap methodology is appropriate given the structured (non-i.i.d.) nature of the model configurations." 
328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "An empirical analysis of compute-optimal large language model training", 333 "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"], 334 "year": 2022, 335 "relevance": "The foundational Chinchilla paper whose robustness is the subject of this study." 336 }, 337 { 338 "title": "Scaling laws for neural language models", 339 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 340 "year": 2020, 341 "arxiv_id": "2001.08361", 342 "relevance": "Earlier scaling law work whose discrepancies with Chinchilla motivated this robustness analysis." 343 }, 344 { 345 "title": "Chinchilla scaling: A replication attempt", 346 "authors": ["Tamay Besiroglu", "Ege Erdil", "Matthew Barnett", "Josh You"], 347 "year": 2024, 348 "arxiv_id": "2404.10102", 349 "relevance": "Key replication study that provided the fitting code used in this paper and identified inconsistencies in Chinchilla's three approaches." 350 }, 351 { 352 "title": "Resolving discrepancies in compute-optimal scaling of language models", 353 "authors": ["Tomer Porian", "Mitchell Wortsman", "Jenia Jitsev"], 354 "year": 2024, 355 "relevance": "Identified three factors resolving Chinchilla-Kaplan discrepancies; this paper's additive perturbation analysis relates to their finding about head parameters." 356 }, 357 { 358 "title": "Reconciling Kaplan and Chinchilla scaling laws", 359 "authors": ["Tim Pearce", "Jinyeop Song"], 360 "year": 2024, 361 "relevance": "Found embedding parameter inclusion explains much of the Chinchilla-Kaplan discrepancy, directly related to this paper's additive perturbation analysis." 362 }, 363 { 364 "title": "Beyond chinchilla-optimal: Accounting for inference in language model scaling laws", 365 "authors": ["Nikhil Sardana", "Jacob Portes", "Sasha Doubov", "Jonathan Frankle"], 366 "year": 2024, 367 "relevance": "Extends Chinchilla scaling to account for inference costs, identified as a future direction in this paper." 
368 }, 369 { 370 "title": "Scaling data-constrained language models", 371 "authors": ["Niklas Muennighoff"], 372 "year": 2023, 373 "relevance": "Studied scaling under data constraints, another future direction identified in this paper." 374 }, 375 { 376 "title": "Language models scale reliably with over-training and on downstream tasks", 377 "authors": ["Samir Yitzhak Gadre"], 378 "year": 2024, 379 "arxiv_id": "2403.08540", 380 "relevance": "Studied scaling with overtraining, mentioned as a future extension of robustness analysis." 381 }, 382 { 383 "title": "Are emergent abilities of large language models a mirage?", 384 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 385 "year": 2023, 386 "relevance": "Prior work by the same authors on scaling phenomena, relevant to understanding LLM capability scaling." 387 }, 388 { 389 "title": "Scaling laws for precision", 390 "authors": ["Tanishq Kumar"], 391 "year": 2024, 392 "arxiv_id": "2411.04330", 393 "relevance": "Studies how quantization precision interacts with scaling laws, extending the Chinchilla framework." 394 } 395 ] 396 }