scan.json (21043B)
1 { 2 "paper": { 3 "title": "Scaling Laws for Neural Language Models", 4 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan", "Tom B. Brown", "Benjamin Chess", "Rewon Child", "Scott Gray", "Alec Radford", "Jeffrey Wu", "Dario Amodei"], 5 "year": 2020, 6 "venue": "arXiv", 7 "arxiv_id": "2001.08361" 8 }, 9 "scan_version": 2, 10 "active_modules": [], 11 "methodology_tags": ["observational", "theoretical"], 12 "key_findings": "Language model cross-entropy loss follows power-law relationships with model size, dataset size, and compute budget, spanning seven orders of magnitude. Performance depends strongly on scale but weakly on architectural details like depth vs. width. Optimally compute-efficient training involves training very large models on relatively modest data and stopping well before convergence, with optimal model size scaling as N ∝ C^0.73. The paper derives a unified framework predicting overfit onset, optimal compute allocation, and learning curve shapes.", 13 "claims": [ 14 { 15 "claim": "Performance scales as a power law with model size N, dataset size D, and compute C, with trends spanning more than seven orders of magnitude.", 16 "evidence": "Figure 1 and Section 3 show power-law fits across 6+ orders of magnitude in N (768 to 1.5B parameters), 2+ orders in D (22M to 23B tokens), and 8 orders in compute. Exponents: αN ≈ 0.076, αD ≈ 0.095, αC^min ≈ 0.050.", 17 "supported": "strong" 18 }, 19 { 20 "claim": "Performance depends very weakly on architectural hyperparameters (depth, width, attention heads) when total non-embedding parameter count is fixed.", 21 "evidence": "Figure 5 shows loss varies only a few percent across a wide range of shapes at fixed N. Aspect ratio can vary by 40x with only ~3% impact. 
Section 3.1.", 22 "supported": "strong" 23 }, 24 { 25 "claim": "Optimally compute-efficient training allocates most additional compute to larger models rather than more training steps, with N ∝ C^0.73 and S ∝ C^0.03.", 26 "evidence": "Figure 14 and Section 6.1 show empirical fits of N(C_min) ∝ C^0.73 and S_min ∝ C^0.03. Theoretical predictions from L(N,S) match within a few percent (Section 6.2).", 27 "supported": "strong" 28 }, 29 { 30 "claim": "Dataset size need only grow sublinearly with model size (D ∝ N^0.74) to avoid overfitting.", 31 "evidence": "Equation 1.5 and Figure 9 show the combined L(N,D) fit with αN/αD ≈ 0.74. Equation 4.4 provides the practical threshold D ≳ 5000 × N^0.74.", 32 "supported": "strong" 33 }, 34 { 35 "claim": "Transfer to other data distributions improves in parallel with training distribution performance, with a roughly constant offset in loss.", 36 "evidence": "Figure 8 shows test loss on Books, Wikipedia, Common Crawl all follow the same power-law slope, offset from WebText2. Right panel shows generalization depends only on training loss, not training phase.", 37 "supported": "strong" 38 } 39 ], 40 "checklist": { 41 "artifacts": { 42 "code_released": { 43 "applies": true, 44 "answer": false, 45 "justification": "No code repository or URL is provided in the paper." 46 }, 47 "data_released": { 48 "applies": true, 49 "answer": false, 50 "justification": "WebText2 is not publicly released. The paper describes how it was constructed but does not provide a download link." 51 }, 52 "environment_specified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No environment specifications, dependency lists, or hardware details beyond 'memory constraints' for largest models are provided." 56 }, 57 "reproduction_instructions": { 58 "applies": true, 59 "answer": false, 60 "justification": "No reproduction instructions or scripts are provided. The paper describes methodology but not step-by-step reproduction procedures." 
61 } 62 }, 63 "statistical_methodology": { 64 "confidence_intervals_or_error_bars": { 65 "applies": true, 66 "answer": false, 67 "justification": "No confidence intervals or error bars are reported on the power-law fits or individual data points. The paper mentions run-to-run variation is ~0.05 in loss (Appendix D.6) but does not include error bars on main results." 68 }, 69 "significance_tests": { 70 "applies": true, 71 "answer": false, 72 "justification": "No statistical significance tests are performed. Claims of power-law scaling are supported by visual fits, not formal statistical testing." 73 }, 74 "effect_sizes_reported": { 75 "applies": true, 76 "answer": true, 77 "justification": "Effect sizes are reported as power-law exponents with clear context: e.g., 'doubling the number of parameters yields a loss smaller by a factor 2^−αN = 0.95' (Section 1.2). Exponents αN ≈ 0.076, αD ≈ 0.095, αC^min ≈ 0.050 quantify the magnitude of scaling effects." 78 }, 79 "sample_size_justified": { 80 "applies": true, 81 "answer": false, 82 "justification": "No justification for the number of model sizes, dataset sizes, or training runs chosen. The range is described but the number of data points for fitting power laws is not justified." 83 }, 84 "variance_reported": { 85 "applies": true, 86 "answer": false, 87 "justification": "Run-to-run variation is mentioned as ~0.05 in Appendix D.6 but no systematic variance reporting (std dev, multiple seeds) is provided for the main results." 88 } 89 }, 90 "evaluation_design": { 91 "baselines_included": { 92 "applies": true, 93 "answer": true, 94 "justification": "LSTMs and Universal Transformers are included as baselines for comparison (Section 3.2.1, Figure 7, Figure 17)." 95 }, 96 "baselines_contemporary": { 97 "applies": true, 98 "answer": true, 99 "justification": "LSTMs and Universal Transformers (2018) were contemporary and relevant architecture comparisons at the time of writing."
100 }, 101 "ablation_study": { 102 "applies": true, 103 "answer": true, 104 "justification": "Systematic ablation of architectural parameters: depth, width, attention heads, feed-forward dimension are varied independently at fixed N (Section 3.1, Figure 5)." 105 }, 106 "multiple_metrics": { 107 "applies": true, 108 "answer": false, 109 "justification": "The paper uses only cross-entropy loss as its metric throughout. No other metrics (e.g., perplexity in explicit form, downstream task performance) are reported." 110 }, 111 "human_evaluation": { 112 "applies": false, 113 "answer": false, 114 "justification": "Human evaluation is irrelevant to claims about scaling laws for loss metrics." 115 }, 116 "held_out_test_set": { 117 "applies": true, 118 "answer": true, 119 "justification": "A held-out test set of 6.6 × 10^8 tokens from WebText2 is reserved (Section 2.3). Results are reported on this test set." 120 }, 121 "per_category_breakdown": { 122 "applies": true, 123 "answer": true, 124 "justification": "Performance is broken down by: data distribution (Figure 8: WebText2, Books, Wikipedia, Common Crawl), token position in context (Figure 20-21), model architecture type (Figure 7, 17), and dataset size (Figure 9)." 125 }, 126 "failure_cases_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper discusses where fits break down: very small datasets (~2×10^7 tokens, Section 4.2), 1-layer models (Figure 13), extreme depth-to-width ratios (Figure 6), and the predicted eventual contradiction between scaling laws (Section 6.3)." 130 }, 131 "negative_results_reported": { 132 "applies": true, 133 "answer": true, 134 "justification": "Several negative/null results reported: architectural shape has minimal effect (Figure 5), LSTMs cannot match Transformer performance on later tokens (Figure 7), recurrent Transformers perform slightly worse per FLOP (Figure 17), convergence is compute-inefficient." 
135 } 136 }, 137 "claims_and_evidence": { 138 "abstract_claims_supported": { 139 "applies": true, 140 "answer": true, 141 "justification": "All abstract claims (power-law scaling, weak architecture dependence, overfitting governed by N/D ratio, optimal compute allocation favoring large models stopped before convergence) are supported by results in the paper." 142 }, 143 "causal_claims_justified": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper is careful to frame results as empirical observations and fits rather than causal claims. Ablation-style experiments (varying one factor at a time) support the limited causal claims about architecture independence. Language is appropriately hedged: 'we observe', 'we find', 'we conjecture'." 147 }, 148 "generalization_bounded": { 149 "applies": true, 150 "answer": true, 151 "justification": "Claims are bounded to Transformer language models trained on WebText2. Section 8 explicitly notes: 'we do not know which of our results depend on the structure of natural language data, and which are universal.' The paper conjectures but does not claim generalization to other domains." 152 }, 153 "alternative_explanations_discussed": { 154 "applies": true, 155 "answer": true, 156 "justification": "Section 6.3 discusses the contradiction between scaling laws and what it implies. Appendix C lists specific caveats including confounding from context length, potential hyperparameter sensitivity, and limitations of the compute estimate. The paper discusses multiple functional forms (logarithmic vs power-law, Figure 23)." 157 }, 158 "proxy_outcome_distinction": { 159 "applies": true, 160 "answer": true, 161 "justification": "The paper is explicit that it studies cross-entropy loss and does not claim this directly translates to task performance. Section 8 notes: 'it will be important to investigate whether continued improvement on the loss translates into improvement on relevant language tasks. 
Smooth quantitative change can mask major qualitative improvements.'" 162 } 163 }, 164 "setup_transparency": { 165 "model_versions_specified": { 166 "applies": false, 167 "answer": false, 168 "justification": "The paper trains its own models from scratch rather than evaluating pre-existing model versions. Architecture details (layers, dimensions) are fully specified." 169 }, 170 "prompts_provided": { 171 "applies": false, 172 "answer": false, 173 "justification": "The paper does not use prompting. Models are trained with an autoregressive language-modeling objective." 174 }, 175 "hyperparameters_reported": { 176 "applies": true, 177 "answer": true, 178 "justification": "Training hyperparameters are reported: Adam optimizer, 2.5×10^5 steps, batch size 512 sequences of 1024 tokens, learning rate schedule with 3000-step warmup + cosine decay (Section 2.2). The learning rate formula is given in Appendix D.6; the largest models use Adafactor." 179 }, 180 "scaffolding_described": { 181 "applies": false, 182 "answer": false, 183 "justification": "No agentic scaffolding is used." 184 }, 185 "data_preprocessing_documented": { 186 "applies": true, 187 "answer": true, 188 "justification": "Data preprocessing is described: Reddit outbound links with ≥3 karma, text extracted with Newspaper3k, reversible BPE tokenizer with n_vocab = 50257 (Section 2.3). Dataset size: 20.3M documents, 96 GB of text, 2.29×10^10 tokens." 189 } 190 }, 191 "limitations_and_scope": { 192 "limitations_section_present": { 193 "applies": true, 194 "answer": true, 195 "justification": "Appendix C ('Caveats') provides a dedicated section listing specific limitations. Section 8 (Discussion) also discusses limitations and open questions."
196 }, 197 "threats_to_validity_specific": { 198 "applies": true, 199 "answer": true, 200 "justification": "Appendix C lists specific threats: no theoretical understanding of scaling laws, uncertainty in B_crit extrapolation, poor fits for smallest datasets, compute estimate excludes context-dependent terms, potential neglected hyperparameters, learning rate sensitivity to target loss." 201 }, 202 "scope_boundaries_stated": { 203 "applies": true, 204 "answer": true, 205 "justification": "The paper explicitly bounds scope: results are for Transformer language models on WebText2. Section 8 states results may not apply to other domains. The paper notes 'we do not know which of our results depend on the structure of natural language data, and which are universal.'" 206 } 207 }, 208 "data_integrity": { 209 "raw_data_available": { 210 "applies": true, 211 "answer": false, 212 "justification": "Neither the WebText2 dataset nor the raw experimental results (individual training runs, loss curves) are made available." 213 }, 214 "data_collection_described": { 215 "applies": true, 216 "answer": true, 217 "justification": "Data collection is described: Reddit outbound links through October 2018, ≥3 karma threshold, Newspaper3k extraction (Section 2.3). Model training procedures are documented in Section 2.2." 218 }, 219 "recruitment_methods_described": { 220 "applies": false, 221 "answer": false, 222 "justification": "No human participants. Data source is web-scraped text." 223 }, 224 "data_pipeline_documented": { 225 "applies": true, 226 "answer": true, 227 "justification": "The pipeline from raw web text to tokens is documented: Reddit links → karma filter → Newspaper3k extraction → BPE tokenization → train/test split (Section 2.3). Model parameter counts and compute estimates are derived in Table 1." 228 } 229 }, 230 "conflicts_of_interest": { 231 "funding_disclosed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No funding source is disclosed. 
The work was done at OpenAI but no funding statement is provided." 235 }, 236 "affiliations_disclosed": { 237 "applies": true, 238 "answer": true, 239 "justification": "All authors list OpenAI as their affiliation (one co-affiliated with Johns Hopkins). Affiliations are prominently displayed." 240 }, 241 "funder_independent_of_outcome": { 242 "applies": true, 243 "answer": false, 244 "justification": "OpenAI has a direct commercial interest in scaling results — larger models being better supports their business strategy. The funder (OpenAI) is not independent of the outcome." 245 }, 246 "financial_interests_declared": { 247 "applies": true, 248 "answer": false, 249 "justification": "No competing interests or financial disclosure statement is provided." 250 } 251 }, 252 "contamination": { 253 "training_cutoff_stated": { 254 "applies": false, 255 "answer": false, 256 "justification": "The paper trains its own models from scratch on a known dataset. It does not evaluate a pre-trained model's capability on benchmarks." 257 }, 258 "train_test_overlap_discussed": { 259 "applies": false, 260 "answer": false, 261 "justification": "Not applicable — models are trained on WebText2 and evaluated on its held-out test set. No pre-trained model benchmark evaluation." 262 }, 263 "benchmark_contamination_addressed": { 264 "applies": false, 265 "answer": false, 266 "justification": "Not applicable — the paper does not use external benchmarks to evaluate pre-trained model capability." 267 } 268 }, 269 "human_studies": { 270 "pre_registered": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants." 274 }, 275 "irb_or_ethics_approval": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants." 279 }, 280 "demographics_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants." 
284 }, 285 "inclusion_exclusion_criteria": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants." 289 }, 290 "randomization_described": { 291 "applies": false, 292 "answer": false, 293 "justification": "No human participants." 294 }, 295 "blinding_described": { 296 "applies": false, 297 "answer": false, 298 "justification": "No human participants." 299 }, 300 "attrition_reported": { 301 "applies": false, 302 "answer": false, 303 "justification": "No human participants." 304 } 305 }, 306 "cost_and_practicality": { 307 "inference_cost_reported": { 308 "applies": true, 309 "answer": false, 310 "justification": "No inference cost or latency is reported, though the paper extensively discusses compute in PF-days for training." 311 }, 312 "compute_budget_stated": { 313 "applies": true, 314 "answer": true, 315 "justification": "Compute is extensively reported in PF-days throughout the paper. Individual training runs span from ~10^-8 to ~10^3 PF-days (Figures 1, 13, 14). The compute estimation formula C ≈ 6NBS is derived in Section 2.1." 316 } 317 } 318 }, 319 "red_flags": [ 320 { 321 "flag": "Company evaluating its own scaling paradigm", 322 "detail": "All authors are OpenAI employees. The conclusion that 'larger models will continue to perform better' directly supports OpenAI's commercial strategy of building increasingly large models. While the empirical methodology appears sound, the conflict is not disclosed." 323 }, 324 { 325 "flag": "No error bars or uncertainty quantification on fits", 326 "detail": "Power-law exponents are reported as point estimates (e.g., αN ≈ 0.076) without confidence intervals or goodness-of-fit statistics. Run-to-run variation of ~0.05 is mentioned in an appendix but not propagated to the main results. The claimed precision of exponents like '0.076' may not be warranted." 
327 }, 328 { 329 "flag": "Private dataset", 330 "detail": "WebText2 is not publicly available, making independent verification of the core results impossible. The original WebText was also not released. Results cannot be reproduced on the same data." 331 } 332 ], 333 "cited_papers": [ 334 { 335 "title": "Attention is all you need", 336 "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N Gomez", "Łukasz Kaiser", "Illia Polosukhin"], 337 "year": 2017, 338 "relevance": "Foundational Transformer architecture paper that this scaling study is built upon." 339 }, 340 { 341 "title": "Language models are unsupervised multitask learners", 342 "authors": ["Alec Radford", "Jeff Wu", "Rewon Child", "David Luan", "Dario Amodei", "Ilya Sutskever"], 343 "year": 2019, 344 "relevance": "GPT-2 paper; provides the WebText dataset and Transformer implementation used in this study." 345 }, 346 { 347 "title": "BERT: Pre-training of deep bidirectional transformers for language understanding", 348 "authors": ["Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"], 349 "year": 2018, 350 "arxiv_id": "1810.04805", 351 "relevance": "Major language model pretraining approach; context for understanding the scaling landscape." 352 }, 353 { 354 "title": "An empirical model of large-batch training", 355 "authors": ["Sam McCandlish", "Jared Kaplan", "Dario Amodei"], 356 "year": 2018, 357 "arxiv_id": "1812.06162", 358 "relevance": "Critical batch size theory used extensively in this paper's analysis of compute-efficient training." 359 }, 360 { 361 "title": "Deep learning scaling is predictable, empirically", 362 "authors": ["Joel Hestness", "Sharan Narang", "Newsha Ardalani", "Gregory Diamos"], 363 "year": 2017, 364 "arxiv_id": "1712.00409", 365 "relevance": "Prior work on scaling relationships between model size and data size; found super-linear scaling (contrasting this paper's sub-linear finding)." 
366 }, 367 { 368 "title": "EfficientNet: Rethinking model scaling for convolutional neural networks", 369 "authors": ["Mingxing Tan", "Quoc V. Le"], 370 "year": 2019, 371 "relevance": "Parallel scaling work in vision models; advocates different depth/width scaling than found here for language." 372 }, 373 { 374 "title": "A constructive prediction of the generalization error across scales", 375 "authors": ["Jonathan S. Rosenfeld", "Amir Rosenfeld", "Yonatan Belinkov", "Nir Shavit"], 376 "year": 2019, 377 "arxiv_id": "1909.12673", 378 "relevance": "Concurrent work making similar predictions for loss dependence on model and dataset size." 379 }, 380 { 381 "title": "Improving language understanding by generative pre-training", 382 "authors": ["Alec Radford", "Karthik Narasimhan", "Tim Salimans", "Ilya Sutskever"], 383 "year": 2018, 384 "relevance": "GPT-1 paper; establishes the decoder-only Transformer approach used in this scaling study." 385 } 386 ] 387 }