scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25360B)
      1 {
      2   "paper": {
      3     "title": "Beyond Chinchilla-Optimal: Accounting for Inference in Language Model Scaling Laws",
      4     "authors": ["Nikhil Sardana", "Jacob Portes", "Sasha Doubov", "Jonathan Frankle"],
      5     "year": 2024,
      6     "venue": "ICML 2024 (Proceedings of the 41st International Conference on Machine Learning)",
      7     "arxiv_id": "2401.00448"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. The paper uses the MPT architecture and the Composer library but does not release the code for its analysis or experiments."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The training dataset is described only as 'trillions of tokens of general web text and code' (Section 3) without any download link or public release. The 47 trained model checkpoints are also not released."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using the MPT architecture and Composer library but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided. While Appendix C gives model training configurations (Table 4), there are no scripts, README, or explicit instructions for reproducing the experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No confidence intervals or error bars are reported for any results. Figures 2, 3, 5, and 6 show point estimates only. The paper acknowledges this indirectly by citing Besiroglu et al. (2024) noting that 'confidence intervals are quite wide for parametric function fitting' but does not report its own."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No statistical significance tests are performed. Claims like 'loss continues to decrease' and comparisons between compute-optimal and Chinchilla models are made by comparing raw values without any statistical testing."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Effect sizes are reported in context throughout: e.g., 'reduce their total compute by 1.7 x 10^22 FLOPs (17%)' (Figure 1), '28% by training a 13.6B model on 2.84x the data' (Section 2), '36% more than a cost-optimal model' (Section 6). Tables 2 and 3 provide compute reductions and cost savings with baselines."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper trains 47 models but does not justify this number. The authors acknowledge the limitation: 'our analysis only includes 47 separate experiments (Chinchilla included more than 400 experiments)' (Section 7) but provide no power analysis or justification for why 47 is sufficient."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported. Each of the 47 models appears to be a single training run. The 'smoothed final training loss over the last ten batches' (Appendix C) is used but this is noise reduction, not variance across runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Chinchilla-optimal models serve as the baseline throughout. The paper systematically compares its inference-adjusted optimal configurations against Chinchilla-style models in Figures 2 and 6 and Tables 2 and 3."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The Chinchilla scaling laws (Hoffmann et al., 2022) are the state-of-the-art scaling law methodology. The paper also references and compares against concurrent work by Gadre et al. (2024), De Vries (2023), and Villalobos & Atkinson (2023)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 5 (Parametric Fitting) performs an ablation on the curve fitting procedure, fitting parametric curves on successively larger subsets of training data (≤100, ≤250, ≤500 tokens/param, all data) to understand how data range affects coefficient estimation (Table 1, Figure 5)."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses both pre-training cross-entropy loss and the Evaluation Gauntlet Average (aggregating 20+ downstream tasks across 5 categories: World Knowledge, Commonsense Reasoning, Reading Comprehension, Language Understanding, Symbolic Problem Solving). Per-category breakdowns are provided in Appendix D."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a scaling law study evaluating model quality via loss and automated benchmarks. Human evaluation is not relevant to the claims about compute-optimal training configurations."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Evaluation is performed on standard held-out benchmarks (MMLU, ARC, HellaSwag, GSM8k, etc.) that are separate from the training data. The paper explicitly states 'we train for only a single epoch and do not repeat data' (Section 3)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Per-category Gauntlet results are provided in Appendix D (Figure 7), showing results for World Knowledge, Commonsense Reasoning, Reading Comprehension, Language Understanding, and Symbolic Problem Solving individually."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses where the approach breaks down: 'none of our parametric curves fit our 150M long-ratio training results well' (Section 5), and notes that 'for some categories (e.g. Symbolic Problem Solving), all models achieve nearly zero performance' (Appendix D)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper reports that the Chinchilla parametric function 'is not flexible enough to accurately model the behavior of both smaller (≤150M) and larger models' at extreme ratios (Section 5), and that scaling laws overestimate improvements from additional data at extreme ranges."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract's three main claims (1. models should be trained smaller and longer for high inference demand, 2. quality improves up to 10,000 tokens/parameter, 3. Chinchilla fitting overestimates at extreme ranges) are all supported by the corresponding results in Sections 2, 4, and 5 respectively."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper's causal claims are primarily mathematical derivations (modifying Chinchilla equations to include inference cost) and controlled experiments (training 47 models varying size and token ratio). The ablation in Section 5 systematically varies one factor (data range for fitting). These constitute adequate controlled designs for the causal claims made."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper explicitly bounds its claims: 'Further work is needed to show if this scales beyond 10,000 tokens per parameter, or at larger model sizes' (Section 8), 'Due to resource constraints, we do not collect data at the same scale as the Chinchilla paper—both in terms of model size (we only test up to 6B vs. 16B), and number of training runs (47 vs. 400)' (Section 5)."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations: it addresses De Vries' (2023) critical size hypothesis as an alternative view (Section 4), acknowledges that 'the best-fit constants vary based on the exact dataset and model architecture' (Section 2), and discusses that quality improvements in Llama models could be due to 'architecture modifications or data quality improvements' rather than more data (Section 5)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper specifies exact model configurations: MPT architecture with detailed specifications in Table 4 (parameter counts from 151M to 6.05B, d_model, n_heads, n_layers). These are models they trained themselves, not API-accessed models, so architecture specifications serve as the version."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "The paper does not use prompting. It evaluates pre-training loss and uses in-context learning evaluation from the Composer library on standard benchmarks. No prompt engineering is involved."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix C (Table 4) provides detailed hyperparameters: learning rates per model size, batch sizes, optimizer (Lion with β1=0.9, β2=0.95), weight decay, cosine warmup schedule (αf=0.1, duration=3x parameters), gradient clipping threshold=1, max sequence length=4096."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a scaling law study involving model training and evaluation, not an agentic system."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The training dataset is described only as 'trillions of tokens of general web text and code' (Section 3). No preprocessing, filtering, or data cleaning steps are documented. The specific dataset composition and preparation are not described."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "While there is no dedicated 'Limitations' section, substantive limitations are discussed throughout: Section 5 acknowledges fewer training runs than Chinchilla (47 vs. 400), Section 4 notes resource constraints limiting the sweep, and the Conclusion (Section 8) explicitly states what further work is needed."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats are discussed: the scaling law coefficients depend on exact dataset and architecture (Section 2), the parametric function may not be flexible enough at extreme ranges (Section 5), Besiroglu et al.'s finding that confidence intervals are implausibly narrow applies to their work too (Section 7), and the assumption that inference demand is independent of model size may not hold (footnote 1)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states scope boundaries: tested only up to 6B parameters (vs. 16B in Chinchilla), only 47 training runs (vs. 400), only single epoch training, results may not extend beyond 10,000 tokens/parameter, and the cost model 'leaves aside latency requirements' (Section 6)."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw training data, loss curves, or individual model checkpoints are made available. Only aggregated results are shown in figures and tables."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The data collection for the 47 training experiments is described: model sizes (150M to 6B), token ratios (10 to 10,000), architecture (MPT), and single-epoch training; the evaluation suite is fully enumerated in Section 3. The Gauntlet metrics and their computation are described."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The study trains language models and evaluates them on standard benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The pipeline from training data to final results is only partially documented. The training configuration is specified (Appendix C), but the training dataset composition, preprocessing, and the exact pipeline from raw data to the figures shown are not fully documented."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding sources are disclosed. The Acknowledgements section thanks specific individuals but does not mention grants or funding agencies."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are listed with their affiliation: 'Databricks MosaicML, United States of America' (page 1 footnote)."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The authors work at Databricks MosaicML, a company that provides LLM training infrastructure and services. They have a financial interest in scaling law research that could influence LLM training practices among their customers. This potential conflict is not acknowledged."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is included. The authors work at a company (Databricks MosaicML) that directly benefits from LLM training research, but no declaration is made."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper trains its own models from scratch but does not state when the training data was collected or any cutoff date for the 'trillions of tokens of general web text and code' dataset."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether any of the evaluation benchmarks (MMLU, ARC, HellaSwag, GSM8k, etc.) appear in the training data. Given the training data is described as 'general web text,' overlap with publicly available benchmarks is possible but not addressed."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Many of the evaluation benchmarks (MMLU published 2020, ARC 2018, HellaSwag 2019, etc.) were publicly available before the models were trained. No contamination analysis is discussed despite the training data being 'general web text.'"
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Inference cost is central to the paper. Section 6 and Table 3 report inference costs in both FLOPs and US dollars for various configurations, using specific GPU pricing (A100-40GB at $1.10/hr, A100-80GB at $1.50/hr from Lambda Labs)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "While the paper reports FLOPs for individual model configurations, the total computational budget used to train all 47 models is never stated. No total GPU hours, training time, or aggregate cost is reported for the experimental sweep."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "LLM practitioners expecting significant inference demand (~1B requests) should train models substantially smaller and longer than Chinchilla-optimal.",
    286       "evidence": "Section 2 and Figure 2 show that as inference demand increases, the compute-optimal model shifts toward smaller models trained on more data. Table 2 quantifies: for a 30B-quality model with 5T inference tokens, training a 16.4B model on 3.27T tokens reduces FLOPs by 16%.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Model quality continues to improve as tokens per parameter are scaled to extreme ranges (up to 10,000), with no evidence of a saturation point.",
    291       "evidence": "Figure 3a shows loss continuing to decrease for the 150M model up to 10,000 tokens/parameter. Figure 3b shows Gauntlet Average also improves. However, only the 150M model is tested at 10,000; larger models are tested only up to 250-1000 tokens/parameter (Section 3, Table 4).",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Chinchilla scaling laws fitted from typical token ratios overestimate the impact of additional tokens at extreme ranges.",
    296       "evidence": "Section 5 and Figure 5 show that parametric curves fitted to progressively larger data subsets become flatter (Table 1), indicating that extrapolation from typical ratios overestimates loss reduction at extreme ratios. However, 47 data points is acknowledged as limited vs. Chinchilla's 400.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "Real-world cost savings from inference-adjusted training are larger than pure compute savings due to MFU differences between training and inference.",
    301       "evidence": "Section 6 compares: for 2T inference tokens, a Chinchilla-70B model requires only 1.3% extra FLOPs but costs 36% more than a cost-optimal model, due to the 50x lower MFU of inference output tokens. Table 3 shows cost savings of 34-58% for various configurations.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Loss is an excellent predictor of downstream task performance.",
    306       "evidence": "Figure 3c shows tight correlation between pre-training loss and Gauntlet Average across all 47 models, with smaller loss decreases leading to larger downstream accuracy improvements at lower loss values.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["theoretical", "benchmark-eval"],
    311   "key_findings": "The paper modifies Chinchilla scaling laws to incorporate inference costs, showing that practitioners with high inference demand should train smaller models on proportionally more data. Training 47 models from 150M to 6B parameters at token ratios up to 10,000, they find no evidence of a saturation point where additional training data stops improving quality. The parametric curve fitting procedure from Chinchilla overestimates the benefits of additional data at extreme token-per-parameter ratios. Real-world cost analysis accounting for hardware utilization differences shows even larger savings (up to 58%) compared to the FLOP-only analysis.",
    312   "red_flags": [
    313     {
    314       "flag": "No error bars or uncertainty quantification",
    315       "detail": "All 47 training runs appear to be single-run experiments with no repeated runs, confidence intervals, or uncertainty estimates. The paper acknowledges that Besiroglu et al. (2024) found implausibly narrow confidence intervals in the Chinchilla paper, yet provides no uncertainty quantification for its own fitted coefficients."
    316     },
    317     {
    318       "flag": "Limited scale relative to claims",
    319       "detail": "The extreme token ratio experiments (10,000 tokens/param) are only conducted at the 150M parameter scale. Claims about the absence of saturation at larger scales rely on extrapolation. The 6B model is only tested at a ratio of 20 tokens/parameter."
    320     },
    321     {
    322       "flag": "Unreleased training data",
    323       "detail": "The training dataset is described only as 'trillions of tokens of general web text and code' with no further characterization, preventing independent verification of data quality or contamination effects."
    324     },
    325     {
    326       "flag": "Potential conflict of interest",
    327       "detail": "All authors are from Databricks MosaicML, a company selling LLM training infrastructure. Research showing how to train models more efficiently could directly benefit their business. This conflict is not disclosed."
    328     }
    329   ],
    330   "cited_papers": [
    331     {
    332       "title": "Training compute-optimal large language models",
    333       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    334       "year": 2022,
    335       "relevance": "The foundational Chinchilla scaling laws paper that this work extends to include inference costs."
    336     },
    337     {
    338       "title": "Scaling laws for neural language models",
    339       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan", "Tom B. Brown"],
    340       "year": 2020,
    341       "arxiv_id": "2001.08361",
    342       "relevance": "Earlier scaling law work establishing power-law relationships between model/data size and performance."
    343     },
    344     {
    345       "title": "Scaling data-constrained language models",
    346       "authors": ["Niklas Muennighoff", "Alexander M. Rush", "Boaz Barak"],
    347       "year": 2023,
    348       "relevance": "Adapts Chinchilla scaling laws for data-constrained regimes where training tokens must be repeated."
    349     },
    350     {
    351       "title": "Chinchilla scaling: A replication attempt",
    352       "authors": ["Tamay Besiroglu", "Ege Erdil", "Matthew Barnett", "Josh You"],
    353       "year": 2024,
    354       "arxiv_id": "2404.10102",
    355       "relevance": "Methodological critique of scaling law fitting showing implausibly narrow confidence intervals in original Chinchilla work."
    356     },
    357     {
    358       "title": "Language models scale reliably with over-training and on downstream tasks",
    359       "authors": ["Samir Yitzhak Gadre", "Georgios Smyrnis", "Vaishaal Shankar"],
    360       "year": 2024,
    361       "arxiv_id": "2403.08540",
    362       "relevance": "Concurrent work training 100 models at 1.4B-6.9B params with 20-640 tokens/parameter ratios, finding reliable scaling laws."
    363     },
    364     {
    365       "title": "LLaMA: Open and efficient foundation language models",
    366       "authors": ["Hugo Touvron"],
    367       "year": 2023,
    368       "relevance": "Influential open-source LLM trained beyond Chinchilla-optimal, motivating the study of inference-aware scaling."
    369     },
    370     {
    371       "title": "Llama 2: Open foundation and fine-tuned chat models",
    372       "authors": ["Hugo Touvron"],
    373       "year": 2023,
    374       "relevance": "Trained on 2 trillion tokens, further exemplifying the trend toward training smaller models on more data."
    375     },
    376     {
    377       "title": "Efficiently scaling transformer inference",
    378       "authors": ["Reiner Pope", "Sholto Douglas", "Aakanksha Chowdhery"],
    379       "year": 2022,
    380       "relevance": "Analyzes inference compute costs and MFU for transformers, directly relevant to the paper's cost model."
    381     },
    382     {
    383       "title": "Resolving discrepancies in compute-optimal scaling of language models",
    384       "authors": ["Tomer Porian", "Mitchell Wortsman", "Jenia Jitsev"],
    385       "year": 2024,
    386       "arxiv_id": "2406.19146",
    387       "relevance": "Investigates discrepancies between Kaplan and Chinchilla scaling laws, relevant to methodology of fitting scaling laws."
    388     },
    389     {
    390       "title": "Beyond neural scaling laws: beating power law scaling via data pruning",
    391       "authors": ["Ben Sorscher", "Robert Geirhos", "Shashank Shekhar"],
    392       "year": 2022,
    393       "relevance": "Proposes data pruning as an alternative to pure scaling, relevant to data efficiency in LLM training."
    394     }
    395   ]
    396 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs