scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27882B)
      1 {
      2   "paper": {
      3     "title": "Training Compute-Optimal Large Language Models",
      4     "authors": [
      5       "Jordan Hoffmann",
      6       "Sebastian Borgeaud",
      7       "Arthur Mensch",
      8       "Elena Buchatskaya",
      9       "Trevor Cai",
     10       "Eliza Rutherford",
     11       "Diego de Las Casas",
     12       "Lisa Anne Hendricks",
     13       "Johannes Welbl",
     14       "Aidan Clark",
     15       "Tom Hennigan",
     16       "Eric Noland",
     17       "Katie Millican",
     18       "George van den Driessche",
     19       "Bogdan Damoc",
     20       "Aurelia Guy",
     21       "Simon Osindero",
     22       "Karen Simonyan",
     23       "Erich Elsen",
     24       "Jack W. Rae",
     25       "Oriol Vinyals",
     26       "Laurent Sifre"
     27     ],
     28     "year": 2022,
     29     "venue": "NeurIPS 2022",
     30     "arxiv_id": "2203.15556"
     31   },
     32   "checklist": {
     33     "artifacts": {
     34       "code_released": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No source code repository or link is provided anywhere in the paper. The paper describes training procedures in detail but does not release any code."
     38       },
     39       "data_released": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The training data (MassiveText) is not publicly released. The model card states 'We will not make this model available publicly.' Standard evaluation benchmarks (MMLU, The Pile, etc.) are public, but the authors' own dataset and training curves data are not released."
     43       },
     44       "environment_specified": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper mentions training on TPUv3/TPUv4 with JAX and Haiku but provides no requirements file, library versions, or detailed environment specifications beyond these high-level mentions."
     48       },
     49       "reproduction_instructions": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No step-by-step reproduction instructions are provided. While many hyperparameters and architectural details are given (Table 4, Appendix D, F), there are no README-style instructions or scripts for reproducing the experiments."
     53       }
     54     },
     55     "statistical_methodology": {
     56       "confidence_intervals_or_error_bars": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Table 2 reports 10th and 90th percentiles via bootstrapping for the scaling exponents (e.g., a=0.50 (0.488, 0.502), b=0.50 (0.501, 0.512) for Approach 1), providing uncertainty estimates for the key scaling law parameters."
     60       },
     61       "significance_tests": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper claims Chinchilla 'significantly outperforms' Gopher and other models on many tasks but provides no statistical significance tests. Comparisons are based on point estimates of accuracy/loss without any formal tests."
     65       },
     66       "effect_sizes_reported": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Effect sizes are reported in context throughout: e.g., Chinchilla achieves 67.6% on MMLU vs 60.0% for Gopher (a 7.6 percentage-point improvement, Table 6), a 10.7 percentage-point average improvement on BIG-bench (65.1% vs 54.4%), and specific per-task comparisons with baselines in all evaluation tables."
     70       },
     71       "sample_size_justified": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "The paper trains over 400 models for the scaling analysis but does not justify why this number was chosen or discuss whether it is sufficient. The number of IsoFLOP profiles (9) or model sizes is not justified by any formal analysis."
     75       },
     76       "variance_reported": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No variance or standard deviation across runs is reported. All benchmark results (MMLU, BIG-bench, question answering, etc.) appear to be from single runs. The bootstrapping in Table 2 addresses parameter estimation uncertainty but not run-to-run variance."
     80       }
     81     },
     82     "evaluation_design": {
     83       "baselines_included": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Extensive baselines are included: Gopher (280B), GPT-3 (175B), Jurassic-1 (178B), MT-NLG (530B), along with supervised SOTA and human baselines on multiple tasks (Tables 6-9)."
     87       },
     88       "baselines_contemporary": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "All baselines were contemporary at time of publication: Gopher (2021), GPT-3 (2020), Jurassic-1 (2021), MT-NLG 530B (2022). These represented the largest and most capable dense LLMs available."
     92       },
     93       "ablation_study": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Appendix G ablates the training differences between Chinchilla and Gopher: Adam vs AdamW optimizer (Figure A7), high-precision optimizer state, and their combination (Figure A6). These isolate the contribution of each training change."
     97       },
     98       "multiple_metrics": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper uses numerous evaluation metrics: perplexity, bits-per-byte on The Pile (20 subsets), accuracy on MMLU (57 tasks), accuracy on BIG-bench (62 tasks), reading comprehension accuracy, closed-book QA accuracy, common sense benchmarks, gender bias metrics, and toxicity scores."
    102       },
    103       "human_evaluation": {
    104         "applies": false,
    105         "answer": false,
    106         "justification": "This paper evaluates scaling laws for language model training and compares model performance on automated benchmarks. Human evaluation of model outputs is not relevant to the core claims about compute-optimal training."
    107       },
    108       "held_out_test_set": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Results are reported on standard held-out test/dev sets for each benchmark (e.g., Natural Questions dev set, TriviaQA test/dev sets as stated in Table 9, MMLU 5-shot evaluation). The paper also notes train/test leakage concerns for language modelling benchmarks (Section 4.2.1)."
    112       },
    113       "per_category_breakdown": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Extensive per-category breakdowns are provided: per-subset Pile evaluation (Figure 5, Table A5), per-task MMLU results (Figure 6, Table A6), per-task BIG-bench results (Figure 7, Table A7), and per-benchmark common sense results (Table 8)."
    117       },
    118       "failure_cases_discussed": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper explicitly identifies tasks where Chinchilla underperforms Gopher: 4 MMLU tasks (college_mathematics, econometrics, moral_scenarios, formal_logic) in Section 4.2.2 and 4 BIG-bench tasks (crash_blossom, dark_humor_detection, mathematical_induction, logical_args) in Section 4.2.4. Gender bias disparities are also discussed."
    122       },
    123       "negative_results_reported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper reports that toxicity levels are largely unchanged between Chinchilla and Gopher despite improved performance (Section 4.2.7). It also reports uneven gender bias improvements and notes that compute-optimal training does not uniformly improve all capabilities."
    127       }
    128     },
    129     "claims_and_evidence": {
    130       "abstract_claims_supported": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The abstract's claims — (1) current LLMs are under-trained, (2) model size and tokens should scale equally, (3) Chinchilla outperforms Gopher/GPT-3/Jurassic-1/MT-NLG on downstream tasks, and (4) 67.5% MMLU accuracy — are all supported by results in Sections 3-4. The actual MMLU result is 67.6% (Table 6), slightly higher than the 67.5% stated in the abstract."
    134       },
    135       "causal_claims_justified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper's core causal claim is that the scaling law relationship determines optimal compute allocation. This is supported by three independent estimation approaches (Section 3) and validated by the Chinchilla training run. The ablation in Appendix G isolates optimizer changes from the scaling hypothesis. The paper appropriately notes confounds between Chinchilla and Gopher (slightly different data distribution, optimizer, tokenizer)."
    139       },
    140       "generalization_bounded": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper states its limitations clearly: 'we only have two comparable training runs at large scale (Chinchilla and Gopher), and we do not have additional tests at intermediate scales' (Section 5). It notes the power-law assumption may not hold at very large scales, and acknowledges the analysis is on less than one epoch of data. The title 'Training Compute-Optimal Large Language Models' is reasonably bounded."
    144       },
    145       "alternative_explanations_discussed": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section 5 discusses that the concavity in the FLOP-loss frontier may mean even smaller models are optimal. The paper also acknowledges in Section 4.2.1 that train/test leakage may artificially enhance language modelling results since Chinchilla saw 4x more data. Appendix G addresses whether performance gains come from optimizer changes rather than scaling."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "All models are trained from scratch by the authors, with exact architectures specified in Table 4 (layers, heads, key/value size, d_model, learning rate, batch size). Model sizes are precisely stated (70B for Chinchilla, 280B for Gopher). This is not an API-based study where version snapshots matter."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The paper evaluates on various benchmarks using few-shot prompting but does not provide the actual prompt text used for evaluations. It references the evaluation setup in Rae et al. (2021) but does not reproduce the prompts."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Detailed hyperparameters are reported: learning rates (Table 4, Appendix D.1), batch sizes (Table 4), optimizer choice (AdamW), learning rate schedule (cosine with 10x decay), precision (bfloat16 forward/backward, float32 optimizer state), and architecture details. Appendix D provides additional details for the scaling runs."
    166       },
    167       "scaffolding_described": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No agentic scaffolding is used. This is a standard language model training and evaluation study."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Table A1 documents the MassiveText dataset composition including disk size, document counts, sampling proportions, and epochs for 1.4T tokens. The tokenizer change (SentencePiece without NFKC normalization) is documented in Section 4.1. The paper references Rae et al. (2021) for the full MassiveText data pipeline details."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 5 (Discussion & Conclusion) contains substantive discussion of limitations spanning two paragraphs, covering limited large-scale validation runs, power-law assumption concerns, single-epoch limitation, and ethical concerns about large dataset collection."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The paper discusses specific threats: (1) only two comparable large-scale runs (Chinchilla vs Gopher), (2) concavity in the FLOP-loss frontier suggesting power-law may not hold at large scales (Appendix E), (3) potential train/test leakage from 4x more training data (Section 4.2.1), (4) confounding factors between Chinchilla and Gopher beyond just scaling (different optimizer, tokenizer, data distribution)."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The paper explicitly states: the analysis covers only dense transformer models (not MoE), only single-epoch training regimes, and only autoregressive language models. Section 5 states 'we assume that the efficient computational frontier can be described by a power-law relationship' and notes this may not hold. The model card states the model is not publicly available and is for research only."
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "Neither the raw training data (MassiveText) nor the individual training curves from the 400+ model runs are released. Only aggregated results and fitted scaling law parameters are provided."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The training data composition is described in Table A1 (MassiveText subsets: MassiveWeb, Books, C4, News, GitHub, Wikipedia with sizes and proportions). The paper references Rae et al. (2021) for full data collection details. The scaling experiment setup (model sizes, training horizons, FLOP budgets) is described in Section 3 and Appendix D."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No human participants are involved. The study trains and evaluates language models on existing datasets and benchmarks."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The data pipeline from training runs to scaling law estimates is well documented: Section 3.1 describes how training curves are smoothed and interpolated, how the envelope of minimal loss per FLOP is extracted, and how power laws are fitted. Section 3.3 and Appendix D.2 detail the parametric fitting procedure including the Huber loss, L-BFGS optimization, and initialization grid."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No explicit funding statement appears in the paper. The copyright line states '© 2023 DeepMind. All rights reserved' but no funding sources or grants are listed."
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "All authors are identified as DeepMind researchers via the corresponding author email addresses ({jordanhoffmann|sborgeaud|amensch|sifre}@deepmind.com) and the DeepMind copyright notice."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "DeepMind (a Google/Alphabet subsidiary) funded and conducted this research. DeepMind has a direct financial interest in demonstrating effective approaches to LLM training, and the results validate their research direction (showing that their 280B Gopher was over-sized and that a 70B model trained optimally performs better)."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is included in the paper. DeepMind employees may hold equity in Alphabet, which is not disclosed."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The paper does not state the training data cutoff date for MassiveText. It references the dataset from Rae et al. (2021) but does not specify when the data was collected or what temporal range it covers."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "Section 4.2.1 explicitly discusses train/test overlap: 'Some caution is needed when comparing Chinchilla with Gopher on these language modelling benchmarks as Chinchilla is trained on 4× more data than Gopher and thus train/test set leakage may artificially enhance the results.' The paper shifts emphasis to benchmarks where leakage is less of a concern (MMLU, BIG-bench)."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": true,
    253         "justification": "The paper acknowledges contamination risk for language modelling benchmarks (Section 4.2.1) and explicitly notes that MMLU and BIG-bench are less susceptible to leakage. Section 5 notes 'Larger datasets will require extra care to ensure train-test set overlap is properly accounted for, both in the language modelling loss but also with downstream tasks.'"
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants are involved in this study."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants are involved in this study."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants are involved in this study."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants are involved in this study."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants are involved in this study."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants are involved in this study."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants are involved in this study."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "The paper qualitatively notes that Chinchilla's smaller size 'reduces inference cost considerably' (Section 4) but does not quantify inference cost, latency, or tokens per second for any model."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "The compute budget is central to the paper. The Gopher compute budget is stated as 5.76×10^23 FLOPs (Table 3), and Chinchilla uses the same budget. Appendix F details the exact FLOP calculation methodology. Table 3 provides compute budgets for various model sizes. Training was done on TPUv3/TPUv4 (Section 4.1)."
    303       }
    304     }
    305   },
    306   "claims": [
    307     {
    308       "claim": "Current large language models are significantly under-trained: for compute-optimal training, model size and number of training tokens should be scaled equally (for every doubling of model size, training tokens should also be doubled).",
    309       "evidence": "Three independent approaches (Sections 3.1-3.3) all yield scaling exponents near a=0.5, b=0.5 (Table 2), compared to Kaplan et al. (2020) which predicted a=0.73, b=0.27. Over 400 model training runs support this finding. Bootstrap confidence intervals are provided.",
    310       "supported": "strong"
    311     },
    312     {
    313       "claim": "Chinchilla (70B parameters, 1.4T tokens) uniformly and significantly outperforms Gopher (280B) on a large range of downstream evaluation tasks despite using the same compute budget.",
    314       "evidence": "Chinchilla outperforms Gopher on all 20 Pile subsets (Figure 5), 51/57 MMLU tasks (Figure 6), 58/62 BIG-bench tasks (Figure 7), all reading comprehension tasks (Table 7), all question answering tasks (Table 9), and 4/5 common sense benchmarks (Table 8). The average MMLU improvement is 7.6 percentage points (67.6% vs 60.0%).",
    315       "supported": "strong"
    316     },
    317     {
    318       "claim": "Chinchilla achieves state-of-the-art average accuracy of 67.6% on the MMLU benchmark, exceeding the June 2023 expert forecast of 63.4%.",
    319       "evidence": "Table 6 shows Chinchilla 5-shot accuracy of 67.6% vs Gopher 60.0%, GPT-3 43.9%, and the June 2023 forecast of 63.4% from Steinhardt (2021). Per-task breakdown in Table A6 and Figure 6.",
    320       "supported": "strong"
    321     },
    322     {
    323       "claim": "Chinchilla also significantly outperforms GPT-3 (175B) and Megatron-Turing NLG (530B) on most evaluated tasks.",
    324       "evidence": "Tables 7-9 show Chinchilla outperforming GPT-3 on LAMBADA, RACE, Natural Questions, and TriviaQA. Table 8 shows Chinchilla outperforming MT-NLG 530B on 4/5 common sense benchmarks despite being 7.5x smaller.",
    325       "supported": "strong"
    326     },
    327     {
    328       "claim": "The scaling results are consistent across different datasets (MassiveText, C4, GitHub code).",
    329       "evidence": "Appendix C and Table A2 show IsoFLOP analysis on C4 (a=0.50, b=0.50) and GitHub (a=0.53, b=0.47), both very similar to the MassiveText results, confirming equal scaling of parameters and tokens.",
    330       "supported": "strong"
    331     },
    332     {
    333       "claim": "Toxicity levels in unconditional text generation are largely independent of model quality, and Chinchilla's improved loss does not increase toxicity.",
    334       "evidence": "Section 4.2.7 reports nearly identical toxicity scores: Gopher mean 0.081 (median 0.064) vs Chinchilla mean 0.087 (median 0.066), with 95th percentile scores of 0.230 vs 0.238 based on 25,000 unprompted samples evaluated with PerspectiveAPI.",
    335       "supported": "moderate"
    336     }
    337   ],
    338   "methodology_tags": [
    339     "benchmark-eval",
    340     "theoretical"
    341   ],
    342   "key_findings": "The paper establishes that for compute-optimal training of large language models, model size and number of training tokens should be scaled in approximately equal proportions, contradicting the prior recommendation from Kaplan et al. (2020) that favored scaling model size much faster than data. Three independent estimation approaches using over 400 training runs all converge on this finding. The practical validation, Chinchilla (70B parameters trained on 1.4T tokens), outperforms the 4x larger Gopher (280B) and other larger models on nearly every benchmark tested, achieving 67.6% on MMLU — a 7.6 percentage-point improvement over Gopher.",
    343   "red_flags": [
    344     {
    345       "flag": "Company evaluating its own models",
    346       "detail": "All authors are DeepMind researchers comparing their new model (Chinchilla) against their previous model (Gopher). While the paper also compares against external models (GPT-3, Jurassic-1, MT-NLG), the primary comparison is internal. No conflicts of interest statement is provided."
    347     },
    348     {
    349       "flag": "Confounding factors in Chinchilla vs Gopher comparison",
    350       "detail": "Chinchilla differs from Gopher in multiple ways beyond the scaling hypothesis: different optimizer (AdamW vs Adam), different tokenizer (no NFKC normalization), different data subset distribution, and higher precision optimizer state. While Appendix G attempts to ablate optimizer effects, the final Chinchilla comparison bundles all changes together. The paper acknowledges this but the main results conflate these factors."
    351     },
    352     {
    353       "flag": "No run-to-run variance reported",
    354       "detail": "All benchmark results appear to be from single runs with no reported variance or standard deviation. Given the stochastic nature of language model training and evaluation, the lack of repeated runs makes it impossible to assess the stability of the reported improvements."
    355     },
    356     {
    357       "flag": "Training data not released",
    358       "detail": "MassiveText is proprietary to DeepMind and not publicly available. The 400+ training runs underlying the scaling law analysis cannot be independently verified or reproduced. The raw loss curves are not released."
    359     }
    360   ],
    361   "cited_papers": [
    362     {
    363       "title": "Scaling laws for neural language models",
    364       "authors": ["J. Kaplan", "S. McCandlish", "T. Henighan", "T. B. Brown", "B. Chess", "R. Child", "S. Gray", "A. Radford", "J. Wu", "D. Amodei"],
    365       "year": 2020,
    366       "relevance": "The primary work this paper extends and corrects — established the original scaling laws that recommended scaling model size faster than data, which Chinchilla refutes."
    367     },
    368     {
    369       "title": "Language models are few-shot learners",
    370       "authors": ["T. Brown", "B. Mann", "N. Ryder"],
    371       "year": 2020,
    372       "relevance": "GPT-3 paper — one of the key baselines in the evaluation and an example of a model trained with the suboptimal scaling recommended by Kaplan et al."
    373     },
    374     {
    375       "title": "Scaling language models: Methods, analysis & insights from training Gopher",
    376       "authors": ["J. Rae", "S. Borgeaud", "T. Cai"],
    377       "year": 2021,
    378       "relevance": "The Gopher paper — the primary baseline model and training setup that Chinchilla builds upon and improves over."
    379     },
    380     {
    381       "title": "Using Deepspeed and Megatron to Train Megatron-turing NLG 530b, A Large-Scale Generative Language Model",
    382       "authors": ["S. Smith", "M. Patwary", "B. Norick"],
    383       "year": 2022,
    384       "relevance": "MT-NLG 530B — the largest dense model at time of publication, a key baseline that Chinchilla (70B) outperforms on most tasks despite being 7.5x smaller."
    385     },
    386     {
    387       "title": "Measuring massive multitask language understanding",
    388       "authors": ["D. Hendrycks", "C. Burns", "S. Basart", "A. Zou", "M. Mazeika", "D. Song", "J. Steinhardt"],
    389       "year": 2020,
    390       "relevance": "The MMLU benchmark used as a primary evaluation metric — Chinchilla achieves SOTA 67.6% on this benchmark."
    391     },
    392     {
    393       "title": "Unified scaling laws for routed language models",
    394       "authors": ["A. Clark", "D. de Las Casas", "A. Guy"],
    395       "year": 2022,
    396       "relevance": "Examines scaling laws for Mixture of Expert models, complementing Chinchilla's analysis of dense models."
    397     },
    398     {
    399       "title": "Improving language models by retrieving from trillions of tokens",
    400       "authors": ["S. Borgeaud", "A. Mensch", "J. Hoffmann"],
    401       "year": 2021,
    402       "relevance": "RETRO paper — shows that retrieval augmentation effectively increases training data seen, supporting the hypothesis that data quantity is crucial for model performance."
    403     },
    404     {
    405       "title": "On the dangers of stochastic parrots: Can language models be too big?",
    406       "authors": ["E. M. Bender", "T. Gebru", "A. McMillan-Major", "S. Shmitchell"],
    407       "year": 2021,
    408       "relevance": "Discusses risks of large language models including environmental cost and societal biases, directly relevant to the efficiency argument that compute-optimal models reduce waste."
    409     },
    410     {
    411       "title": "Ethical and social risks of harm from language models",
    412       "authors": ["L. Weidinger", "J. Mellor", "M. Rauh"],
    413       "year": 2021,
    414       "relevance": "Framework for evaluating LLM risks; Chinchilla's bias and toxicity evaluations build on this work."
    415     },
    416     {
    417       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    418       "authors": ["S. Lin", "J. Hilton", "O. Evans"],
    419       "year": 2021,
    420       "relevance": "Chinchilla shows large improvements on TruthfulQA (14.1% in 0-shot) over Gopher, contradicting the original finding that larger models perform worse on this benchmark."
    421     },
    422     {
    423       "title": "GLaM: Efficient scaling of language models with mixture-of-experts",
    424       "authors": ["N. Du", "Y. Huang", "A. M. Dai"],
    425       "year": 2021,
    426       "relevance": "1.2T parameter MoE model — represents the alternative approach of sparse scaling that complements Chinchilla's dense model efficiency findings."
    427     }
    428   ]
    429 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs