scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25776B)
      1 {
      2   "paper": {
      3     "title": "Broken Neural Scaling Laws",
      4     "authors": ["Ethan Caballero", "Kshitij Gupta", "Irina Rish", "David Krueger"],
      5     "year": 2022,
      6     "venue": "ICLR 2023",
      7     "arxiv_id": "2210.14891"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The abstract states 'Code is available at github.com/ethancaballero/broken_neural_scaling_laws' providing a direct repository URL."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses the publicly available scaling laws benchmark dataset of Alabdulmohsin et al. (2022) for its main vision and language experiments, and the 4-digit addition task uses the authors' own code to generate data. Additional experimental data points were obtained from published papers (GPT-3, etc.)."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions SciPy for curve fitting and minGPT for the arithmetic experiments, but does not provide a reproducible environment specification with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While the code repository is referenced and hyperparameters for the 4-digit addition task are given in Table 5 and Section A.5, there are no step-by-step reproduction instructions in the paper itself. The paper does not include a 'Reproducing Results' section or README-level guidance for replicating the main curve-fitting experiments."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "All extrapolation results in Tables 3 and 4 are reported as RMSLE ± root standard log error, providing uncertainty quantification for each result."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims BNSL outperforms other functional forms (M1-M4) based on comparing RMSLE values in Tables 2-4, but no statistical significance tests (e.g., paired t-tests, Wilcoxon) are used to determine if the differences are statistically significant."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports the percentage of tasks where each functional form is best (Table 2: BNSL best on 69.44% of vision tasks and 75% of language tasks), and provides raw RMSLE values allowing comparison of the magnitude of improvement across methods."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper uses a large set of tasks from the Alabdulmohsin et al. (2022) benchmark and many additional tasks, but does not justify why this particular set of tasks or the number of data points per task is sufficient for the claims being made."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports root standard log error alongside RMSLE in all tables (Tables 3, 4, 6). For the 4-digit addition experiments, each point is 'mean of greater than 1000 seeds at that dataset size' (Section 5.5, Figure 5 caption)."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares BNSL against four baseline functional forms: M1 (power law), M2 (power law + constant), M3 (shifted power law + constant), and M4 (Alabdulmohsin et al. 2022's form). Results are presented in Tables 2, 3, 4, and 6."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The baselines include M4 from Alabdulmohsin et al. (2022), which was the most recent scaling law functional form at the time. M3 from Zhai et al. (2021) and Bansal et al. (2022) are also contemporary. The comparison was done using the benchmark from Alabdulmohsin et al. (2022)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "The paper does not include a formal ablation study of the BNSL functional form (e.g., systematically varying the number of breaks, the sharpness parameters, or other components). Section 6 discusses the impact of break sharpness on predictability but this is more of an analysis than a controlled ablation."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "The primary metric used throughout is RMSLE (Root Mean Squared Logarithmic Error) for evaluating extrapolation quality. While the paper evaluates across many tasks with different performance metrics (error rate, cross-entropy, FID, etc.), the comparison between functional forms uses only RMSLE."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a curve-fitting methodology paper. Human evaluation is not relevant to the claims about which functional form best extrapolates scaling behavior."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses a clear train/test split for extrapolation evaluation: 'black points are points used for fitting a functional form, green (gray if color blind) points are held-out points used for evaluating extrapolation' (Section 5). This held-out evaluation is used consistently across all experiments."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Tables 3 and 4 provide per-task breakdowns for every individual task/model combination. Table 2 provides per-domain summaries. The appendix provides individual plots for each task."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6 discusses the limits of BNSL's predictability, noting that extrapolation fails when fitting points are before a sufficiently sharp break. Figure 5 right shows the noiseless simulation revealing the fundamental limitation. Figures in Appendix A.36 (e.g., Caltech101, BIG-Bench) acknowledge 'unsatisfactory extrapolations' for certain tasks."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6 explicitly reports the fundamental limitation that BNSL cannot extrapolate past sharp breaks it hasn't observed. The captions for Figures 36 and 37 acknowledge subsets with 'unsatisfactory extrapolations'. The paper also notes BNSL is not always the best (Table 2 shows it is best on 69-75% of tasks, not all)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims BNSL 'accurately models and extrapolates the scaling behaviors' across a diverse set of tasks. Tables 2-4 and extensive appendix results support this. The abstract's claim about non-monotonic behaviors (double descent) is supported by Figure 4, and inflection points by Figure 5. The claim about being 'considerably more accurate' is supported by the 69-75% best-rate in Table 2."
    111       },
    112       "causal_claims_justified": {
    113         "applies": false,
    114         "answer": false,
    115         "justification": "The paper makes no causal claims. It proposes a functional form and empirically evaluates its curve-fitting and extrapolation accuracy compared to alternatives. The language is descriptive ('accurately models', 'yields extrapolations') rather than causal."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6 explicitly bounds the generalization by identifying limitations: the need for data points near or past breaks for accurate extrapolation, the inability to predict unobserved sharp breaks, and the dependence on noise/number of seeds. The paper is careful to state results per-domain and per-task."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 6 discusses why extrapolation may fail (insufficient points near breaks, noise masking breaks). The captions for Figures 36-37 offer alternative explanations for unsatisfactory extrapolations (fitting points near or before a break, not enough points for the fitter to distinguish breaks from noise). The ethics statement discusses whether BNSL truly reflects the underlying scaling behavior or is simply a flexible curve fit."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "For their own experiments (4-digit addition), exact architecture details are given in Table 5. For other experiments, they reference specific published results (e.g., GPT-3 Table H.1, specific figures from cited papers) with exact model specifications (e.g., '2.62e+8 Param', '1.07e+9 Param', specific BiT/ViT/MiX model variants)."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This paper does not use prompting in its methodology. It fits mathematical functional forms to empirical scaling data using curve-fitting libraries (SciPy)."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Table 5 provides a comprehensive hyperparameter table for the 4-digit addition experiments (learning rate, weight decay, dropout, optimizer settings, etc.). Section A.6 describes the curve-fitting procedure including the use of scipy.optimize.brute and scipy.optimize.curve_fit."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The methodology involves curve fitting with SciPy, not agentic AI systems."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section A.6 describes the fitting procedure (grid search initialization, non-linear least squares). For the 4-digit addition task, Section A.5 specifies dataset sizes (144-1008), how data was generated, and all experimental settings. For the benchmark data, the source is explicitly cited (Alabdulmohsin et al. 2022 benchmark, with specific figures/tables from other papers)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 6 ('The Limit of the Predictability of Scaling Behavior') is a substantial discussion of the fundamental limitations of the approach. The Ethics Statement also discusses a limitation about the need for sufficient data points."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 6 provides specific threats: (1) sharp breaks beyond observed data cannot be predicted, (2) points before sufficiently sharp breaks can be useless for extrapolation, (3) noise across seeds limits accuracy. These are specific to this study rather than generic disclaimers."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 explicitly states three implications that bound the scope: what happens when breaks are sufficiently sharp, when additional breaks occur beyond observed scales, and when fitting data is beyond a sharp break. The ethics statement notes the compute-access limitation for gathering sufficient data points."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The experimental data comes from publicly available sources: the scaling laws benchmark of Alabdulmohsin et al. (2022), published papers (GPT-3 Table H.1, etc.), and the code repository github.com/ethancaballero/broken_neural_scaling_laws. Data for the 4-digit addition task is generated by the provided code."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "For each experiment, the paper states exactly where the data was obtained: specific figures and tables from cited papers (e.g., 'Experimental data obtained from Figure 1 of Hernandez et al. (2021)'). For the 4-digit addition task, Section A.5 describes the experimental setup in detail."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants are involved. The data consists of empirical scaling measurements from neural network training runs, sourced from published papers and public benchmarks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section A.6 documents the full pipeline: collecting scaling data from sources, fitting BNSL via scipy.optimize.brute grid search followed by scipy.optimize.curve_fit least squares, using a numerically stable MSLE variant. The train/test split for extrapolation evaluation is clearly described in Section 5."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "The acknowledgments section thanks various individuals for feedback and assistance but does not disclose any funding sources, grants, or financial support."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: Mila + McGill University (Caballero), Mila + University of Montreal (Gupta, Rish), and University of Cambridge (Krueger). These are academic institutions with no obvious conflict regarding the evaluated products."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding is disclosed, so the independence of any funder cannot be assessed. The absence of funding disclosure does not confirm independence."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "This paper does not evaluate a pre-trained model's capability on a benchmark. It fits mathematical functional forms to existing empirical scaling data from published papers. The only novel model training is for the 4-digit addition task, which is a from-scratch training experiment, not evaluation of a pre-trained model."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The paper's methodology is curve fitting, not model evaluation on benchmarks. There is no concern about train/test overlap in the context of fitting scaling laws to published experimental data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Benchmark contamination is not applicable here. The paper fits functional forms to scaling data points, not evaluating a language model on a knowledge benchmark."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants are involved in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost or wall-clock time is reported for running the BNSL curve fitting procedure. Section A.5 mentions 'each run took less than 2 hours' for the 4-digit addition training experiments, but not for the curve-fitting methodology itself."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "Section A.5 states each 4-digit addition experiment 'was run on a single V100 GPU and each run took less than 2 hours', but the total computational budget across all experiments and curve-fitting runs is not stated."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "BNSL yields extrapolations with the lowest RMSLE for 69.44% of downstream image classification tasks compared to four alternative functional forms.",
    286       "evidence": "Table 2 and Table 3 in Section 5.1, comparing BNSL to M1, M2, M3, M4 on the scaling laws benchmark of Alabdulmohsin et al. (2022).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "BNSL yields extrapolations with the lowest RMSLE for 75% of language tasks compared to four alternative functional forms.",
    291       "evidence": "Table 2 and Table 4 in Section 5.2, comparing BNSL to M1, M2, M3, M4 on language tasks from the BIG-Bench benchmark and language modeling tasks.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "Previously proposed scaling law functional forms (M1, M2, M3) are mathematically incapable of expressing non-monotonic behavior or inflection points on linear-linear plots.",
    296       "evidence": "Section 4 provides mathematical proofs via first and second derivative analysis in Table 1, showing these forms are strictly monotonic and lack inflection points over the relevant domain.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "BNSL accurately models and extrapolates non-monotonic scaling behaviors such as double descent.",
    301       "evidence": "Figure 4 in Section 5.4 shows BNSL fitting and extrapolating double descent behavior for transformers trained on neural machine translation, using data from Nakkiran et al. (2021).",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "BNSL accurately extrapolates to scales over an order of magnitude larger than the maximum of the fitting points.",
    306       "evidence": "Figure 2 and Section A.8/A.15 show extrapolation to scales >10x larger for both vision and language tasks. Section A.35 shows extrapolation to scales >100,000x larger using GPT-4 data.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "There is a fundamental limit to the predictability of scaling behavior: when breaks are sufficiently sharp, data points near or beyond the break are needed for accurate extrapolation.",
    311       "evidence": "Section 6 demonstrates this using the 4-digit addition task (Figure 5), showing that fitting requires points with dataset size >= 720 (real data) or >= 415 (noiseless simulation, near the break point).",
    312       "supported": "strong"
    313     }
    314   ],
    315   "methodology_tags": ["benchmark-eval", "theoretical"],
    316   "key_findings": "The paper proposes a smoothly broken power law functional form (BNSL) that generalizes standard power laws by introducing smooth transitions ('breaks') between power law segments. BNSL achieves the best extrapolation accuracy (by RMSLE) on 69.44% of vision tasks and 75% of language tasks in the scaling laws benchmark of Alabdulmohsin et al. (2022), and the paper also applies it to scaling data from reinforcement learning and many other domains. Unlike the prior functional forms M1-M3, BNSL can mathematically express non-monotonic behaviors (double descent) and inflection points, and the paper demonstrates accurate modeling and extrapolation of these phenomena. The paper also identifies a fundamental predictability limit: extrapolation past sufficiently sharp, unobserved breaks is impossible regardless of the number of seeds.",
    317   "red_flags": [
    318     {
    319       "flag": "No statistical significance tests",
    320       "detail": "The paper claims BNSL is 'considerably more accurate' than alternatives based on comparing RMSLE values across tasks, but never applies statistical significance tests (e.g., paired Wilcoxon or bootstrap tests across tasks) to determine if the differences are statistically reliable."
    321     },
    322     {
    323       "flag": "Overfitting risk from flexible functional form",
    324       "detail": "BNSL has more free parameters than the baseline forms (M1-M4) due to additional break parameters. While the held-out extrapolation evaluation mitigates this concern, the paper does not systematically analyze whether BNSL's advantage comes from genuine modeling ability versus having more parameters (e.g., via information criteria like BIC/AIC or cross-validation)."
    325     }
    326   ],
    327   "cited_papers": [
    328     {
    329       "title": "Scaling Laws for Neural Language Models",
    330       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan", "Tom B. Brown"],
    331       "year": 2020,
    332       "arxiv_id": "2001.08361",
    333       "relevance": "Foundational work on neural scaling laws that BNSL generalizes; directly relevant to understanding how LLM performance scales with compute, data, and parameters."
    334     },
    335     {
    336       "title": "Language Models are Few-Shot Learners",
    337       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    338       "year": 2020,
    339       "relevance": "GPT-3 paper whose published scaling data (Table H.1) is used as experimental data for BNSL extrapolation evaluation on downstream language tasks."
    340     },
    341     {
    342       "title": "Revisiting Neural Scaling Laws in Language and Vision",
    343       "authors": ["Ibrahim Mansour I Alabdulmohsin", "Behnam Neyshabur", "Xiaohua Zhai"],
    344       "year": 2022,
    345       "arxiv_id": "2209.06640",
    346       "relevance": "Proposed M4 functional form and the scaling laws benchmark dataset used extensively in this paper's evaluation."
    347     },
    348     {
    349       "title": "Emergent Abilities of Large Language Models",
    350       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    351       "year": 2022,
    352       "arxiv_id": "2206.07682",
    353       "relevance": "Discusses 'emergent' abilities of LLMs that BNSL aims to model and predict; directly relevant to AI capability forecasting."
    354     },
    355     {
    356       "title": "Beyond the Imitation Game: Quantifying and Extrapolating the Capabilities of Language Models",
    357       "authors": ["Aarohi Srivastava", "Abhinav Rastogi"],
    358       "year": 2022,
    359       "arxiv_id": "2206.04615",
    360       "relevance": "BIG-Bench benchmark used for evaluation of language task scaling; key benchmark for measuring LLM capabilities."
    361     },
    362     {
    363       "title": "Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback",
    364       "authors": ["Yuntao Bai", "Andy Jones"],
    365       "year": 2022,
    366       "arxiv_id": "2204.05862",
    367       "relevance": "RLHF training data used to demonstrate BNSL extrapolation of AI alignment scaling behavior; directly relevant to AI safety scaling."
    368     },
    369     {
    370       "title": "Deep Double Descent: Where Bigger Models and More Data Hurt",
    371       "authors": ["Preetum Nakkiran", "Gal Kaplun", "Yamini Bansal"],
    372       "year": 2021,
    373       "relevance": "Source of experimental data for the double descent phenomenon that BNSL can model but prior scaling laws cannot."
    374     },
    375     {
    376       "title": "GPT-4 Technical Report",
    377       "authors": ["OpenAI"],
    378       "year": 2023,
    379       "relevance": "Source of scaling data showing BNSL can extrapolate downstream performance to scales >100,000x larger than fitting data; relevant to LLM capability prediction."
    380     },
    381     {
    382       "title": "Scaling Laws for Transfer",
    383       "authors": ["Danny Hernandez", "Jared Kaplan", "Tom Henighan", "Sam McCandlish"],
    384       "year": 2021,
    385       "arxiv_id": "2102.01293",
    386       "relevance": "Prior work that described a smoothly broken power law in passing (for transfer learning scaling); BNSL extends this to multiple breaks and diverse settings."
    387     },
    388     {
    389       "title": "Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets",
    390       "authors": ["Alethea Power", "Yuri Burda", "Harri Edwards"],
    391       "year": 2022,
    392       "arxiv_id": "2201.02177",
    393       "relevance": "Introduced the modular arithmetic task exhibiting delayed sharp transitions in scaling behavior that BNSL aims to capture."
    394     },
    395     {
    396       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    397       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    398       "year": 2022,
    399       "arxiv_id": "2201.11903",
    400       "relevance": "Related to emergent reasoning capabilities in LLMs whose scaling behavior BNSL aims to predict."
    401     }
    402   ]
    403 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs