scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21722B)
      1 {
      2   "paper": {
      3     "title": "Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets",
      4     "authors": ["Alethea Power", "Yuri Burda", "Harri Edwards", "Igor Babuschkin", "Vedant Misra"],
      5     "year": 2022,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2201.02177"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Neural networks trained on small algorithmic datasets (binary operation tables) exhibit 'grokking' — generalization that occurs well after memorization/overfitting, sometimes requiring 1000x more optimization steps. The amount of optimization required for generalization grows rapidly as dataset size decreases. Weight decay is the most effective intervention for improving data efficiency, more than halving the required training data. Embedding visualizations reveal that generalizing networks learn meaningful mathematical structure (e.g., circular topology for modular arithmetic).",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL or code release is mentioned anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The datasets are algorithmically generated from fully specified binary operations (listed in Appendix A.1.1) with explicit parameters (p=97, S5). Anyone can reproduce the exact data from the specification."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, library versions, or dependency information is provided. The paper mentions using a transformer but does not specify the framework or environment."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. Hyperparameters are listed in Appendix A.1.2 but there are no scripts or README-level instructions."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as medians or means across seeds but no confidence intervals or error bars are shown in the figures or text."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
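     45         "justification": "No statistical significance tests are reported for the experimental comparisons; the only p-value given is for the Spearman correlation between sharpness and validation accuracy in Appendix A.5. Claims about weight decay being more effective are based on visual comparison of data efficiency curves."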
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper reports concrete effect sizes: 'a decrease of 1% of training data leads to an increase of 40-50% in median time to generalization' (Section 3.1.1), and weight decay 'more than halving the amount of samples needed' (Section 3.3)."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Experiments use 3 random seeds (7 for Section 3.1.1) but no justification is given for why these numbers are sufficient."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Results are reported as means over 3 seeds (Figure 2 right) or medians over 7 seeds (Section 3.1.1), but no standard deviations, IQRs, or other spread measures are reported."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 3.3 and Figure 2 (left) compare multiple optimization algorithms (Adam, AdamW, with/without dropout, weight noise, gradient noise, different learning rates) as baselines against each other."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The baselines are standard optimization methods (Adam, AdamW with weight decay) that were contemporary at the time of writing."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Section 3.3 provides ablations over optimization methods, learning rates, weight decay, dropout, and gradient noise, measuring their individual impact on data efficiency."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper reports both accuracy and loss (training and validation) as evaluation metrics, shown in Figures 1 and 4."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is irrelevant — the paper studies neural network generalization on algorithmic datasets with ground-truth answers."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "For each run, a fraction of equations is the training set and the rest is the validation set (Appendix A.1.1). Results are reported on the held-out validation portion."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Figure 2 (right) provides per-operation breakdowns across 12 different binary operations, showing data efficiency curves for each."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper notes that some operations (e.g., x³ + xy² + y mod 97) 'didn't lead to generalization within the allowed optimization budget at any percentage of data up to 95%' (Section 3.2)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that some operations failed to generalize entirely, and that suboptimal hyperparameters 'severely limit generalization' (Figure 2 left caption). Appendix A.4 shows outliers hindering generalization."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract's claims about grokking, data efficiency, and optimization time are all supported by the experimental results in Sections 3.1-3.3 and corresponding figures."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims (e.g., 'weight decay improves generalization') are supported by controlled ablation experiments where only the intervention variable is changed (Section 3.3, Figure 2 left)."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Claims are appropriately bounded to 'small algorithmic datasets' and 'binary operation tables.' The paper explicitly frames these as testbeds, not general claims about all neural network training."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 4 and Appendix A.3 discuss whether grokking is the same as double descent (concluding it may be distinct), and Appendix A.5 explores sharpness as a potential explanatory mechanism. The paper considers the role of noise in SGD driving optimization to flatter minima."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper measures exactly what it claims: validation accuracy and loss on held-out equations from binary operation tables. There is no proxy gap — 'generalization' is directly measured as performance on unseen equations."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The model is fully specified: 'a standard decoder-only transformer with 2 layers, width 128, and 4 attention heads, with a total of about 4·10⁵ non-embedding parameters' (Appendix A.1.2). This is a custom model, not an API model requiring version numbers."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The paper does not use prompting — it trains a transformer from scratch on equation tokens."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix A.1.2 provides full hyperparameters: AdamW with lr=10⁻³, weight decay=1, β₁=0.9, β₂=0.98, warmup over first 10 updates, minibatch size 512, and optimization budget."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The paper trains a standard transformer."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix A.1.1 fully documents the data generation: binary operations are specified, equations are tokenized as ⟨x⟩⟨op⟩⟨y⟩⟨=⟩⟨x◦y⟩, and random fractions are selected for train/validation splits."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. The Discussion (Section 4) mentions future work but does not systematically discuss limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity are discussed. The paper does not address whether grokking is architecture-dependent or whether the hyperparameter choices bias the results."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper explicitly frames its scope as 'small algorithmically generated datasets' and 'binary operation tables,' and states these are 'testbeds for theories of generalization' rather than claiming broad applicability."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw experimental data (training curves, model checkpoints) is released. The datasets are algorithmically generated but the experimental outputs are not available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Appendix A.1.1 fully describes data generation: all 12 binary operations are specified with parameters, and the equation format is documented."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data is algorithmically generated from standard mathematical operations."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline from operation definition → equation generation → tokenization → train/validation split is fully documented in Section 2 and Appendix A.1."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding disclosure or acknowledgments section is present. The authors are from OpenAI and Google."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: OpenAI for four authors, Google for Vedant Misra (with note that he was at OpenAI during the work)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "The work was conducted at OpenAI, which has a commercial interest in demonstrating neural network capabilities. No independent funding source is mentioned."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is provided."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "The paper trains models from scratch on algorithmically generated data. There is no pre-trained model whose training cutoff could cause contamination."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "No pre-trained model is evaluated on a benchmark. The train/test split is a random partition of the generated equations — overlap is impossible by construction."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "No pre-trained model or public benchmark is used. Models are trained from scratch on synthetic data."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost or training cost is reported, despite experiments running up to 10⁶ optimization steps."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "The paper mentions experiments 'can be quickly reproduced on a single GPU' but does not quantify GPU hours or total compute."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Results in Figure 2 (right) are 'averaged over 3 seeds' and Section 3.1.1 uses 7 random seeds with median reported."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Appendix A.1.2: 'We've repeated each experiment for each dataset size with 3 random seeds, with the exception of experiments in section 3.1.1, where we've aggregated results over 7 random seeds.'"
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Appendix A.1.2 mentions 'We have tuned optimization hyperparameters by running experiments on modular addition and product in S5' but does not report how many configurations were tried or the search budget."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Appendix A.1.2 states the final configuration was 'a balance of performance we saw on S5 and simplicity,' providing justification for the selection criterion."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "The only significance test reported is a single Spearman correlation (Appendix A.5); no family of tests is performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "No discussion of self-comparison bias. All baselines are the authors' own implementations of standard optimizers."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Figure 1 (center) and Section 3.1.1 explicitly show performance (time to generalization) as a function of dataset size/compute, and Section 3.3 compares methods at matched optimization budgets (10⁵ steps)."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper explicitly discusses what the algorithmic datasets test (generalization on binary operation tables) and argues they are useful testbeds for studying generalization phenomena, with clear acknowledgment of the scope."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved — the paper trains a standard transformer from scratch."
    336       }
    337     }
    338   },
    339   "claims": [
    340     {
    341       "claim": "Neural networks exhibit 'grokking': generalization that occurs well after overfitting, sometimes requiring 1000x more optimization steps than needed for memorization.",
    342       "evidence": "Figure 1 (left) shows training accuracy reaching near-perfect at <10³ steps while validation accuracy requires ~10⁶ steps for modular division mod 97.",
    343       "supported": "strong"
    344     },
    345     {
    346       "claim": "The amount of optimization required for generalization increases rapidly as dataset size decreases.",
    347       "evidence": "Figure 1 (center) shows median optimization steps to 99% validation accuracy on S5, with a 1% decrease in training data leading to 40-50% increase in optimization time (Section 3.1.1).",
    348       "supported": "strong"
    349     },
    350     {
    351       "claim": "Weight decay is the most effective regularization for improving data efficiency on these tasks, more than halving the required samples.",
    352       "evidence": "Figure 2 (left) compares multiple optimization methods on S5, showing weight decay substantially shifts the data efficiency curve left compared to all other interventions.",
    353       "supported": "moderate"
    354     },
    355     {
    356       "claim": "Symmetric operations require less data for generalization than non-symmetric counterparts.",
    357       "evidence": "Figure 2 (right) shows x+y and x*y requiring lower data fractions than x-y and x/y respectively, consistent across multiple operation pairs.",
    358       "supported": "moderate"
    359     },
    360     {
    361       "claim": "Sharpness of the loss landscape is predictive of grokking, with Spearman correlation of -0.79548 (p < 0.000014).",
    362       "evidence": "Appendix A.5 reports this correlation between validation accuracy and sharpness measure φ on S5 composition, with Figure 7 visualizing the relationship.",
    363       "supported": "moderate"
    364     }
    365   ],
    366   "red_flags": [
    367     {
    368       "flag": "No variance or spread measures reported",
    369       "detail": "Despite running 3-7 seeds per experiment, the paper reports only means or medians without standard deviations, IQR, or error bars, making it impossible to assess result stability."
    370     },
    371     {
    372       "flag": "Narrow architecture evaluation",
    373       "detail": "All experiments use a single small transformer architecture (2 layers, 128 width, 4 heads). Claims about grokking being a general phenomenon rest on a single architecture, though the paper acknowledges some effects 'might be partially architecture-dependent.'"
    374     }
    375   ],
    376   "cited_papers": [
    377     {
    378       "title": "Deep Double Descent: Where Bigger Models and More Data Hurt",
    379       "authors": ["Preetum Nakkiran", "Gal Kaplun", "Yamini Bansal", "Tristan Yang", "Boaz Barak", "Ilya Sutskever"],
    380       "year": 2019,
    381       "arxiv_id": "1912.02292",
    382       "relevance": "Foundational work on double descent in neural networks, directly related to the grokking phenomenon studied here."
    383     },
    384     {
    385       "title": "Understanding deep learning requires rethinking generalization",
    386       "authors": ["Chiyuan Zhang", "Samy Bengio", "Moritz Hardt", "Benjamin Recht", "Oriol Vinyals"],
    387       "year": 2016,
    388       "arxiv_id": "1611.03530",
    389       "relevance": "Key paper on generalization in overparameterized networks — shows networks can memorize random labels, foundational to understanding grokking."
    390     },
    391     {
    392       "title": "Attention is all you need",
    393       "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N Gomez", "Lukasz Kaiser", "Illia Polosukhin"],
    394       "year": 2017,
    395       "arxiv_id": "1706.03762",
    396       "relevance": "Introduced the transformer architecture used in all experiments in this paper."
    397     },
    398     {
    399       "title": "Fantastic generalization measures and where to find them",
    400       "authors": ["Yiding Jiang", "Behnam Neyshabur", "Hossein Mobahi", "Dilip Krishnan", "Samy Bengio"],
    401       "year": 2019,
    402       "arxiv_id": "1912.02178",
    403       "relevance": "Studies which generalization measures predict neural network generalization — directly relevant to understanding why grokking occurs."
    404     },
    405     {
    406       "title": "Analysing mathematical reasoning abilities of neural models",
    407       "authors": ["David Saxton", "Edward Grefenstette", "Felix Hill", "Pushmeet Kohli"],
    408       "year": 2019,
    409       "arxiv_id": "1904.01557",
    410       "relevance": "Studies neural network generalization on procedurally generated math problems, closely related to the algorithmic datasets used here."
    411     }
    412   ]
    413 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs