scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22331B)
      1 {
      2   "paper": {
      3     "title": "Omnigrok: Grokking Beyond Algorithmic Data",
      4     "authors": ["Ziming Liu", "Eric J. Michaud", "Max Tegmark"],
      5     "year": 2022,
      6     "venue": "International Conference on Learning Representations",
      7     "arxiv_id": "2210.01117",
      8     "doi": "10.48550/arXiv.2210.01117"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["theoretical", "benchmark-eval"],
     13   "key_findings": "Grokking is caused by a mismatch between training and test loss landscapes ('LU mechanism'), where training loss is L-shaped and test loss is U-shaped against weight norm. The authors demonstrate grokking beyond algorithmic datasets on MNIST, IMDb sentiment, and QM9 molecular property prediction by increasing initialization scale and reducing training set size. Representation learning quality determines how dramatic grokking is — algorithmic datasets show the most dramatic grokking because generalization depends heavily on learning good representations from scratch. Constraining weight norm can almost eliminate grokking.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Code is available at https://github.com/KindXiaoming/Omnigrok, stated in Section 1."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All datasets used are standard public benchmarks: MNIST, IMDb, QM9, and synthetic teacher-student setups. No proprietary data."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is mentioned. Only PyTorch default initialization is referenced."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. Experiment details are in Appendix A but lack specific commands or scripts to run."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results are presented as single training curves and landscape plots without confidence intervals or error bars on the main claims."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are used. Claims like 't ∝ γ⁻¹' are supported by visual inspection of plots rather than formal tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports quantitative relationships with context, e.g., 'the time to reach 95% test accuracy is inversely proportional to the weight decay' (Figure 2c), and provides specific accuracy thresholds (95%, 60%, 70%) with step counts."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for the number of experimental configurations, seeds, or training samples chosen (e.g., why Ntrain=100 for teacher-student, why 1k for MNIST grokking)."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Figure 10a-b shows results across 4 seeds (seed 0-3), demonstrating seed-dependent variation in generalization time. Figure 3e shows multiple runs with explicit marking of runs that did/didn't reach the accuracy threshold."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper systematically compares standard initialization (α=1) vs large initialization, different weight decay values (0, small, large), and different training set sizes as baselines for grokking behavior."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper uses the setup of Nanda et al. (2023) for transformer experiments on modular addition, which is contemporary. Related work discussion covers recent grokking explanations (Liu et al. 2022, Thilak et al. 2022, Barak et al. 2022)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper systematically varies initialization scale (α), weight decay (γ), training data size (N), and representation messiness (m) to isolate their effects on grokking, constituting ablation-like analysis."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports training loss, test loss, training accuracy, test accuracy, weight norm, and time to generalize as separate metrics across experiments."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is irrelevant for this theoretical/empirical study of neural network training dynamics."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Explicit train/test splits are used throughout: 100/100 for teacher-student, train/test for MNIST, 50/50 for QM9, 75/25 for IMDb."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by dataset (teacher-student, MNIST, IMDb, QM9, modular addition), by initialization scale, by weight decay value, and by training set size."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper explicitly discusses when grokking fails to occur: standard initialization, large training sets, and no regularization. Section 5.2 explains why MNIST does not grok with standard setup. Section 9 discusses where the LU mechanism does not perfectly hold."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Negative results are reported: no grokking with standard initialization (α=1), no generalization with γ=0, grokking signal for IMDb is 'not as sharp as on algorithmic datasets' (Section 4), and language models have not shown clear grokking signals (Section 5.2 Discussion)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims are supported: LU mechanism is demonstrated across multiple setups (Figures 2-8), grokking on images/language/molecules is shown (Figures 3-5), representation learning's role is shown (Figures 6-8), and elimination of grokking is demonstrated (Figure 7b)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims ('LU mechanism causes grokking') and supports them through controlled experiments varying single parameters (initialization scale, weight decay, data size) while holding others constant, plus analytical derivation of the t ∝ 1/γ relationship."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper bounds its claims appropriately: 'grokking signals observed for these tasks are usually less dramatic than for algorithmic datasets' (abstract), and the Discussion in Section 5.2 notes they 'have not yet observed clear grokking signals for large language models' with specific hypotheses for why."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 6 discusses alternative explanations for grokking (slingshot mechanism, Fourier gap, random walk) and explains how the LU mechanism relates to each: 'Our conclusion supports (a)(b)(c)(d), but does not necessarily negate (e)(f).'"
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's claims match its measurements. It measures loss landscapes, accuracy curves, and weight norms, and claims are stated at that same granularity. No broader proxy gap exists."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The paper does not use pre-trained LLMs. All models are trained from scratch with architectures fully described (MLP dimensions, LSTM dimensions, GCNN layers, transformer config)."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting is used. All models are trained from scratch on supervised learning tasks."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Key hyperparameters are reported: learning rates (3×10⁻⁴, 0.001), optimizer (Adam, AdamW), architectures (5-100-100-5 MLP, width-200 depth-3 MLP, etc.), weight decay values, training steps (10⁵), batch size (200 for MNIST). Details in Appendix A."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data preprocessing is documented: teacher-student inputs drawn from N(0,I), MNIST subset selection, IMDb tokenization with 1000 most frequent words padded to length 500, QM9 50/50 split. Appendix A provides additional details."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section. The paper discusses some limitations inline (e.g., not observing grokking in LLMs, reduced trajectory being a 'thought experiment') but has no structured limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The reduced trajectory analysis assumptions (scale separation, linear representation evolution) are acknowledged as potentially unmet but not framed as validity threats."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper states specific scope boundaries: grokking in non-standard setups (reduced training size, increased initialization), grokking not yet observed in LLMs, and the discussion of why (Section 5.2). The assumptions of the reduced trajectory analysis are also stated explicitly (Table 1, Appendix D)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "All datasets are publicly available standard benchmarks (MNIST, IMDb, QM9). Code is released at GitHub for generating the synthetic data."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data generation is described: teacher-student uses Gaussian inputs fed through a teacher network, MNIST/IMDb/QM9 are standard datasets with clear subsetting procedures described."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard benchmarks or synthetic generation."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from raw data to results is documented: data generation/subsetting → constrained weight norm optimization → loss landscape computation, and separately the training dynamics experiments with specified optimizers and schedules."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is disclosed in the Acknowledgement section: Casey and Family Foundation, Foundational Questions Institute, Rothberg Family Fund, NSF Graduate Research Fellowship (Grant No. 2141064), NSF IAIFI (Grant No. PHY-2019786)."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All authors are from MIT Department of Physics / Institute for AI and Fundamental Interactions. No product being evaluated, so no conflict."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funders are foundations and NSF — none have a financial interest in whether grokking occurs or not."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper trains all models from scratch. No pre-trained model's capability is evaluated on a benchmark."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No pre-trained model is evaluated. Train/test splits are explicitly constructed by the authors."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No pre-trained model is evaluated on benchmarks. Models are trained from scratch with author-controlled data splits."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, training time, or computational cost is reported for any experiment."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No GPU hours, hardware specifications, or total compute budget is stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Figure 10a-b shows results across 4 seeds (seed 0-3) for both transformer and MLP experiments, demonstrating seed-dependent variation in generalization time."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Figure 10 shows 4 seeds per configuration. Figure 3e shows multiple runs explicitly marked."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The paper sweeps over initialization scales and weight decays but does not state the total search budget."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The paper does not claim a 'best' configuration. Instead, it systematically shows results across a range of configurations (α, γ, N) to characterize the grokking phenomenon."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical hypothesis tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "The paper does not compare its system against baselines in a competitive sense — it characterizes a phenomenon. No self-comparison bias applies."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "The paper is not comparing methods at different compute budgets. It studies training dynamics, where compute (steps) is the x-axis, not a confound."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper explicitly discusses what each dataset tests and why: algorithmic datasets test representation learning heavily (Section 5.1), MNIST tests it weakly (Section 5.2), and why this matters for grokking. The choice of datasets is motivated by the theoretical framework."
    332       }
    333     }
    334   },
    335   "claims": [
    336     {
    337       "claim": "Grokking is caused by the mismatch between training and test loss landscapes (LU mechanism), where training loss is L-shaped and test loss is U-shaped against weight norm.",
    338       "evidence": "Demonstrated across teacher-student setup (Figure 2a), MNIST (Figure 3a-c), IMDb (Figure 4a-c), QM9 (Figure 5a-c), and modular addition with transformers (Figure 9). Analytical derivation of t ∝ 1/γ in Section 2.",
    339       "supported": "strong"
    340     },
    341     {
    342       "claim": "Grokking can be induced on tasks beyond algorithmic datasets, including image classification (MNIST), sentiment analysis (IMDb), and molecular property prediction (QM9).",
    343       "evidence": "Grokking demonstrated on MNIST (Figure 3d), IMDb with LSTM (Figure 4d), and QM9 with GCNN (Figure 5d), though in non-standard setups with increased initialization and reduced training data.",
    344       "supported": "strong"
    345     },
    346     {
    347       "claim": "The time to generalize scales inversely with weight decay (t ∝ 1/γ).",
    348       "evidence": "Derived analytically in Section 2, confirmed empirically in Figure 2c (teacher-student), Figure 10a (transformer, ~2 orders of magnitude), and Figure 10b (MLP on MNIST).",
    349       "supported": "strong"
    350     },
    351     {
    352       "claim": "How dramatic grokking is depends on how much the task relies on representation learning.",
    353       "evidence": "Comparison of algorithmic dataset landscapes (Figure 6, strong dependence on representation) vs MNIST landscapes (Figure 8, weak dependence). Section 5 provides the theoretical argument via wc(bad representation) > wc(good representation).",
    354       "supported": "moderate"
    355     },
    356     {
    357       "claim": "Constraining weight norm can almost eliminate grokking on algorithmic datasets.",
    358       "evidence": "Figure 7b shows that constrained optimization at α=0.8 brings train and test accuracy curves together on modular addition with a 1-layer transformer.",
    359       "supported": "moderate"
    360     }
    361   ],
    362   "red_flags": [
    363     {
    364       "flag": "Non-standard setup for inducing grokking",
    365       "detail": "Grokking on MNIST, IMDb, and QM9 requires artificially reducing training set size and increasing initialization scale — conditions unlikely in practice. The paper acknowledges this but the title 'Omnigrok' may overstate the generality."
    366     },
    367     {
    368       "flag": "No statistical tests on key claims",
    369       "detail": "The t ∝ 1/γ relationship is assessed visually from log-log plots rather than with formal regression or statistical tests. While the relationship appears convincing, formal quantification would strengthen it."
    370     }
    371   ],
    372   "cited_papers": [
    373     {
    374       "title": "Grokking: Generalization beyond overfitting on small algorithmic datasets",
    375       "authors": ["Alethea Power", "Yuri Burda", "Harri Edwards", "Igor Babuschkin", "Vedant Misra"],
    376       "year": 2022,
    377       "arxiv_id": "2201.02177",
    378       "relevance": "Original paper that discovered grokking on algorithmic datasets, the phenomenon this paper aims to explain."
    379     },
    380     {
    381       "title": "Towards understanding grokking: An effective theory of representation learning",
    382       "authors": ["Ziming Liu", "Ouail Kitouni", "Niklas Nolte", "Eric J Michaud", "Max Tegmark", "Mike Williams"],
    383       "year": 2022,
    384       "arxiv_id": "2205.10343",
    385       "relevance": "Prior work by same authors attributing grokking to slow representation formation, which the LU mechanism extends."
    386     },
    387     {
    388       "title": "Progress measures for grokking via mechanistic interpretability",
    389       "authors": ["Neel Nanda", "Lawrence Chan", "Tom Lieberum", "Jess Smith", "Jacob Steinhardt"],
    390       "year": 2023,
    391       "arxiv_id": "2301.05217",
    392       "relevance": "Mechanistic interpretability approach to understanding grokking, whose transformer setup this paper replicates."
    393     },
    394     {
    395       "title": "Deep double descent: Where bigger models and more data hurt",
    396       "authors": ["Preetum Nakkiran", "Gal Kaplun", "Yamini Bansal", "Tristan Yang", "Boaz Barak", "Ilya Sutskever"],
    397       "year": 2021,
    398       "relevance": "Related phenomenon of double descent in model performance; paper argues the U-shape can be recovered by plotting against weight norm instead of parameter count."
    399     },
    400     {
    401       "title": "Hidden progress in deep learning: SGD learns parities near the computational limit",
    402       "authors": ["Boaz Barak", "Benjamin L Edelman", "Surbhi Goel", "Sham Kakade", "Eran Malach", "Cyril Zhang"],
    403       "year": 2022,
    404       "arxiv_id": "2207.08799",
    405       "relevance": "Alternative explanation of grokking via Fourier gap and hidden progress of SGD."
    406     }
    407   ]
    408 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs