scan.json (18458B)
1 { 2 "paper": { 3 "title": "Grokking modular arithmetic", 4 "authors": ["Andrey Gromov"], 5 "year": 2023, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2301.02679", 8 "doi": "10.48550/arXiv.2301.02679" 9 }, 10 "scan_version": 2, 11 "active_modules": [], 12 "methodology_tags": ["theoretical"], 13 "key_findings": "The paper presents analytic weight solutions for two-layer fully-connected networks that solve modular addition and related tasks, exhibiting grokking under vanilla gradient descent with MSE loss and no regularization. The learned features are periodic functions with frequencies determined by the modular base, and grokking corresponds to the network learning these specific Fourier features. Width increases accuracy through better destructive interference, and regularization/adaptive optimizers reduce grokking time and data requirements but are not necessary.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No code repository URL or archive is provided anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The data is deterministically generated modular arithmetic tables (e.g., all pairs (n,m) in Z_p). The generation procedure is fully specified — no external dataset is needed." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, library versions, or dependency information is provided." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions or scripts are provided. The setup is described mathematically but there are no concrete implementation details." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results are presented as single curves and point estimates (e.g., accuracy vs. width, learning curves) without confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": false, 45 "answer": false, 46 "justification": "The paper does not make comparative claims between competing methods — it presents analytic solutions and shows gradient descent finds them. No statistical tests are needed." 47 }, 48 "effect_sizes_reported": { 49 "applies": false, 50 "answer": false, 51 "justification": "The paper is primarily theoretical, presenting analytic solutions rather than comparative effect sizes." 52 }, 53 "sample_size_justified": { 54 "applies": false, 55 "answer": false, 56 "justification": "This is a theoretical paper. The dataset size is the full modular arithmetic table (p² points), which is deterministic, not sampled." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Experimental results (learning curves, accuracy vs. width) appear to be single-run results with no variance or standard deviation reported across runs or seeds." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares GD, AdamW, and the analytic solution, and discusses prior work using transformers and other architectures on the same tasks." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines reference contemporary work: Power et al. (original grokking), Liu et al., and Thilak et al., all from 2022." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper varies multiple components: activation functions (quadratic, ReLU, GELU), optimizers (GD, AdamW, SGD with various regularizers), width N, and data fraction α (Fig. 4, Fig. 6)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper reports train/test loss, train/test accuracy, weight norms, gradient norms, IPR (inverse participation ratio), and Fourier analysis of preactivations." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is irrelevant to a theoretical paper about neural network dynamics on synthetic data." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The dataset is explicitly split into train (D_train) and test (D_test) subsets with fraction α = |D_train|/|D| (Eq. 3), and test accuracy/loss are reported separately." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by modular function type (n+m, n²+m², (n+m)², nm, n²+m²+nm, n³+nm²+m) with different behaviors noted for each (Section 3.2, Appendix C)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses functions that cannot be grokked (f(n,m) = n³+nm²+m never rises above 1% test accuracy, Fig. 10) and functions with incomplete grokking (n²+m²+nm reaches ~97% with a persistent train-test gap)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that certain functions cannot be learned (n³+nm²+m), that the analytic solution for modular square root only achieves ~50% accuracy due to non-invertibility, and that no analytic solution was found for multiplication." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract's claims — (i) grokking with MSE/no regularization, (ii) feature maps determined by task, (iii) analytic weight expressions, (iv) GD/AdamW find these features — are all supported by the analytic derivations and experiments in Sections 2-4." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about what features the network learns and why grokking occurs. These are justified by the analytic solution (constructive/destructive interference mechanism) and empirical verification that GD finds the same solution (Fig. 2, Fig. 3)." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper clearly bounds results to specific function classes (f1(n)+f2(m) mod p and F(f1(n)+f2(m)) mod p), explicitly states which functions it cannot solve analytically (multiplication), and which functions the networks fail to grok at all." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 1 reviews competing explanations for grokking (slingshot mechanism, encoder-decoder competition, regularization necessity) and Section 5.1 addresses them: e.g., grokking occurs without learnable encoders contrary to Liu et al., and without regularization contrary to Nanda & Lieberum." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper directly measures test accuracy on modular arithmetic tasks and claims the network learns modular arithmetic — the measurement matches the claim with no proxy gap." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": false, 141 "answer": false, 142 "justification": "The paper trains its own simple neural networks from scratch — no pre-trained models or API versions are involved." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "The paper does not use prompting. It trains neural networks directly." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Key hyperparameters like learning rate, number of epochs, and specific optimizer settings are not explicitly reported. Only α (data fraction) and N (width) are specified for experiments." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Data preprocessing is fully described: one-hot encoding of integers in Z_p, input dimension 2p, output dimension p, random train/test split with fraction α (Section 2)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 5.2 'Discussions' extensively discusses open problems including complexity classes of modular functions, scaling to many variables, role of depth, and connection to real-world datasets." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "The paper identifies specific limitations: analytic solutions only work for single-hidden-layer networks, the exact solution for multiplication is unknown, certain functions cannot be grokked at all, and the connection to real-world feature learning is speculative." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper explicitly states which function classes are solved (f1(n)+f2(m) mod p) vs. not (multiplication, mixed additive-multiplicative), that solutions are for single-hidden-layer networks only, and that the analytic solution is approximate (improving with width)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": false, 185 "answer": false, 186 "justification": "Data is deterministically generated modular arithmetic tables — no raw data release is needed as anyone can reconstruct it from the description." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data generation is fully specified: all p² pairs (n,m) for modular arithmetic over Z_p, with one-hot encoding (Section 2)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants; data is synthetic/deterministic." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is simple and fully documented: generate all p² data points, one-hot encode, split into train/test with fraction α." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Acknowledgments section discloses NSF CAREER Award DMR-2045181, Sloan Foundation, and Laboratory for Physical Sciences through the Condensed Matter Theory Center." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliation with Meta AI and University of Maryland is clearly stated on the first page." 
214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "Funders are NSF, Sloan Foundation, and Laboratory for Physical Sciences — none have a financial stake in the grokking results." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is provided. The author works at Meta AI which develops neural networks, but no financial interest disclosure is made." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper trains its own networks from scratch on synthetic data — no pre-trained model evaluation on benchmarks." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "No pre-trained model benchmark evaluation. Train/test split is controlled by the authors." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "No pre-trained model benchmark evaluation." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 
273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "This is a theoretical paper. Cost is irrelevant." 285 }, 286 "compute_budget_stated": { 287 "applies": false, 288 "answer": false, 289 "justification": "This is a theoretical paper studying toy neural networks." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "Fully-connected two-layer networks exhibit grokking on modular arithmetic tasks under vanilla gradient descent with MSE loss and no regularization.", 296 "evidence": "Fig. 1 shows training dynamics with delayed generalization onset. Section 2 describes the minimal setup. Multiple modular functions are tested.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Analytic weight expressions (periodic cosine functions) solve modular addition and the class f(n,m) = f1(n) + f2(m) mod p.", 301 "evidence": "Claim I (Eqs. 6-7) with full derivation through Eqs. 8-16 showing constructive/destructive interference mechanism. Claim II (Eq. 18) generalizes to arbitrary single-variable functions.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Gradient descent and AdamW find approximately the same solution as the analytic one.", 306 "evidence": "Fig. 2 shows matching Fourier structure between GD-found and analytic preactivations. Fig. 3 shows the phase constraint ϕ1+ϕ2−ϕ3≈0 is satisfied by trained networks.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "Regularization and adaptive optimizers reduce grokking time and critical data fraction but are not necessary for grokking.", 311 "evidence": "Fig. 4a shows grokking time vs. data fraction for GD, GD+momentum, AdamW. 
Section 2 states grokking occurs with full-batch GD and no regularization.", 312 "supported": "strong" 313 }, 314 { 315 "claim": "Some modular functions (e.g., n³+nm²+m) cannot be grokked by the tested architectures.", 316 "evidence": "Fig. 10 shows generalization never rises above 1% for this function. Section 3.2 discusses complexity classes of modular functions.", 317 "supported": "moderate" 318 } 319 ], 320 "red_flags": [ 321 { 322 "flag": "No error bars or multiple-run results", 323 "detail": "All experimental results appear to be single runs. For a paper about training dynamics which can be sensitive to initialization and hyperparameters, the absence of variance across seeds is a concern." 324 }, 325 { 326 "flag": "Missing hyperparameter details", 327 "detail": "Key training hyperparameters (learning rate, number of epochs, specific optimizer parameters beyond what's shown in figure captions) are not systematically reported, making reproduction difficult." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "Grokking: Generalization beyond overfitting on small algorithmic datasets", 333 "authors": ["Alethea Power", "Yuri Burda", "Harri Edwards", "Igor Babuschkin", "Vedant Misra"], 334 "year": 2022, 335 "arxiv_id": "2201.02177", 336 "relevance": "Original grokking discovery paper studying how transformers learn algorithmic datasets — foundational work on delayed generalization." 337 }, 338 { 339 "title": "Towards understanding grokking: An effective theory of representation learning", 340 "authors": ["Ziming Liu", "Ouail Kitouni", "Niklas Nolte", "Eric J Michaud", "Max Tegmark", "Mike Williams"], 341 "year": 2022, 342 "arxiv_id": "2205.10343", 343 "relevance": "Develops theoretical framework for grokking as encoder-decoder competition — relevant to understanding neural network representation learning." 
344 }, 345 { 346 "title": "A mechanistic interpretability analysis of grokking", 347 "authors": ["Neel Nanda", "Tom Lieberum"], 348 "year": 2022, 349 "relevance": "Reverse-engineers the algorithm learned by transformers for modular addition — directly relevant to mechanistic interpretability of neural networks." 350 }, 351 { 352 "title": "Hidden progress in deep learning: SGD learns parities near the computational limit", 353 "authors": ["Boaz Barak", "Benjamin L Edelman", "Surbhi Goel", "Sham Kakade", "Eran Malach", "Cyril Zhang"], 354 "year": 2022, 355 "arxiv_id": "2207.08799", 356 "relevance": "Theoretical analysis of grokking on sparse parity problems with scaling laws for grokking time." 357 }, 358 { 359 "title": "Neural tangent kernel: Convergence and generalization in neural networks", 360 "authors": ["Arthur Jacot", "Franck Gabriel", "Clément Hongler"], 361 "year": 2018, 362 "relevance": "Foundational work on NTK theory — relevant because grokking requires feature learning beyond the NTK regime." 363 } 364 ] 365 }