ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (18481B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Grokking modular arithmetic",
      6     "authors": [
      7       "A. Gromov"
      8     ],
      9     "year": 2023,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2301.02679",
     12     "doi": "10.48550/arXiv.2301.02679"
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "All abstract claims are verified: grokking occurs in 2-layer networks (Figure 1), feature maps correspond to periodic Fourier functions (Figure 2, Section 3), analytic weight expressions solve modular arithmetic (Equations 6-7, 18), and gradient descent finds these solutions (Figure 3, Section 4.1).",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Causal claims about width and data effects are supported by controlled experiments (Figure 4a/b showing scaling relationships). The mechanistic explanation via constructive/destructive interference (Equations 8-16) provides theoretical justification.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Scope is explicitly bounded: modular arithmetic on Z_p (Equation 3), 2-layer MLPs only (Equations 1-2), specific parametrization (mean-field), and acknowledged non-grokking functions (Section 2, Appendix C).",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Section 1 reviews competing explanations (slingshot mechanism [14], encoder-decoder competition [7], random feature models). Paper contrasts their constructive interference mechanism with prior accounts and shows NTK regime doesn't exhibit grokking.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": true,
     43         "justification": "Paper claims networks learn modular functions and measures test accuracy/loss directly. No proxy mismatch—accuracy on the target task is the claim and measurement.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No dedicated limitations or threats-to-validity section. Discussion of scope constraints scattered throughout (Section 2, 5.2) but not in a formal section.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Paper discusses constraints (e.g., 'unclear how to predict which functions will grok', Section 2; 'only single-hidden-layer', Section 5.2) but not systematically in one section with specific threat categories.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Explicit boundaries: modular arithmetic on Z_p, 2-layer MLPs, mean-field parametrization, MSE loss, full-batch GD. The paper acknowledges that some functions do not grok and that the extension to deeper networks is uncertain.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Funding explicitly stated: 'A.G.'s work at the University of Maryland was supported in part by NSF CAREER Award DMR-2045181, Sloan Foundation and the Laboratory for Physical Sciences.'",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Author affiliations clearly listed: Meta AI, University of Maryland (title page).",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "NSF, Sloan Foundation, and academic lab funding are independent of research outcome. No company funding the paper's findings.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No explicit competing interests statement or financial disclosures beyond funding sources. No mention of patents or equity interests.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Key terms defined: 'grokking' characterized as 'sudden jump in generalization' (Abstract, Figure 1), modular arithmetic formalized over Z_p, feature maps explained via Fourier analysis (Sections 3-4).",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Contributions explicitly stated in Abstract and Section 2: (i) minimal model exhibiting grokking without regularization, (ii) analytic weight expressions, (iii) complete interpretability via periodic features.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 1 systematically reviews prior grokking work [11,7,14,9,1,8,16], contrasting their minimal model with each approach and showing how it advances the mechanistic understanding.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "theoretical": {
    116       "formal_quality": {
    117         "assumptions_stated_explicitly": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "Architecture specified (Equations 1-2), mean-field parametrization justified (footnote 1), loss function (MSE), optimizer (vanilla GD), initialization (N(0,1)), and input encoding (one-hot) all explicit.",
    121           "source": "haiku"
    122         },
    123         "proofs_complete_or_sketched": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Claim I (modular addition) has complete derivation (Equations 8-16 with trigonometric proof). Claim II sketched as verbatim application. Corollary sketched. Empirical validation provided in Section 4.",
    127           "source": "haiku"
    128         },
    129         "bounds_tight_or_discussed": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "Paper acknowledges approximate solutions improve with width N (Section 3.1), loose bounds discussed (e.g., destructive interference condition Equation 15), and critical data fraction α_c noted as 'hard to determine precisely' (Section 4.2).",
    133           "source": "haiku"
    134         },
    135         "counterexamples_explored": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "Paper explicitly tests functions that fail: f(n,m)=n³+nm²+m never groks (Section 2, Figure 10); (n+m)² has reduced accuracy due to non-invertible operations (Section 3.2, Appendix C).",
    139           "source": "haiku"
    140         },
    141         "notation_consistent": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "Notation consistent throughout: W^(1), W^(2) for layer weights, h^(l) for preactivations, z^(l) for activations, ϕ for phases. No overloading issues.",
    145           "source": "haiku"
    146         },
    147         "constructive_vs_existence_noted": {
    148           "applies": true,
    149           "answer": true,
    150           "justification": "Solutions are constructive with explicit formulas (Equations 6-7, 18). Weights can be computed exactly; paper provides the construction and verifies correctness.",
    151           "source": "haiku"
    152         }
    153       },
    154       "connections": {
    155         "connection_to_practice_discussed": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Paper motivates grokking as a platform for studying 'fundamental questions of deep learning in a controlled setting' (Introduction), discusses interpretability for practitioners, and suggests cryptography applications (Section 5.2).",
    159           "source": "haiku"
    160         },
    161         "relationship_to_prior_theoretical_work_clear": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Paper clearly contrasts with NTK regime (where random features don't grok), relates to feature learning literature, and shows differences from competing mechanisms (encoder-decoder model, slingshot). Extends solvable model tradition.",
    165           "source": "haiku"
    166         },
    167         "computational_complexity_discussed": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Data efficiency (α scaling, critical α_c) discussed in Section 4.2. Time complexity studied empirically (epochs vs. data). Formal computational complexity not analyzed, but practical constraints addressed.",
    171           "source": "haiku"
    172         },
    173         "limitations_of_formal_model_stated": {
    174           "applies": true,
    175           "answer": true,
    176           "justification": "Model limitations acknowledged: synthetic modular arithmetic (unlike real data), limited to 2-layer networks, specific to MSE loss and vanilla GD. Section 5.2 discusses unknowns (which functions grok, extension to depth).",
    177           "source": "haiku"
    178         }
    179       }
    180     }
    181   },
    182   "claims": [
    183     {
    184       "claim": "Two-layer MLPs with quadratic activation exhibit grokking on modular arithmetic without explicit regularization under vanilla full-batch gradient descent.",
    185       "evidence": "Figure 1 shows train/test loss dynamics, Figure 1c shows delayed generalization onset. Section 2 empirically demonstrates grokking on multiple modular tasks (α-dependent).",
    186       "supported": "strong"
    187     },
    188     {
    189       "claim": "Grokking corresponds to learning periodic feature maps with Fourier frequencies 2πk/p determined by the modulus p.",
    190       "evidence": "Analytic solutions (Equations 6-7) use trigonometric basis. Figure 2 Fourier transform confirms gradient descent finds same periodic structure. Figure 5 IPR metric shows feature localization correlates with grokking.",
    191       "supported": "strong"
    192     },
    193     {
    194       "claim": "Explicit closed-form weight expressions solve modular addition and additive-decomposable functions (f(n,m)=f₁(n)+f₂(m) mod p) exactly.",
    195       "evidence": "Claim I (Equations 6-7) derived analytically, validated on multiple functions. Claim II generalizes construction. Figure 3 confirms GD finds these solutions.",
    196       "supported": "strong"
    197     },
    198     {
    199       "claim": "Gradient descent and AdamW find nearly identical solutions to the analytic expressions despite different optimization paths.",
    200       "evidence": "Figure 3 shows distribution of phase constraints (ϕ₁+ϕ₂=ϕ₃) peaked near zero for both optimizers. Figure 4b shows test accuracy of GD/AdamW solutions vs analytic.",
    201       "supported": "strong"
    202     },
    203     {
    204       "claim": "Grokking exhibits a critical data fraction α_c below which generalization does not occur, and above which generalization is rapid and complete.",
    205       "evidence": "Figure 4a shows abrupt onset of grokking time reduction at threshold. Section 4.2 discusses α_c dependency but notes precise value is 'hard to determine'.",
    206       "supported": "strong"
    207     },
    208     {
    209       "claim": "Not all modular functions are learnable by the network. Some require excessive data (α>0.9), others never generalize (e.g., f(n,m)=n³+nm²+m).",
    210       "evidence": "Section 2 states 'unclear how to predict which functions will generalize'. Appendix C Figure 10 shows f(n,m)=n³+nm²+m achieves <1% test accuracy even with α=0.9.",
    211       "supported": "strong"
    212     },
    213     {
    214       "claim": "The mechanism of grokking is constructive interference of trigonometric waves when the network learns periodic weights; destructive interference cancels spurious terms.",
    215       "evidence": "Equations 8-16 provide complete derivation. Equation 13 shows constructive interference yields δ-function. Equation 15 justifies destructive cancellation via random phase distribution.",
    216       "supported": "strong"
    217     },
    218     {
    219       "claim": "Network width N affects solution accuracy; infinite width limit needed for exact δ-function, but finite N provides good approximation.",
    220       "evidence": "Section 3.1 states solutions 'can be made increasingly more accurate' with increasing N. Figure 4b shows accuracy vs. width. Figure 5 shows IPR2 gap between theory and practice persists for finite N.",
    221       "supported": "strong"
    222     }
    223   ],
    224   "methodology_tags": [
    225     "theoretical"
    226   ],
    227   "key_findings": "The paper resolves the mystery of grokking in a minimal setting: 2-layer MLPs learn modular arithmetic by discovering periodic Fourier basis functions whose frequencies match the task's modulus. Analytic closed-form solutions exist for additive-decomposable functions, and gradient descent empirically recovers these solutions with high fidelity. The phenomenon is driven by sudden feature learning (detectable via inverse participation ratio) rather than continued optimization, and depends critically on having sufficient training data (fraction α above task-specific threshold α_c). This provides complete interpretability: networks don't use memorization or implicit regularization tricks—they solve the task exactly through constructive/destructive interference of trigonometric waves.",
    228   "red_flags": [
    229     {
    230       "flag": "Limited to synthetic tasks",
    231       "detail": "Results are specific to modular arithmetic on Z_p. Generalization to real datasets or non-algebraic tasks is unclear. The controlled setting is useful for theory but distant from practical deep learning."
    232     },
    233     {
    234       "flag": "Predictability gap",
    235       "detail": "Section 2 explicitly states: 'It is not clear how to predict which functions will generalize and which will not.' This limits theoretical completeness—no principled criterion for learnability."
    236     },
    237     {
    238       "flag": "Theory-practice gap",
    239       "detail": "Analytic solutions assume N→∞; finite-width solutions have remaining noise (Figure 5). Paper acknowledges 'appreciable N' approximation but doesn't quantify the gap."
    240     },
    241     {
    242       "flag": "Single architecture class",
    243       "detail": "Results limited to 2-layer networks with specific parametrization. Extension to depth is unknown. Section 5.2 calls this an 'open problem'."
    244     },
    245     {
    246       "flag": "Incomplete analytic coverage",
    247       "detail": "Multiplicative functions f(n,m)=g₁(n)·g₂(m) mod p have no analytic solution. Complex compositions (e.g., n³+nm²+m) don't grok—no characterization of which functions admit solutions."
    248     },
    249     {
    250       "flag": "Missing formal complexity theory",
    251       "detail": "Critical data fraction α_c measured empirically but no closed-form expression. Section 4.2 notes 'difficult to determine precisely'—limiting theoretical predictive power."
    252     }
    253   ],
    254   "cited_papers": [
    255     {
    256       "title": "Grokking: Generalization beyond overfitting on small algorithmic datasets",
    257       "relevance": "Original empirical discovery of grokking phenomenon; paper provides theoretical explanation for this effect."
    258     },
    259     {
    260       "title": "Towards understanding grokking: An effective theory of representation learning",
    261       "relevance": "Prior mechanistic theory via encoder-decoder competition; paper contrasts and extends with constructive interference explanation."
    262     },
    263     {
    264       "title": "The slingshot mechanism: An empirical study of adaptive optimizers and the grokking phenomenon",
    265       "relevance": "Competing explanation for grokking via slingshot mechanism in Adam; paper shows grokking occurs without slingshot in vanilla GD."
    266     },
    267     {
    268       "title": "Hidden progress in deep learning: SGD learns parities near the computational limit",
    269       "relevance": "Theoretical work on learning sparse parity (algorithmic task); shares complexity-phase-transition themes with grokking."
    270     },
    271     {
    272       "title": "Wide neural networks of any depth evolve as linear models under gradient descent",
    273       "relevance": "Neural tangent kernel regime; paper shows NTK (random features) does NOT exhibit grokking, implying feature learning is essential."
    274     },
    275     {
    276       "title": "The large learning rate phase of deep learning: the catapult mechanism",
    277       "relevance": "Feature learning phase transitions under different initialization regimes; related mechanistic phenomenon."
    278     },
    279     {
    280       "title": "Feature learning in infinite-width neural networks",
    281       "relevance": "Theoretical framework for understanding when networks learn features vs. remain in kernel regime."
    282     },
    283     {
    284       "title": "A mean field view of the landscape of two-layers neural networks",
    285       "relevance": "Theoretical foundation for mean-field parametrization used in this paper's architecture."
    286     }
    287   ],
    288   "engagement_factors": {
    289     "practical_relevance": {
    290       "score": 1,
    291       "justification": "Results are on synthetic modular arithmetic. While interpretability is valuable, direct applicability to real tasks is unclear; requires further theoretical work to bridge to practice."
    292     },
    293     "surprise_contrarian": {
    294       "score": 3,
    295       "justification": "Highly surprising: a mysterious empirical phenomenon (grokking) becomes fully explained and interpretable via exact analytic solutions. This is a significant theoretical breakthrough."
    296     },
    297     "fear_safety": {
    298       "score": 0,
    299       "justification": "No AI safety implications. Pure mechanistic understanding of a learning phenomenon with no alignment or risk relevance."
    300     },
    301     "drama_conflict": {
    302       "score": 1,
    303       "justification": "Academic interest in resolving competing mechanistic theories (encoder-decoder vs. slingshot vs. interference), but no broader controversy or societal conflict angle."
    304     },
    305     "demo_ability": {
    306       "score": 2,
    307       "justification": "Simple enough to reproduce (2-layer network, modular arithmetic), but limited audience appeal outside mechanistic interpretability specialists. Results don't produce striking visualizations or demos."
    308     },
    309     "brand_recognition": {
    310       "score": 2,
    311       "justification": "Author affiliated with Meta AI, but this is a single-author theoretical paper. Medium institutional recognition."
    312     }
    313   },
    314   "hn_data": {
    315     "threads": [],
    316     "top_points": 0,
    317     "total_points": 0,
    318     "total_comments": 0
    319   }
    320 }

Impressum · Datenschutz