scan-v5.json (21608B)
1 { 2 "scan_version": 5, 3 "paper_type": "theoretical", 4 "paper": { 5 "title": "Hidden Progress in Deep Learning: SGD Learns Parities Near the Computational Limit", 6 "authors": [ 7 "B. Barak", 8 "Benjamin L. Edelman", 9 "Surbhi Goel", 10 "S. Kakade", 11 "Eran Malach", 12 "Cyril Zhang" 13 ], 14 "year": 2022, 15 "venue": "Neural Information Processing Systems", 16 "arxiv_id": "2207.08799", 17 "doi": "10.48550/arXiv.2207.08799" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "All abstract claims — phase transitions in training curves, nO(k) convergence matching SQ lower bounds, and the Fourier gap feature-amplification mechanism — are backed by Empirical Finding 1, Theorems 4 and 7, and the hidden progress measure experiments in Section 5.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "The causal claim that 'SGD amplifies the sparse solution via a Fourier gap' is supported by controlled ablations (no early convergence, sensitivity to initialization not SGD randomness, elbows in scaling curves) plus formal theorems giving a mechanistic account.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "The paper explicitly scopes results to synthetic sparse parity on small instances (n≤40 for most experiments) and states in the conclusion that extending to real-world combinatorial settings 'is an open problem,' bounding generalizations appropriately.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": false, 41 "answer": false, 42 "justification": "The paper is primarily theoretical with empirical validation on a synthetic problem; the 'alternative explanations' question is NA as there are no empirical claims about real-world phenomena where rival interpretations would apply.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper measures convergence time (iterations to 0 classification error) and claims exactly that — how quickly SGD solves the parity task — with no conflation between proxy and actual measure.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": false, 56 "justification": "There is no dedicated limitations or threats-to-validity section; limitations are discussed in the conclusion (online learning coupling, small-batch theory gap, synthetic setting) but not as a standalone section.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "The conclusion identifies specific gaps: Theorem 4 requires batch size nΩ(k) while empirically small batches work; sign-vector initialization is required by theory but not matched in all experiments; extending to non-disjoint cases for the PolyNet is open.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper explicitly states it focuses on the online learning case (coupling time and samples), synthetic parity distributions only, and that 'understanding the extent to which these insights extend from parity learning to more complex settings' is deferred to future work.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": true, 76 "justification": "The acknowledgements state 'Sham Kakade acknowledges funding from the Office of Naval Research under award N00014-22-1-2377,' though funding for the other five authors is not disclosed.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "All six authors have institutional affiliations listed on the title page: Harvard University, Microsoft Research, University of Pennsylvania, and Hebrew University of Jerusalem.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": true, 88 "justification": "The ONR grant is for academic research with no product stake; Microsoft Research employment (Cyril Zhang) is an affiliation not a funder, and the research is not evaluating a Microsoft product.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "There is no competing interests or financial interests declaration in the paper; the acknowledgements mention only one grant and make no statement about patents, equity, or consulting relationships.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "Sparse parity, SQ model, Fourier gap, phase transition, convergence time, and all architectural variants are formally defined in Section 2 and Appendix A before use.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 1.1 explicitly lists contributions: empirical findings on SGD learning parities, Informal Theorems 2 and 3, the hidden progress measure framework, and further empirical explorations including grokking and depth counterexamples.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper has extensive related work in Section 1.2 and Appendix A.3, directly comparing and contrasting with NTK analysis, prior parity learning results (Daniely & Malach 2020, Frei et al. 2022), grokking (Power et al. 2022), and the statistical mechanics ML tradition.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "theoretical": { 121 "formal_quality": { 122 "assumptions_stated_explicitly": { 123 "applies": true, 124 "answer": true, 125 "justification": "Theorem 4 states all conditions explicitly: specific width (r = Ω(2k k log(k/ϵ))), batch size (B = Ω(nk log(n/ϵ))), initialization scheme, and even/odd k constraint; Theorem 7 likewise enumerates all preconditions.", 126 "source": "haiku" 127 }, 128 "proofs_complete_or_sketched": { 129 "applies": true, 130 "answer": true, 131 "justification": "Full proofs with all lemmas (Lemmas 1–7, Propositions 9) are given in Appendix B, spanning 30+ pages; main text provides informal theorem statements and proof sketches referencing the appendix.", 132 "source": "haiku" 133 }, 134 "bounds_tight_or_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "The paper explicitly discusses that Theorem 4's nΩ(k) batch size requirement may be loose relative to empirical small-batch success, and that matching the SQ lower bound is 'near' but not exact; tightness gaps are acknowledged as open problems.", 138 "source": "haiku" 139 }, 140 "counterexamples_explored": { 141 "applies": true, 142 "answer": true, 143 "justification": "Appendix C.7 constructs an explicit counterexample showing where greedy layer-by-layer learning fails while end-to-end SGD succeeds; the paper also tests settings outside the theory (non-sign initialization, small batch) to map the theory's boundaries.", 144 "source": "haiku" 145 }, 146 "notation_consistent": { 147 "applies": true, 148 "answer": true, 149 "justification": "Notation is defined once in Section 2 (χS, DS, Fourier coefficients, gradient update rule) and used consistently throughout proofs; the paper explicitly flags when different layers use different learning rates.", 150 "source": "haiku" 151 }, 152 "constructive_vs_existence_noted": { 153 "applies": true, 154 "answer": true, 155 "justification": "Theorems 4 and 7 are constructive — they give explicit initialization schemes, learning rate schedules, and batch sizes; the paper notes that the PolyNet analysis gives exact trajectory characterization while the MLP result uses a non-standard initialization.", 156 "source": "haiku" 157 } 158 }, 159 "connections": { 160 "connection_to_practice_discussed": { 161 "applies": true, 162 "answer": true, 163 "justification": "Section 5 and the conclusion discuss implications for understanding emergence in large language models, grokking in practice, and the role of computational (vs. statistical) scaling — connecting the synthetic results to observed phenomena in real deep learning.", 164 "source": "haiku" 165 }, 166 "relationship_to_prior_work_clear": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly positions itself relative to NTK analysis (Theorem 5 shows NTK requires nΩ(k) width), prior parity learning work (Daniely & Malach 2020 assumes favorable input distribution; this paper does not), and the statistical mechanics tradition.", 170 "source": "haiku" 171 }, 172 "computational_complexity_discussed": { 173 "applies": true, 174 "answer": true, 175 "justification": "Computational complexity is the central subject: SQ lower bounds of nΩ(k), the gap between statistical (k log n) and computational limits, and how SGD achieves nO(k) convergence nearly matching these bounds are all analyzed in detail.", 176 "source": "haiku" 177 }, 178 "limitations_of_formal_model_stated": { 179 "applies": true, 180 "answer": true, 181 "justification": "The paper explicitly states the disjoint-PolyNet is 'idealized' (restricts S to one index per partition), that Theorem 4's sign-vector initialization is non-standard, and that the small-batch regime 'would require a better understanding of anti-concentration behavior of Boolean Fourier coefficients' — left as open.", 182 "source": "haiku" 183 } 184 } 185 } 186 }, 187 "claims": [ 188 { 189 "claim": "A wide variety of neural networks (MLPs, Transformers, PolyNets, single-neuron architectures) solve the k-sparse n-parity problem in nO(k) iterations, nearly matching the SQ lower bound.", 190 "evidence": "Empirical Finding 1 and Figure 1 (right) show median convergence times scaling as c·nαk across all 18 architecture configurations for n≤40, k≤8.", 191 "supported": "strong" 192 }, 193 { 194 "claim": "SGD does not learn sparse parities via random search (Langevin-like mechanism), but instead makes continual hidden progress via feature amplification.", 195 "evidence": "Four empirical observations: convergence times scale with k (not random), no early-convergence lucky runs, sensitivity to initialization not SGD stochasticity, elbows in scaling curves — plus formal Theorems 4 and 7.", 196 "supported": "strong" 197 }, 198 { 199 "claim": "SGD amplifies the correct sparse features through a Fourier gap in the population gradient at initialization.", 200 "evidence": "Theorem 4 proves this for ReLU MLPs with sign-vector initialization; Proposition 9 formalizes the Fourier gap → feature recoverability link; Figure 11–12 show Fourier gaps empirically for other activations.", 201 "supported": "strong" 202 }, 203 { 204 "claim": "Gradient flow on disjoint-PolyNets exhibits a phase transition where 1−o(1) fraction of training time is spent with error ≥49%.", 205 "evidence": "Theorem 6 (full version in Appendix B.4) formally proves T(0.49)/T(0) = 1 − O((n′)1−k/2) for k≥3; Figure 5 (left) shows the corresponding training curve.", 206 "supported": "strong" 207 }, 208 { 209 "claim": "Increasing model width does not yield parallel speedups in convergence time, unlike random search.", 210 "evidence": "Figure 4 (left) and Figure 14 show that convergence time plateaus at large widths rather than decreasing as 1/r, across n∈{30,40,50}, k=3.", 211 "supported": "moderate" 212 }, 213 { 214 "claim": "Grokking (delayed generalization after training overfitting) emerges in the finite-sample setting for sparse parity learning.", 215 "evidence": "Figure 4 (right) and Appendix C.5 show grokking training curves for m≪nk sample sizes; described as 'preliminary' findings.", 216 "supported": "moderate" 217 } 218 ], 219 "methodology_tags": [ 220 "theoretical", 221 "benchmark-eval" 222 ], 223 "key_findings": "SGD learns k-sparse n-bit parity — a canonical computationally hard problem — in approximately nO(k) iterations across diverse architectures, nearly matching the statistical query lower bound despite no explicit sparse prior. Rather than performing random search ('stumbling in the dark'), SGD makes continual hidden progress by gradually amplifying the correct sparse features via a Fourier gap in the population gradient, even while loss and accuracy metrics appear flat. This explains phase transitions in training curves as a natural consequence of sequential feature amplification, not stochastic luck. The work provides both formal convergence theorems (for ReLU MLPs and disjoint-PolyNets) and a rich empirical characterization that also surfaces grokking and a counterexample to layer-by-layer learning.", 224 "red_flags": [ 225 { 226 "flag": "Theory-practice gap", 227 "detail": "Theorem 4 requires batch size nΩ(k) and sign-vector initialization, but the paper also demonstrates empirically that small batches (B=1) and standard random initialization work — the theory does not cover the practically relevant regime." 228 }, 229 { 230 "flag": "Synthetic-only setting", 231 "detail": "All experiments use the synthetic sparse parity distribution; the connection to real-world emergence in LLMs is purely speculative and explicitly deferred as future work." 232 }, 233 { 234 "flag": "Small-scale experiments", 235 "detail": "Most convergence time experiments use n≤40; the paper notes in Figure 2 (right) that the nΘ(k) power-law scaling deteriorates for larger n, raising questions about whether the claimed near-optimality holds at scale." 236 }, 237 { 238 "flag": "Incomplete funding disclosure", 239 "detail": "Only one author's funding (Kakade via ONR) is disclosed; Microsoft Research affiliation for Cyril Zhang is listed but no statement on whether this work was conducted as part of employment." 240 } 241 ], 242 "cited_papers": [ 243 { 244 "title": "Grokking: Generalization beyond overfitting on small algorithmic datasets", 245 "relevance": "Introduces grokking phenomenon studied in Section 5; directly connected as this paper provides a theoretical and empirical account of the same phase-transition dynamics." 246 }, 247 { 248 "title": "Language models are few-shot learners (GPT-3)", 249 "relevance": "Primary motivation for studying emergent capabilities and phase transitions; cited as the canonical example of discontinuous improvement with scale." 250 }, 251 { 252 "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models (BIG-bench)", 253 "relevance": "Documents emergent capabilities at scale that this paper seeks to theoretically explain through the sparse parity lens." 254 }, 255 { 256 "title": "Neural tangent kernel: Convergence and generalization in neural networks", 257 "relevance": "The NTK regime is the foil for this paper's results; Theorem 5 proves NTK requires nΩ(k) width, establishing that their results lie outside the NTK regime." 258 }, 259 { 260 "title": "Learning parities with neural networks (Daniely and Malach 2020)", 261 "relevance": "Most directly related prior work on parity learning with NNs; key difference is they assume favorable (non-uniform) input distributions, while this paper uses the hard uniform distribution." 262 }, 263 { 264 "title": "A mechanistic interpretability analysis of grokking (Nanda and Lieberum 2022)", 265 "relevance": "Concurrent work analyzing hidden progress in Transformers on arithmetic tasks, corroborating this paper's findings in a different setting." 266 }, 267 { 268 "title": "High-dimensional asymptotics of feature learning: How one gradient step improves the representation (Ba et al. 2022)", 269 "relevance": "Studies the first gradient step for feature learning, directly related to the paper's Fourier gap mechanism which also operates via the initial population gradient." 270 }, 271 { 272 "title": "Quantifying the benefit of using differentiable learning over tangent kernels (Malach et al. 2021)", 273 "relevance": "Shows feature learning beyond NTK is essential; foundational to the paper's argument that low-width feature learning (not fixed-kernel) drives parity learning." 274 }, 275 { 276 "title": "Efficient noise-tolerant learning from statistical queries (Kearns 1998)", 277 "relevance": "Establishes the SQ lower bound of nΩ(k) that this paper's algorithm nearly matches; the SQ framework is central to the paper's computational complexity analysis." 278 } 279 ], 280 "engagement_factors": { 281 "practical_relevance": { 282 "score": 1, 283 "justification": "Provides conceptual insight into why phase transitions occur in deep learning, but the synthetic parity setting and specialized theory offer limited direct practitioner guidance." 284 }, 285 "surprise_contrarian": { 286 "score": 3, 287 "justification": "Directly challenges the intuitive 'random search' explanation for phase transitions and demonstrates that SGD makes mathematically structured hidden progress invisible to standard metrics — a genuinely counterintuitive finding." 288 }, 289 "fear_safety": { 290 "score": 0, 291 "justification": "The paper explicitly states 'we see no direct societal impacts' and the work is purely foundational theory on a synthetic problem." 292 }, 293 "drama_conflict": { 294 "score": 1, 295 "justification": "Mild theoretical drama in refuting the Langevin/random-search hypothesis; no community controversy or competing claims." 296 }, 297 "demo_ability": { 298 "score": 1, 299 "justification": "Parity learning experiments are reproducible and well-documented (PyTorch, ~1500 CPU hours), but there is no interactive demo or immediate practitioner application." 300 }, 301 "brand_recognition": { 302 "score": 2, 303 "justification": "Boaz Barak is a prominent theoretician; Microsoft Research and Harvard affiliations add credibility; NeurIPS venue; no famous product association." 304 } 305 }, 306 "hn_data": { 307 "threads": [ 308 { 309 "hn_id": "40975320", 310 "title": "Large models of what? Mistaking engineering achievements for linguistic agency", 311 "points": 184, 312 "comments": 156, 313 "url": "https://news.ycombinator.com/item?id=40975320", 314 "created_at": "2024-07-16T10:54:31Z" 315 }, 316 { 317 "hn_id": "41371123", 318 "title": "Is the Hubble crisis connected with the extinction of dinosaurs? (2022)", 319 "points": 6, 320 "comments": 0, 321 "url": "https://news.ycombinator.com/item?id=41371123", 322 "created_at": "2024-08-27T18:35:38Z" 323 }, 324 { 325 "hn_id": "25080547", 326 "title": "Discovering Reinforcement Learning Algorithms", 327 "points": 4, 328 "comments": 0, 329 "url": "https://news.ycombinator.com/item?id=25080547", 330 "created_at": "2020-11-13T09:14:29Z" 331 }, 332 { 333 "hn_id": "42657991", 334 "title": "Progress in Deep Learning: SGD Learns Parities Near the Computational Limit", 335 "points": 2, 336 "comments": 0, 337 "url": "https://news.ycombinator.com/item?id=42657991", 338 "created_at": "2025-01-10T17:55:55Z" 339 }, 340 { 341 "hn_id": "32795736", 342 "title": "No Grammar to Rule Them All: A Survey of JSON-Style DSLs for Visualization", 343 "points": 2, 344 "comments": 0, 345 "url": "https://news.ycombinator.com/item?id=32795736", 346 "created_at": "2022-09-11T00:27:05Z" 347 }, 348 { 349 "hn_id": "44561609", 350 "title": "One Token to Fool LLM-as-a-Judge", 351 "points": 2, 352 "comments": 0, 353 "url": "https://news.ycombinator.com/item?id=44561609", 354 "created_at": "2025-07-14T15:51:17Z" 355 }, 356 { 357 "hn_id": "39484586", 358 "title": "What Artificial Neural Networks Can Tell Us About Human Language Acquisition", 359 "points": 2, 360 "comments": 0, 361 "url": "https://news.ycombinator.com/item?id=39484586", 362 "created_at": "2024-02-23T18:47:05Z" 363 }, 364 { 365 "hn_id": "23982601", 366 "title": "Discovering Reinforcement Learning Algorithms", 367 "points": 2, 368 "comments": 0, 369 "url": "https://news.ycombinator.com/item?id=23982601", 370 "created_at": "2020-07-29T01:30:36Z" 371 }, 372 { 373 "hn_id": "27891525", 374 "title": "Detecting Oxbow Code in Erlang Codebases with the Highest Degree of Certainty", 375 "points": 1, 376 "comments": 0, 377 "url": "https://news.ycombinator.com/item?id=27891525", 378 "created_at": "2021-07-20T09:13:52Z" 379 } 380 ], 381 "top_points": 184, 382 "total_points": 205, 383 "total_comments": 156 384 } 385 }