scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23855B)
      1 {
      2   "paper": {
      3     "title": "Hidden Progress in Deep Learning: SGD Learns Parities Near the Computational Limit",
      4     "authors": ["Boaz Barak", "Benjamin L. Edelman", "Surbhi Goel", "Sham Kakade", "Eran Malach", "Cyril Zhang"],
      5     "year": 2022,
      6     "venue": "Neural Information Processing Systems",
      7     "arxiv_id": "2207.08799",
      8     "doi": "10.48550/arXiv.2207.08799"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["theoretical", "benchmark-eval"],
     13   "key_findings": "Neural networks trained with SGD can learn k-sparse parities of n bits in n^O(k) iterations, nearly matching SQ lower bounds, across a variety of architectures (MLPs, Transformers, PolyNets, single neurons). Despite flat loss curves for most of training, SGD makes hidden progress by gradually amplifying sparse features via a Fourier gap in the population gradient, rather than performing random search. The paper provides theoretical proofs for convergence on MLPs and disjoint-PolyNets, and demonstrates phase transitions and grokking phenomena in this setting.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository URL is provided in the paper. Appendix D.3 mentions implementation in PyTorch but provides no link to code."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The data is synthetically generated from well-defined distributions (sparse parity distributions over {±1}^n). The generation procedure is fully specified, making the data trivially reproducible."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Appendix D.3 mentions PyTorch and GPU types (NVIDIA Tesla P100, P40, RTX A6000) but provides no requirements.txt, library versions, or environment specification."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. Hyperparameters are scattered across the appendix but there are no scripts or README for reproducing experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Figure 10 shows 95% bootstrap confidence intervals for median convergence times. Figure 14 shows interquartile ranges with whiskers."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are performed. Claims about scaling (n^O(k)) are supported by visual inspection of log-log plots rather than formal tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports concrete scaling exponents (α such that t_c ∝ n^(αk)) in Figure 9, providing magnitude context for convergence time relationships."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The number of random trials (25 for main results, 1000 for scaling plots) is stated but not justified. No power analysis or justification for why these numbers suffice."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Variance is reported through multiple mechanisms: 95% bootstrap CIs (Figure 10), interquartile ranges (Figure 14), convergence time histograms (Figure 2), and multiple training curves from different seeds."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Random search is used as a baseline comparison (green curves in Figure 4 left and Figure 14), and SQ lower bounds serve as theoretical baselines."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baselines are the known theoretical computational limits (SQ lower bounds), which are the appropriate comparison for this problem. The paper positions against concurrent work by Telgarsky (2022) and Damian et al. (2022)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper systematically varies architectures (15+ configurations), batch sizes, learning rates, initializations, loss functions, and problem parameters (n, k) to understand which components matter."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: classification error, convergence time, Fourier gap, weight movement norm (hidden progress measure), and loss curves."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is irrelevant for a theoretical/empirical study on synthetic learning problems."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The main results use online learning (fresh i.i.d. samples at each iteration). Validation accuracy is computed on a separate batch of 2^13 = 8192 samples (Appendix D.2)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per architecture (15+ settings), per batch size, per learning rate, per (n,k) problem instance, in Figures 7-9 and extensive appendix tables."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Less robust configurations are discussed: settings (*i), (*ii), (*iii) require larger batch sizes. Figure 2 (right) shows scaling degradation at larger n. Appendix C.2 discusses counterintuitive failure modes."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results: Transformer setting (*iii) requires Adam instead of SGD, underparameterized models (r<k) cannot reach 100% accuracy, scaling exponents worsen at larger n (Figure 2 right), and layer-by-layer learning fails for deep polynomial networks (Appendix C.7)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about n^O(k) convergence, phase transitions, and hidden progress via Fourier gap are all supported by both empirical results (Figures 1, 7-10) and theoretical analysis (Theorems 4, 6, 7)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper's causal claims (SGD amplifies features via Fourier gap rather than random search) are supported by both formal proofs (Theorems 4, 6, 7) and controlled empirical ablations (varying individual parameters while holding others fixed)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper is careful to bound claims to the sparse parity setting: 'these insights extend from parity learning to more complex...problem settings' is posed as an open question (Section 6), not a claim. The broader impact statement notes the 'heavily-idealized synthetic problem setting.'"
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper explicitly considers and empirically refutes the 'stumbling in the dark' (random search) hypothesis with four lines of evidence (Section 3.2), and discusses NTK as an alternative explanation (Section 4.1, Theorem 5)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper's claims match the granularity of its measurements: convergence times, classification error, and Fourier gaps are measured and reported directly without broader framing beyond the synthetic setting."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The paper does not use pre-trained LLMs. All models are trained from scratch with fully specified architectures."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting. All experiments train neural networks from scratch."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Hyperparameters are extensively documented: learning rates, batch sizes, weight decay schedules, initialization schemes, and architecture configurations are all specified in Section 3, Appendix D.1, and figure captions."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The paper trains standard neural networks with SGD."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The data generation process is fully specified: uniform distribution over {±1}^n with parity labels. The synthetic data requires no preprocessing. Section 2 provides complete formal definitions."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 (Conclusion) discusses open questions including extension to small-batch settings, other architectures and losses, and the gap between theory and the full range of empirical settings."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The paper identifies specific limitations: the theoretical result requires sign vector initialization while experiments work with other initializations (Section 4.1), batch size scaling with n^Ω(k) is required theoretically but B=1 works empirically, and scaling exponents worsen at larger n (Figure 2 right, Appendix C.2)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states scope: 'this work largely focuses on the online learning case' (Section 6), notes the synthetic setting does not directly address real-world tasks, and poses extension to 'more complex...real-world combinatorial problem settings' as future work."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (convergence times, training logs) is made available. The synthetic data itself is trivially reproducible but experimental results are not downloadable."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data generation is fully specified: synthetic parity distributions with uniform random inputs. Experimental procedure (number of trials, convergence criteria, validation methodology) is described in Appendix D.2."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data is synthetically generated from a well-defined mathematical distribution."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from data generation to result reporting is documented: synthetic data → SGD training → convergence time measurement → aggregation (10th percentile, median). Appendix D.2 describes validation batch sizes and convergence criteria."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgements section states: 'Sham Kakade acknowledges funding from the Office of Naval Research under award N00014-22-1-2377.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Harvard University, Microsoft Research, University of Pennsylvania, Hebrew University of Jerusalem."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The Office of Naval Research funding has no apparent stake in the theoretical results about learning parities. Microsoft Research affiliation exists but the paper evaluates no Microsoft products."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate a pre-trained model's capability on any benchmark. All models are trained from scratch on synthetic data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable — models are trained from scratch on synthetic distributions, not pre-trained models evaluated on benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — the paper uses synthetic data generated from mathematical distributions, not pre-existing benchmarks that could be in training data."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No per-experiment cost or latency is reported, though the paper mentions total compute in Appendix D.3."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Appendix D.3 states: 'approximately 1500 CPU hours' for main experiments and 'approximately 200 GPU hours' for GPU-accelerated experiments, with specific GPU types listed."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Results are reported across many random seeds: 25 trials for main results (Figures 7-8), 1000 trials for scaling plots (Figure 10), and 10^6 trials for convergence time distributions (Figure 2 left)."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Number of runs is explicitly stated: 25 random trials (Figures 7-8), 1000 runs (Figure 10), 10^6 trials (Figure 2 left), 5 runs for training curves (Figure 1)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "The search space is stated: learning rates from {0.001, 0.01, 0.1, 1}, batch sizes from {1, 2, 4, ..., 1024}, three initialization schemes, three loss functions. Best learning rate is reported explicitly."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Figure 7 reports results for the best learning rate, and the search space is explicit. The paper reports results across all configurations rather than cherry-picking, showing the full grid in Figures 7-9."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No formal statistical hypothesis tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper introduces PolyNets and compares against standard architectures without discussing potential bias in this comparison. However, the main claims are about matching theoretical lower bounds rather than outperforming other methods."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "The core contribution is characterizing performance (convergence time) as a function of computational budget (iterations), shown extensively in Figures 1, 7, 10."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper extensively discusses why sparse parity is a meaningful benchmark: it has known computational limits (SQ lower bounds), separates statistical from computational difficulty, and is connected to broader phenomena like grokking and emergent capabilities."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. The paper trains neural networks with standard SGD."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "Not applicable — all models are trained from scratch on synthetic data, not pre-trained models evaluated on benchmarks."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "Not applicable — synthetic data with known ground truth. No risk of feature leakage by construction."
    349       },
    350       "non_independence_addressed": {
    351         "applies": false,
    352         "answer": false,
    353         "justification": "Not applicable — data is i.i.d. by construction from synthetic distributions."
    354       },
    355       "leakage_detection_method": {
    356         "applies": false,
    357         "answer": false,
    358         "justification": "Not applicable — no pre-trained models or pre-existing benchmarks used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "A variety of neural networks (MLPs, Transformers, sinusoidal neurons, PolyNets) successfully learn k-sparse n-dimensional parities in at most c·n^(αk) SGD steps, for small constants c, α.",
    365       "evidence": "Figures 1, 7-10 show convergence times across 15+ architecture configurations, 3 initialization schemes, multiple batch sizes and learning rates, for n≤30, k≤4. 10th percentile convergence times and median convergence times are reported.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "SGD does not learn parities via random exhaustive search ('stumbling in the dark'), but rather makes continual hidden progress via Fourier gap amplification.",
    370       "evidence": "Four empirical arguments against random search (Section 3.2): convergence times scale with k, no early convergence in histograms (Figure 2 left), sensitivity to initialization not SGD samples (Figure 2 center), elbows in scaling curves (Figure 2 right). Theoretical analysis shows population gradient encodes relevant features (Theorems 4, 6, 7).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "SGD on 2-layer MLPs of width 2^Θ(k), with batch size n^O(k), converges to a solution with at most ε error in at most 2^O(k)·poly(1/ε) iterations (Theorem 4).",
    375       "evidence": "Full proof in Appendix B.1. The result relies on Fourier gap analysis of the majority function and is proven for ReLU activation with sign vector initialization and hinge loss.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "For disjoint-PolyNets, gradient flow exhibits a phase transition: it spends a 1−o(1) fraction of convergence time with error ≥49% (Theorem 6).",
    380       "evidence": "Full proof in Appendix B.3-B.4 for both ±1 and Gaussian initializations. Supported empirically in Figure 5.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "In the finite-sample setting, neural networks exhibit grokking: delayed generalization after initial overfitting.",
    385       "evidence": "Figure 4 (right) and Figure 15 show grokking for width-100 ReLU MLPs at small sample sizes. Weight decay modulates the statistical-computational tradeoff (Appendix C.5).",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [],
    390   "cited_papers": [
    391     {
    392       "title": "Beyond the Imitation Game: Quantifying and Extrapolating the Capabilities of Language Models",
    393       "authors": ["Aarohi Srivastava", "Abhinav Rastogi"],
    394       "year": 2022,
    395       "arxiv_id": "2206.04615",
    396       "relevance": "BIG-bench benchmark identifying emergent capabilities in language models at critical scale thresholds."
    397     },
    398     {
    399       "title": "Language models are few-shot learners",
    400       "authors": ["Tom B Brown"],
    401       "year": 2020,
    402       "arxiv_id": "2005.14165",
    403       "relevance": "GPT-3 paper demonstrating emergent few-shot learning capabilities in large language models."
    404     },
    405     {
    406       "title": "Training compute-optimal large language models",
    407       "authors": ["Jordan Hoffmann"],
    408       "year": 2022,
    409       "arxiv_id": "2203.15556",
    410       "relevance": "Chinchilla scaling laws for compute-optimal training of large language models."
    411     },
    412     {
    413       "title": "Grokking: Generalization beyond overfitting on small algorithmic datasets",
    414       "authors": ["Alethea Power"],
    415       "year": 2022,
    416       "arxiv_id": "2201.02177",
    417       "relevance": "Documented grokking phenomenon of delayed generalization in neural networks on algorithmic tasks."
    418     },
    419     {
    420       "title": "Neural tangent kernel: Convergence and generalization in neural networks",
    421       "authors": ["Arthur Jacot"],
    422       "year": 2018,
    423       "relevance": "Foundational work on NTK regime; this paper shows results outside the NTK regime where feature learning is essential."
    424     },
    425     {
    426       "title": "Attention is all you need",
    427       "authors": ["Ashish Vaswani"],
    428       "year": 2017,
    429       "relevance": "Transformer architecture used as one of the evaluated architectures for learning parities."
    430     },
    431     {
    432       "title": "Failures of gradient-based deep learning",
    433       "authors": ["Shai Shalev-Shwartz", "Ohad Shamir", "Shaked Shammah"],
    434       "year": 2017,
    435       "relevance": "Prior work demonstrating computational hardness of learning parities for gradient-based algorithms."
    436     },
    437     {
    438       "title": "A mechanistic interpretability analysis of grokking",
    439       "authors": ["Neel Nanda", "Tom Lieberum"],
    440       "year": 2022,
    441       "relevance": "Complementary analysis of hidden progress in Transformers trained on arithmetic tasks exhibiting grokking."
    442     }
    443   ]
    444 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs