scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25954B)
      1 {
      2   "paper": {
      3     "title": "Nested Learning: The Illusion of Deep Learning Architectures",
      4     "authors": ["Ali Behrouz", "Meisam Razaviyayn", "Peilin Zhong", "Vahab Mirrokni"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025",
      7     "arxiv_id": "2512.24695",
      8     "doi": "10.48550/arXiv.2512.24695"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper uses publicly available benchmarks: CLINC, Banking, DBpedia, RULER, BABILong, MTOB, Manchu, FineWeb-Edu, ImageNet-21K, and standard common-sense reasoning datasets. No proprietary datasets were created."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. Hardware and library versions are not specified."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Results in Tables 1-6 and Figures 6-12 report point estimates only, with no confidence intervals or error bars on the main results."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims Hope outperforms baselines across multiple benchmarks but provides no statistical significance tests. Comparisons are made by comparing raw numbers."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Results are reported with baseline context, showing absolute performance numbers for both the proposed method and baselines (e.g., Table 2 shows perplexity and accuracy for all models), allowing readers to compute relative improvements."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification is given for the number of tokens used for training (30B/100B), the number of benchmark instances evaluated, or why these sample sizes are sufficient."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No standard deviations, variance across seeds, or spread measures are reported for any experiment. All results appear to be single-run numbers."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Extensive baselines are included across all experiments: Transformer++, Samba, RetNet, DeltaNet, RWKV-7, Comba, TTT, Miras, DLA, Titans, GPT-4, Llama-8B, RMT, ARMT, and others (Tables 1-5, Figures 6-9)."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include very recent models: RWKV-7 (2025), Comba (2025), Titans (2025), Miras (2025), DLA (2025), GPT-4o-mini, all published in 2024-2025."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Table 6 provides an ablation study removing individual components: DGD, momentum, weight decay, CMS, and inner projections for k, v, q, showing each contributes to performance."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics are used: perplexity (Wikitext, LMB), accuracy (PIQA, HellaSwag, WinoGrande, ARC, SIQA, BoolQ), ChRF for translation, and task-specific metrics for NIAH, BABILong, MAD, and formal language tasks."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Human evaluation is not relevant to the claims made here, which are about architecture design, memory systems, and performance on automated benchmarks."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Standard benchmark test sets are used (RULER, BABILong, common-sense reasoning benchmarks). The paper follows established evaluation protocols from prior work."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are broken down per task across multiple tables: NIAH variants (S-NIAH-1/2/3, MK/MQ/MV) in Table 1, individual reasoning benchmarks in Table 2, individual recall tasks in Tables 3-4, and formal language tasks in Table 5."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper discusses limitations: Hope's MK-NIAH and multi-value performance lags behind Transformers (Table 1), short in-context recall tasks show a gap with Transformers (Table 3), and M3 optimizer has computational overhead (Figure 12, Section 10)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The ablation study (Table 6) identifies components whose removal hurts performance. The paper also notes that the M3 optimizer is slower than Muon (Figure 12) and that Hope underperforms Transformers on certain recall tasks (Table 3)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims 'promising results in language modeling, knowledge incorporation, and few-shot generalization tasks, continual learning, and long-context reasoning tasks,' which are supported by experimental sections 9.1-9.6."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Causal claims like 'CMS design helps with continual learning' are supported by controlled ablation experiments (Table 6, Figure 7) with single-variable manipulation. The paper isolates individual components and measures their effect."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title 'The Illusion of Deep Learning Architectures' and broad claims about 'all modern architectures' being uniform are not bounded to the tested settings. Experiments use models up to 1.3B parameters, but claims extend to general deep learning."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper does not substantively discuss alternative explanations for why Hope outperforms baselines. Could the improvements come from simply having more parameters? Different training dynamics? These are not explored."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper measures specific benchmarks and generally states results in terms of those benchmarks (e.g., 'performance on BABILong', 'accuracy on common-sense reasoning'). Claims about 'continual learning' are tested with specific continual learning benchmarks."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Model sizes are specified (760M, 1.3B parameters) and backbone models are named (Llama3-8B, Llama-3B for continual learning experiments). GPT-4 and GPT-4o-mini are used as baselines in BABILong."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "The paper does not use prompting — it trains and evaluates neural architectures directly on their respective tasks."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Training details include: vocabulary size 32K, AdamW optimizer, 'tuned learning rate for each model,' 'default optimizer configuration in Behrouz et al. (2025c),' 50B tokens for NIAH experiments, 30B/100B tokens for LM experiments, 15B tokens for continual pre-training."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. The paper presents neural architecture designs trained end-to-end."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "Training data is described as 'a mixture of FineWeb-Edu and long-context documents' but the mixing ratio, any filtering criteria, and document selection process are not described."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 10 (Conclusion) includes substantive discussion: 'Is Catastrophic Forgetting Solved?' explicitly states 'the undesirable phenomenon of catastrophic forgetting is not solved in general' and discusses compression-forgetting tradeoff."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The limitations discussion is about conceptual scope ('NL as a roadmap rather than a destination') rather than specific threats to validity of the experiments. No mention of specific experimental threats like training instability, hyperparameter sensitivity, or benchmark selection bias."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The paper states specific scope boundaries: 'we focus on the first stage: memory consolidation as an online process' (Section 1.1), notes M3 'might suffer from computational overhead and so face challenges when scaling to larger networks' (Section 7.2), and excludes comparison with Cartridges 'due to fundamental differences in computational costs' (Section 9.1)."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw experimental data (model outputs, training logs, per-example predictions) is made available for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Benchmark datasets are referenced with citations: CLINC, Banking, DBpedia with descriptions (Section 9.1), RULER and BABILong with protocol references, FineWeb-Edu and ImageNet-21K with citations."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data sources are standard public benchmarks."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The pipeline from raw data (FineWeb-Edu + long-context documents) to training is not documented. No filtering criteria, mixing ratios, or preprocessing steps are specified."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding or acknowledgments section is present in the paper. Authors have Google email addresses but no explicit funding disclosure."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are disclosed via email addresses: three authors at Google ({alibehrouz, razaviyayn, mirrokni}@google.com) and one at Columbia University."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Three of four authors are at Google, which has direct commercial interest in novel architecture designs. Google is not independent of the outcome. No explicit funding disclosure to evaluate further."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "Models are trained from scratch on FineWeb-Edu data, but the temporal coverage of this dataset is not stated. For continual learning experiments using Llama3-8B/3B, no training cutoff is mentioned."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether FineWeb-Edu training data could overlap with benchmark test sets (e.g., CLINC, Banking, DBpedia, common-sense reasoning datasets)."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "Many benchmarks used (PIQA, HellaSwag, ARC, BoolQ, etc.) were published years before the training data was collected. No contamination analysis is provided."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "Figure 12 compares training time/efficiency of M3 vs Muon vs AdaMuon at 140M and 1.3B scales. Section 7.1 analyzes computational overhead of CMS. Section 8.2 discusses parallelizable training."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Training budgets stated: 30B tokens for 760M models, 100B tokens for 1.3B models, 50B tokens for NIAH experiments, 15B tokens for continual pre-training. Figure 12 shows wall-clock training times."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No multi-seed experiments reported. All results appear to be from single training runs."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The number of experimental runs is never explicitly stated for any experiment."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The paper mentions 'tuned learning rate for each model' but does not disclose the search budget, method, or number of configurations tried."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No description of how the best configuration was selected. Learning rates were 'tuned' but the selection procedure is not described."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No statistical tests are performed, so no multiple comparison correction is applied despite comparisons across many models and benchmarks."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors propose Hope and compare against their own implementations of baselines (all trained from scratch with the same data). The bias of authors implementing and tuning their own system vs baselines is not acknowledged."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Figure 12 directly compares training time of different optimizers. Section 7.1 discusses the compute overhead of CMS. Section 9.1 excludes Cartridges comparison due to 'fundamental differences in their computational costs.'"
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "No discussion of whether the benchmarks actually measure the claimed capabilities (e.g., whether NIAH tests 'continual learning' vs just 'long context retrieval')."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding is used. Models are evaluated directly on benchmarks."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of temporal leakage. Many benchmarks (PIQA 2020, HellaSwag 2019, ARC 2018) predate the training data but this is not addressed."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether the evaluation setup leaks information. For fine-tuned models on BABILong, there is no analysis of whether the fine-tuning data overlaps with the test set."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of independence between training and test data across any experiments."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No leakage detection or prevention methods are used or described."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Hope outperforms all baselines on average in language modeling and common-sense reasoning at both 760M and 1.3B parameter scales.",
    363       "evidence": "Table 2 shows Hope achieves 52.28 avg accuracy at 760M/30B and 58.04 at 1.3B/100B, compared to the next-best model, Titans, at 51.68 and 56.82 respectively. Hope's perplexity is also the lowest.",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "Hope with CMS design maintains performance at 10M context length on BABILong while other models degrade.",
    368       "evidence": "Figure 9 shows Hope maintaining performance at 10M context while Titans and ARMT degrade rapidly beyond 1M. However, this result requires fine-tuning.",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "Multi-level memory (CMS) improves continual learning over single-level in-context learning.",
    373       "evidence": "Figure 6 shows Hope outperforms ICL, EWC, and InCA on class-incremental learning across CLINC, Banking, and DBpedia. Figure 7 shows that increasing the number of memory levels improves performance on NIAH, LongHealth, and QASPER.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Known gradient-based optimizers (Adam, SGD with Momentum) can be reformulated as associative memory modules.",
    378       "evidence": "Sections 4.1-4.2 and Appendix B provide mathematical derivations showing Adam, momentum SGD, AdaGrad, and related optimizers as associative memories compressing gradients. The derivations are rigorous.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "M3 optimizer finds more effective solutions than AdamW and Muon on ImageNet ViT training.",
    383       "evidence": "Figure 11 shows M3 achieves lower training and test loss on ViT 24M and 86M models on ImageNet-21K. However, these are single-run results with no error bars.",
    384       "supported": "weak"
    385     },
    386     {
    387       "claim": "Hope achieves perfect scores on all formal language recognition tasks while Transformers fail.",
    388       "evidence": "Table 5 shows Hope gets 100% on all tasks (Parity, Non-Star-Free Regular, Counter, Parallel, Shuffle-2) while Transformer fails on Parity and Non-Star-Free Regular. However, LSTM and SRWM also achieve perfect scores.",
    389       "supported": "moderate"
    390     }
    391   ],
    392   "methodology_tags": ["theoretical", "benchmark-eval"],
    393   "key_findings": "The paper presents Nested Learning, a paradigm that reframes deep learning models as interconnected systems of nested optimization problems with different update frequencies. It shows that popular optimizers (Adam, SGD with momentum) and architectures (Transformers, linear attention) can be unified as associative memories compressing their own context flow. The proposed Hope architecture, combining self-modifying Titans with Continuum Memory Systems, demonstrates improvements in continual learning, long-context reasoning (maintaining performance at 10M context on BABILong), and language modeling over existing architectures including Transformers and modern RNNs.",
    394   "red_flags": [
    395     {
    396       "flag": "No error bars or multi-seed experiments",
    397       "detail": "All experimental results across Tables 1-6 and Figures 6-12 appear to be single-run point estimates. For architecture comparisons where hyperparameters were tuned per model, seed sensitivity could significantly affect conclusions."
    398     },
    399     {
    400       "flag": "Authors evaluate their own system with self-implemented baselines",
    401       "detail": "All models are trained from scratch by the authors. The bias of authors implementing and tuning their own system vs baselines is well-documented (Lucic et al. 2018) but not acknowledged."
    402     },
    403     {
    404       "flag": "Broad theoretical claims beyond experimental scope",
    405       "detail": "The title claims 'The Illusion of Deep Learning Architectures' and the paper makes sweeping claims about all modern architectures being uniform. Experiments are limited to models up to 1.3B parameters and a specific set of benchmarks."
    406     },
    407     {
    408       "flag": "Company affiliation evaluating novel architectures",
    409       "detail": "Three of four authors are Google employees. Google has commercial interest in novel architectures. No competing interests statement is provided."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Attention is All you Need",
    415       "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N Gomez", "Łukasz Kaiser", "Illia Polosukhin"],
    416       "year": 2017,
    417       "relevance": "Foundational Transformer architecture paper; NL reinterprets attention as a non-parametric associative memory."
    418     },
    419     {
    420       "title": "Language models are few-shot learners",
    421       "authors": ["Tom Brown"],
    422       "year": 2020,
    423       "relevance": "Defines in-context learning in LLMs; NL reframes ICL as a consequence of multi-level nested optimization."
    424     },
    425     {
    426       "title": "Titans: Learning to Memorize at Test Time",
    427       "authors": ["Ali Behrouz", "Peilin Zhong", "Vahab Mirrokni"],
    428       "year": 2025,
    429       "relevance": "Direct predecessor to Hope architecture; introduces deep memory modules that Hope extends with self-modification and CMS."
    430     },
    431     {
    432       "title": "Scaling laws for neural language models",
    433       "authors": ["Jared Kaplan"],
    434       "year": 2020,
    435       "relevance": "Establishes neural scaling laws that NL reinterprets through the lens of nested optimization and parameter capacity across levels."
    436     },
    437     {
    438       "title": "Adam: A method for stochastic optimization",
    439       "authors": ["Diederik P Kingma", "Jimmy Ba"],
    440       "year": 2014,
    441       "relevance": "NL shows Adam is an optimal associative memory for L2 regression on gradients, reframing the most widely used optimizer."
    442     },
    443     {
    444       "title": "Learning to (learn at test time): RNNs with expressive hidden states",
    445       "authors": ["Yu Sun"],
    446       "year": 2024,
    447       "relevance": "TTT framework for test-time training; NL reframes this as parametric in-context learning within the nested paradigm."
    448     },
    449     {
    450       "title": "It's All Connected: A Journey Through Test-Time Memorization, Attentional Bias, Retention, and Online Optimization",
    451       "authors": ["Ali Behrouz", "Meisam Razaviyayn", "Peilin Zhong", "Vahab Mirrokni"],
    452       "year": 2025,
    453       "arxiv_id": "2504.13173",
    454       "relevance": "Miras framework unifying sequence models as associative memories; direct theoretical foundation for NL."
    455     },
    456     {
    457       "title": "RWKV-7 'Goose' with Expressive Dynamic State Evolution",
    458       "authors": ["Bo Peng"],
    459       "year": 2025,
    460       "arxiv_id": "2503.14456",
    461       "relevance": "Modern RNN architecture used as baseline; NL positions it within the nested associative memory framework."
    462     },
    463     {
    464       "title": "Muon: An optimizer for hidden layers in neural networks",
    465       "authors": ["K Jordan", "Y Jin", "V Boza"],
    466       "year": 2024,
    467       "relevance": "Optimizer that NL reinterprets as gradient orthogonalization via nested optimization; basis for M3 optimizer."
    468     },
    469     {
    470       "title": "Model-agnostic meta-learning for fast adaptation of deep networks",
    471       "authors": ["Chelsea Finn", "Pieter Abbeel", "Sergey Levine"],
    472       "year": 2017,
    473       "relevance": "MAML as knowledge transfer via initialization; NL subsumes meta-learning as a special case of nested optimization."
    474     }
    475   ]
    476 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs