scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20141B)
      1 {
      2   "paper": {
      3     "title": "How Data Mixing Shapes In-Context Learning: Asymptotic Equivalence for Transformers with MLPs",
      4     "authors": ["Samet Demir", "Zafer Doğan"],
      5     "year": 2025,
      6     "venue": "NeurIPS 2025",
      7     "arxiv_id": "2510.25753",
      8     "doi": "10.48550/arXiv.2510.25753"
      9   },
     10   "scan_version": 2,
     11   "active_modules": [],
     12   "methodology_tags": ["theoretical"],
     13   "key_findings": "Transformers with nonlinear MLP heads are asymptotically equivalent to finite-degree polynomial predictors in terms of ICL error under high-dimensional asymptotics. Nonlinear MLPs meaningfully enhance ICL performance over linear baselines on nonlinear tasks. Data sources with structured covariances and low noise are critical for effective ICL, and feature learning emerges only when the task covariance exhibits sufficient structure. Results validated on synthetic data and a multilingual sentiment analysis task.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Source code link provided: https://github.com/KU-MLIP/Data-Mixing-Shapes-ICL-by-Transformers, stated on page 1."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The real-world experiments use the publicly available Multilingual Amazon Reviews Corpus [22]. Synthetic data is generated by the code. No proprietary data was collected."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment specifications, requirements.txt, or dependency details are mentioned in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README or reproduction guide is described."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Figures show 'average over 20 Monte Carlo runs' (Figures 1-2) and 'mean of 100 Monte Carlo trials' (Figure 3c), but no error bars, confidence intervals, or uncertainty bounds are shown on the plots."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No statistical significance tests are used. Claims of performance differences between models are based on visual comparison of plotted curves without any formal testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Performance differences between linear and nonlinear Transformers are shown only in figures without quantifying effect sizes numerically."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The number of Monte Carlo runs (20 or 100) is stated but not justified. No discussion of why these numbers are sufficient."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Only averages over Monte Carlo runs are reported. No standard deviations, error bars, or spread measures accompany the results."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The linear Transformer (without MLP) serves as a baseline throughout all experiments (Figures 1-3). The equivalent polynomial model is also compared."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The linear Transformer baseline is the standard comparison in the theoretical ICL literature (Zhang et al. 2024, Lu et al. 2024). The paper positions itself against recent work [23, 24, 32]."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper systematically varies individual components: step size η (Figure 3), input covariance (Figure 2a), task covariance (Figure 2b), noise variance (Figure 2c), sample size, context length, and hidden dimension (Figure 1)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only one metric is used throughout: ICL error (mean squared prediction error) as defined in equation (8). No alternative metrics are reported."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is a theoretical paper with synthetic simulations and one automated real-world experiment. Human evaluation is not relevant."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The training procedure explicitly uses separate data for the two stages: a fresh sample set for the second layer (Section 3.3.2: 'A fresh sample set is used to train w'). ICL error is evaluated on new contexts."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Per-source ICL errors are provided in Appendix G (Figures 4-6), breaking down performance by data source rather than only showing overall averages."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figure 3(a) shows a case where increasing step size does NOT improve performance (isotropic task vectors), explicitly discussed as a negative result showing when feature learning fails."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section 4.3.4: 'In (a), increasing η does not improve ICL error' — this is an explicit negative result showing feature learning fails with isotropic task vectors."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about asymptotic equivalence (Theorem 4.12), nonlinear MLP benefits (Figure 1), data mixing effects (Figure 2), and feature learning dependence on task structure (Figure 3) are all supported by formal proofs and experiments."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims like 'nonlinear MLPs meaningfully enhance ICL performance' are justified by controlled single-variable manipulations in the experimental design (varying one factor at a time) and formal theoretical proofs."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper is explicit about its scope: single-block linear attention, two-layer MLP with specific training procedure, high-dimensional asymptotics. Section 4.1 lists specific assumptions. The conclusion does not overclaim beyond the tested settings."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for the observed results. For example, the role of the specific training procedure (single gradient step) vs. other optimization schemes is not explored as a potential confound."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper precisely defines its measurement (ICL error in equation 8) and claims match the measurement granularity. No broader framing beyond what is measured."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "The paper does not use any pre-trained LLMs. The Transformer model is defined mathematically and trained from scratch in the experimental setup."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No prompting is used. The model operates on structured numerical data, not text prompts."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Key hyperparameters are reported in figure captions: d=80, η≍d², λ=5×10⁻⁵, polynomial degree p=4 or 5, and various dimension ratios (ℓ/d, n/d², k/n)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "For the real-world experiment (Figure 3c): text embedded using 'multilingual-e5-small', reduced to 64 dimensions via PCA, normalized. Labels demeaned and scaled to [-1,1]. Synthetic data generation is fully specified by the mathematical model."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section is present. Assumptions 4.3-4.5 are noted as 'limitations of our theoretical results' in passing (Section 4.1), but there is no substantive limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No specific threats to validity are discussed. The paper notes assumptions 'might look unnatural' and 'can be partially relaxed in practice' but does not discuss specific validity threats."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper clearly states its scope: single-block linear attention (Section 3.3), two-layer MLP with specific training (Section 3.3.2), proportional limit regime (Assumption 4.1). The Conclusion also bounds claims to 'high-dimensional asymptotics.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (simulation outputs) is released. The real-world dataset is public but simulation results are only shown in figures."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The data generation process is fully specified mathematically in Section 3.2 (equations 1-2). The real-world data source (Multilingual Amazon Reviews Corpus) is described in Figure 3 caption."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are synthetic generation and a standard public benchmark."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The full pipeline from data generation to model training and evaluation is specified mathematically: data sampling (Section 3.2), embedding construction (equation 2), training procedure (Section 3.3.2), and evaluation (Section 3.4)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section: 'supported partially by TÜBİTAK under project 124E063 in ARDEB 1001 program.' S.D. supported by KUIS AI Fellowship and TÜBİTAK BİDEB 2211 scholarship."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Authors are from MLIP Research Group, KUIS AI Center, and Department of EEE at Koç University. No product being evaluated, so no conflict."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funding is from TÜBİTAK (Turkish government research agency) and KUIS AI Center (academic). Neither has a stake in the specific theoretical results."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper does not evaluate a pre-trained model on any benchmark. All models are trained from scratch within the experimental setup."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "No pre-trained model is evaluated on a benchmark. Train/test separation is handled by fresh sampling in the mathematical framework."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No pre-trained model benchmark evaluation. Models are trained from scratch on generated data."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Primarily theoretical paper with supporting simulations. Inference cost reporting is not relevant."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Primarily theoretical paper. The simulations are small-scale Monte Carlo runs and do not involve large-scale compute."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "A Transformer with a nonlinear MLP head is asymptotically equivalent to a finite-degree polynomial model in terms of ICL error.",
    296       "evidence": "Theorem 4.12 provides formal proof. Figures 1a-c show close alignment between the Transformer with nonlinear MLP and its polynomial surrogate across varying sample sizes, context lengths, and hidden dimensions (d=80, 20 Monte Carlo runs).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Nonlinear MLPs meaningfully enhance ICL performance compared to linear Transformer baselines on nonlinear tasks.",
    301       "evidence": "Figure 1 shows that the nonlinear MLP Transformer achieves consistently lower ICL error than the linear Transformer across all three sweeps (sample size, context length, hidden dimension).",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "High-quality data sources for ICL are characterized by low noise and structured covariances for input and task vectors.",
    306       "evidence": "Figure 2 systematically varies input covariance (a), task covariance (b), and noise variance (c), showing that structured covariances and low noise improve ICL error. Per-source errors in Appendix G confirm the trends.",
    307       "supported": "moderate"
    308     },
    309     {
    310       "claim": "Feature learning emerges only when the task covariance exhibits sufficient structure, not from structured input covariance alone.",
    311       "evidence": "Figure 3: increasing step size η improves ICL only with structured task covariance (b), not with structured input covariance (a). This asymmetry is consistent with the theoretical framework.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "The theoretical findings extend to real-world scenarios (multilingual sentiment analysis).",
    316       "evidence": "Figure 3(c) shows that the equivalent polynomial model aligns with the Transformer on the Multilingual Amazon Reviews Corpus, and feature learning effects are consistent with synthetic experiments.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "red_flags": [
    321     {
    322       "flag": "No error bars on any experimental results",
    323       "detail": "Despite running 20-100 Monte Carlo trials per experiment, no variance, standard deviation, or confidence intervals are reported. The reader cannot assess the stability or significance of the observed differences."
    324     },
    325     {
    326       "flag": "Simplified training procedure",
    327       "detail": "The two-stage training (single gradient step on first layer, ridge regression on second) differs substantially from standard end-to-end Transformer training. While theoretically motivated, the gap between this procedure and real practice limits the applicability of the conclusions."
    328     },
    329     {
    330       "flag": "Real-world experiment is minimal",
    331       "detail": "The multilingual sentiment analysis experiment (Figure 3c) is presented as evidence that 'findings extend to real-world cases,' but it uses PCA-reduced embeddings (384→64 dims), a small context length (ℓ=64), and a highly simplified setup that may not generalize to actual Transformer ICL behavior."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Language models are few-shot learners",
    337       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    338       "year": 2020,
    339       "relevance": "Foundational paper on in-context learning capabilities of large language models."
    340     },
    341     {
    342       "title": "Attention is all you need",
    343       "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar"],
    344       "year": 2017,
    345       "relevance": "Original Transformer architecture paper, fundamental to all work on Transformer-based ICL."
    346     },
    347     {
    348       "title": "What learning algorithm is in-context learning? Investigations with linear models",
    349       "authors": ["Ekin Akyürek", "Dale Schuurmans", "Jacob Andreas", "Tengyu Ma", "Denny Zhou"],
    350       "year": 2023,
    351       "relevance": "Key theoretical work on ICL mechanisms showing Transformers learn to implement learning algorithms."
    352     },
    353     {
    354       "title": "Emergent abilities of large language models",
    355       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    356       "year": 2022,
    357       "relevance": "Empirical study of emergent ICL capabilities scaling with model size."
    358     },
    359     {
    360       "title": "Are emergent abilities of large language models a mirage?",
    361       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    362       "year": 2023,
    363       "relevance": "Challenges the narrative of emergent abilities, relevant to understanding ICL capabilities."
    364     },
    365     {
    366       "title": "Trained transformers learn linear models in-context",
    367       "authors": ["Ruiqi Zhang", "Spencer Frei", "Peter L. Bartlett"],
    368       "year": 2024,
    369       "relevance": "Foundational theoretical analysis of ICL in trained linear Transformers, direct precursor to this work."
    370     },
    371     {
    372       "title": "Transformers learn in-context by gradient descent",
    373       "authors": ["Johannes Von Oswald", "Eyvind Niklasson", "Ettore Randazzo"],
    374       "year": 2023,
    375       "relevance": "Theoretical work connecting ICL to gradient descent, key reference for understanding learned algorithms."
    376     },
    377     {
    378       "title": "Data mixing laws: Optimizing data mixtures by predicting language modeling performance",
    379       "authors": ["Jiasheng Ye", "Peiju Liu", "Tianxiang Sun"],
    380       "year": 2025,
    381       "relevance": "Directly relevant to data mixing effects on model training, which this paper extends to the ICL setting."
    382     }
    383   ]
    384 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs