scan.json (23771B)
1 { 2 "paper": { 3 "title": "Asymptotic Study of In-Context Learning with Random Transformers Through Equivalent Models", 4 "authors": ["Samet Demir", "Zafer Doğan"], 5 "year": 2025, 6 "venue": "IEEE International Workshop on Machine Learning for Signal Processing (MLSP)", 7 "arxiv_id": "2509.15152", 8 "doi": "10.1109/MLSP62443.2025.11204336" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. The simulations are described but no code is released." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": false, 20 "justification": "The experiments use synthetically generated data following the distributions in Equations (1)-(2), but no data or generation scripts are released. The data generation process is described mathematically but no downloadable artifacts are provided." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "No environment specifications, library versions, or dependency information is provided. The paper does not mention what software or framework was used for the simulations." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions are provided. While the mathematical setup is described in detail (Section 3), there are no scripts, README, or concrete instructions for replicating the experiments." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "Figures 1 and 2 plot 'the mean of 20 Monte Carlo runs' but no error bars, confidence intervals, or uncertainty bands are shown on the plots. Only point estimates (means) are displayed." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper makes comparative claims (e.g., 'the Transformer augmented with a nonlinear MLP head consistently achieves lower ICL error than the linear Transformer baseline') but no statistical significance tests are used. Comparisons are made purely by visual inspection of curves." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": false, 47 "justification": "No effect sizes are reported. The paper describes performance differences qualitatively ('significantly outperforms', 'limited or no improvement') without quantifying the magnitude of differences." 48 }, 49 "sample_size_justified": { 50 "applies": false, 51 "answer": false, 52 "justification": "This is primarily a theoretical paper with simulations on synthetic data. The choice of 20 Monte Carlo runs is stated but sample size justification or power analysis is not applicable in this theoretical context." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "Although the paper states results are averaged over 20 Monte Carlo runs, no variance, standard deviation, or any spread measure is reported. Only means are plotted in Figures 1 and 2." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper compares the Transformer with nonlinear MLP against the linear attention Transformer (without MLPs) as a baseline, and also compares with the equivalent polynomial model from Theorem 3. These comparisons are shown in Figures 1 and 2." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The linear attention Transformer baseline comes from [27, 28] (Lu et al., 2024-2025), which is recent. The related work section discusses contemporary approaches [7, 8, 9] from ICML 2024 and NeurIPS 2024." 
70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "Figure 2 systematically varies individual parameters: (a) context length l, (b) hidden dimension m, and (c) regularization constant lambda, studying their individual effects on ICL error. This functions as an ablation over key design choices." 75 }, 76 "multiple_metrics": { 77 "applies": false, 78 "answer": false, 79 "justification": "The paper studies a single theoretical quantity — the ICL error defined in Equation (8) — which is the natural and only relevant metric for the theoretical analysis. Using additional metrics would not be meaningful for this type of asymptotic analysis." 80 }, 81 "human_evaluation": { 82 "applies": false, 83 "answer": false, 84 "justification": "This is a theoretical/simulation paper studying asymptotic properties of Transformers. Human evaluation is not relevant to these claims." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "The paper analyzes theoretical ICL error computed in expectation over distributions (Equation 8). There is no dataset with train/test splits in the conventional sense; the evaluation is on the population-level error." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "The paper provides breakdowns across different activation functions (ReLU vs tanh in Figures 1a-b), different context lengths, hidden dimensions, and regularization strengths (Figure 2a-c), rather than reporting only a single aggregate result." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "The paper explicitly discusses when the nonlinear MLP does NOT help: 'the MLP with tanh activation offers limited or no improvement over the linear model' (Section 4, Figure 1). The double-descent phenomenon near the interpolation threshold is also identified as a failure mode." 
100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports that tanh activation provides 'limited or no improvement' when the target function is also tanh (Figure 1b), and documents the double-descent phenomenon where performance degrades near the interpolation threshold (Figure 2b). These are genuine negative results about when the approach fails." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims: (1) equivalence to a polynomial model — supported by Theorem 3 and Figures 1-2; (2) validation through simulations — shown in Figures 1-2; (3) double-descent phenomenon — shown in Figure 2b; (4) insights on when MLPs enhance ICL — documented in Section 4 analysis. All abstract claims are supported." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper makes causal claims about the effect of MLP activation functions, context length, and hidden dimension on ICL error. These are justified through formal mathematical analysis (Theorem 3, Lemma 1, Corollary 2) and controlled simulations where single variables are varied in isolation." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper clearly bounds its setting: random Transformers with linear attention and a specific nonlinear MLP structure (first layer random and fixed, second layer trained), in a particular asymptotic regime (d, n, m, l, k jointly diverge with specific ratios). The model uses Gaussian data (Eq. 1-2). These assumptions are explicitly stated in Section 3." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper does not discuss alternative explanations for its empirical findings. 
For example, the visual alignment between the Transformer and polynomial model in Figures 1-2 could be affected by finite-size effects not captured by the asymptotic theory, but this is not discussed." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": false, 132 "answer": false, 133 "justification": "The paper does not use any pre-trained LLM or API. It defines and trains its own random Transformer models from scratch using the mathematical formulation in Section 3. Model version specification is not applicable." 134 }, 135 "prompts_provided": { 136 "applies": false, 137 "answer": false, 138 "justification": "The paper does not use any LLM prompting. The 'in-context learning' studied here is a mathematical formulation where input-output examples are provided to a Transformer architecture, not natural language prompting." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Key hyperparameters are reported for each experiment: d=80, l=d, k=0.5d, m=d^2, rho=0.01, lambda=10^-8 (Figure 1 caption), and n=1.5d^2, k=0.5d, rho=0.01 (Figure 2 caption). The number of Monte Carlo runs (20) is also stated." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "No agentic scaffolding is used. This is a theoretical/simulation study of Transformer architectures." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "The data generation process is fully specified mathematically in Equations (1)-(3): inputs drawn from N(0, I_d/d), labels generated via a nonlinear function with Gaussian noise, task vectors drawn from N(0, I_d), and the embedding matrix Z construction. This is a complete specification." 
154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions future directions (multi-head attention, deep MLP stacking, adaptive regularization) but does not substantively discuss limitations of the current work." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No specific threats to validity are discussed. For example, the paper does not address: the gap between the asymptotic regime and practical finite-dimensional settings, the restriction to linear attention, or the assumption of Gaussian data." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": true, 170 "justification": "The paper clearly states its scope in Section 3: it considers random Transformers with linear attention, a specific MLP structure (first layer random/fixed, second layer trained), an asymptotic regime with jointly diverging dimensions, and Gaussian data. The conclusion explicitly mentions what is NOT covered: multi-head attention, deep MLP stacking, and adaptive regularization." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": false, 177 "justification": "No raw simulation data is released. Only aggregated means are shown in the figures. The underlying Monte Carlo run data is not available for verification." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "The data generation procedure is fully specified mathematically in Section 3 (Equations 1-3), including distributions for inputs, labels, noise, and task vectors. This is a complete procedural description that would allow independent data generation." 
183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants. Data is synthetically generated from specified distributions. This criterion does not apply." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The complete pipeline from data generation (Equations 1-2) to embedding construction (Equation 3) to model training (optimization in Section 3) to error measurement (Equation 8) is mathematically specified. Each step is documented." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "Funding is disclosed in the footnote on page 1: 'this work is supported partially by TÜBİTAK under project 124E063 in ARDEB 1001 program. S.D. is supported by an AI Fellowship provided by KUIS AI Research Center and a PhD Scholarship (BİDEB 2211) from TÜBİTAK.'" 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly stated: 'MLIP Research Group, KUIS AI Center & Department of EEE, Koç University, İstanbul, Turkey.'" 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "Funding comes from TÜBİTAK (Turkish government science agency) and KUIS AI Research Center (university research center). These are academic/governmental funders with no commercial stake in the specific theoretical results about Transformer ICL." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": false, 220 "answer": false, 221 "justification": "The paper does not evaluate any pre-trained model on a benchmark. 
It trains its own random Transformer models from scratch on synthetic data. Contamination is not applicable." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": false, 225 "answer": false, 226 "justification": "No pre-trained model is evaluated on any benchmark. The paper uses freshly generated synthetic data and custom-trained models. Train/test overlap is not applicable." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": false, 230 "answer": false, 231 "justification": "No pre-trained model is evaluated on any public benchmark. All experiments use synthetic data generated from known distributions. Benchmark contamination is not applicable." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants. This is a theoretical/simulation paper." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants. This is a theoretical/simulation paper." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants. This is a theoretical/simulation paper." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants. This is a theoretical/simulation paper." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants. This is a theoretical/simulation paper." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants. This is a theoretical/simulation paper." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants. This is a theoretical/simulation paper." 
269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "This is a theoretical paper studying asymptotic properties of Transformers. Cost/latency reporting is not applicable." 276 }, 277 "compute_budget_stated": { 278 "applies": false, 279 "answer": false, 280 "justification": "This is a theoretical paper with small-scale simulations (d=80). Computational budget reporting is not applicable for this type of work." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "A random Transformer with a nonlinear MLP head is asymptotically equivalent to a finite-degree Hermite polynomial model in terms of ICL error.", 287 "evidence": "Theorem 3 (Section 4) proves this equivalence under the asymptotic regime where d, n, m, l, k jointly diverge. Figures 1 and 2 validate that the polynomial model's ICL error matches the Transformer's across multiple settings.", 288 "supported": "strong" 289 }, 290 { 291 "claim": "The MLP head significantly reduces ICL error when the activation function matches the nonlinearity of the target function (e.g., ReLU activation with ReLU target).", 292 "evidence": "Figure 1a shows the Transformer with ReLU MLP consistently achieves lower ICL error than the linear Transformer when the target function is ReLU. 
Figure 1b shows tanh activation provides limited improvement when the target is tanh.", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "The benefits of the nonlinear MLP emerge only when the context length exceeds a dimension-dependent threshold.", 297 "evidence": "Figure 2a shows that the Transformer with MLP begins to significantly outperform the linear Transformer only when l exceeds approximately d.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "The ICL error exhibits a double-descent phenomenon as a function of model complexity (hidden dimension m), which can be mitigated through regularization.", 302 "evidence": "Figure 2b shows the non-monotonic ICL error curve with a peak near the interpolation threshold (m/n approximately 1). Figure 2c demonstrates that increasing the regularization strength lambda smooths and attenuates this peak.", 303 "supported": "moderate" 304 } 305 ], 306 "methodology_tags": ["theoretical"], 307 "key_findings": "This paper proves that a random Transformer with a nonlinear MLP head (random first layer, trained second layer) is asymptotically equivalent to a finite-degree Hermite polynomial model for in-context learning error. The analysis reveals three key conditions for MLPs to enhance ICL: the activation function must match the target nonlinearity, the context length must exceed a dimension-dependent threshold, and the hidden dimension must be appropriately sized or regularized. The paper also demonstrates that the ICL error exhibits double-descent behavior with respect to model complexity, extending known double-descent phenomena to the in-context learning setting.", 308 "red_flags": [ 309 { 310 "flag": "No error bars on simulation results", 311 "detail": "Figures 1 and 2 report means of 20 Monte Carlo runs but show no error bars, confidence intervals, or any measure of variability. The reader cannot assess how stable the reported means are or whether visual differences between curves are statistically meaningful." 
312 }, 313 { 314 "flag": "Highly simplified architecture", 315 "detail": "The analysis assumes linear attention (not softmax attention), a single attention layer, random first-layer MLP weights, and Gaussian data. These simplifications may limit the applicability of the findings to practical Transformer architectures, and this gap is not discussed." 316 }, 317 { 318 "flag": "Self-citation concentration", 319 "detail": "Three of the 29 references ([11], [12], [14]) are self-citations to the same author pair (Demir and Doğan), and the asymptotic analysis tools (Gaussian universality results) rely heavily on these works. While not necessarily problematic, the paper's core methodology builds primarily on the authors' own prior work." 320 } 321 ], 322 "cited_papers": [ 323 { 324 "title": "Attention is all you need", 325 "authors": ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit", "Llion Jones", "Aidan N Gomez", "Łukasz Kaiser", "Illia Polosukhin"], 326 "year": 2017, 327 "relevance": "Foundational paper on Transformer architecture, directly relevant to understanding architectural components studied in ICL research." 328 }, 329 { 330 "title": "Language models are few-shot learners", 331 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 332 "year": 2020, 333 "relevance": "Introduced in-context learning as a capability of large language models, motivating the theoretical study of ICL in this paper." 334 }, 335 { 336 "title": "What learning algorithm is in-context learning? Investigations with linear models", 337 "authors": ["Ekin Akyürek", "Dale Schuurmans", "Jacob Andreas", "Tengyu Ma", "Denny Zhou"], 338 "year": 2023, 339 "relevance": "Key theoretical work characterizing ICL as implicit algorithm execution, part of the research program studying ICL mechanisms." 340 }, 341 { 342 "title": "Trained transformers learn linear models in-context", 343 "authors": ["Ruiqi Zhang", "Spencer Frei", "Peter L. 
Bartlett"], 344 "year": 2024, 345 "relevance": "Provides generalization analysis for linear attention Transformers in ICL, directly extended by this paper to the nonlinear MLP setting." 346 }, 347 { 348 "title": "How do nonlinear transformers learn and generalize in in-context learning?", 349 "authors": ["Hongkang Li", "Meng Wang", "Songtao Lu", "Xiaodong Cui", "Pin-Yu Chen"], 350 "year": 2024, 351 "relevance": "Investigates ICL in classification settings with ReLU-based MLPs; one of the key prior works this paper extends to broader activation functions and regression tasks." 352 }, 353 { 354 "title": "Emergent abilities of large language models", 355 "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"], 356 "year": 2022, 357 "relevance": "Documents emergent ICL abilities as model size increases, relevant to understanding when and why in-context learning capabilities appear." 358 }, 359 { 360 "title": "Are emergent abilities of large language models a mirage?", 361 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 362 "year": 2023, 363 "relevance": "Challenges the emergence narrative for LLM abilities, relevant to understanding whether ICL is truly emergent or an artifact of evaluation." 364 }, 365 { 366 "title": "What can transformers learn in-context? A case study of simple function classes", 367 "authors": ["Shivam Garg", "Dimitris Tsipras", "Percy S Liang", "Gregory Valiant"], 368 "year": 2022, 369 "relevance": "Studies ICL capabilities on controlled synthetic benchmarks, directly related to the synthetic regression setting used in this paper." 370 }, 371 { 372 "title": "Asymptotic theory of in-context learning by linear attention", 373 "authors": ["Yue M Lu", "Mary Letey", "Jacob A Zavatone-Veth", "Anindita Maiti", "Cengiz Pehlevan"], 374 "year": 2025, 375 "relevance": "Provides the asymptotic theory for linear attention ICL that this paper extends to nonlinear MLPs; the most directly related prior work." 
376 }, 377 { 378 "title": "Optimal regularization can mitigate double descent", 379 "authors": ["Preetum Nakkiran", "Prayaag Venkat", "Sham M Kakade", "Tengyu Ma"], 380 "year": 2021, 381 "relevance": "Establishes the role of regularization in mitigating double descent, which this paper extends to the ICL setting with Transformer architectures." 382 } 383 ] 384 }