scan.json (28948B)
1 { 2 "paper": { 3 "title": "Is In-Context Learning Learning?", 4 "authors": ["Adrian de Wynter"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2509.10414", 8 "doi": "10.48550/arXiv.2509.10414" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "theoretical"], 13 "key_findings": "Large-scale empirical study (1.89M predictions per LLM) finds that ICL mathematically constitutes learning but is empirically weak. In the limit (50-100 exemplars), accuracy gaps between LLMs and prompting strategies narrow, and lexical features become less relevant than data features. However, ICL is brittle to out-of-distribution inputs (especially CoT and APO), shows inconsistent performance across formally similar tasks (up to 31% accuracy gaps), and overfocuses on spurious features from the observed distribution rather than learning feature relations within data.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub repository explicitly provided: 'Code and data is in https://github.com/adewynter/is-icl-learning' (Section 1). Section 10 states it will be open-sourced under MIT licence." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Synthetic data is generated by code in the repository. Section 10: 'All code is included in the repository.' The data generation procedures are fully specified in Section 4.5 and Appendix E." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Appendix F specifies: 'Standard ND40rs v2 instance in Azure, equipped with eight NVIDIA Tesla V100 GPUs with 32 Gb of memory each.' API details and model versions in Table 4. Baselines use scikit-learn (Pedregosa et al., 2011)." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": true, 34 "justification": "Section 10: 'All code is included in the repository.' Detailed methodology in Appendix F, full prompts in Appendix H and repository. Temperature set to zero for reproducibility." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper reports ±σ (standard deviation) throughout tables, but these represent variation across models or tasks, not confidence intervals. No formal CIs (e.g., 95% CI [x, y]) are reported." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes many comparative claims (e.g., 'CoT was most sensitive to OOD') based on comparing raw numbers and OLS slopes. No statistical significance tests (p-values, t-tests, etc.) are used to support these claims." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Effect sizes are reported throughout as absolute accuracy differences with context: e.g., '31% accuracy difference' between related tasks (Table 1), OLS slopes for trend magnitudes (Table 2), and specific accuracy ranges like '80±3%' vs '16±1%'." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "Test sets are 2000 entries (1000 evaluated due to cost), training sets 2000 entries. No power analysis or formal justification for these sizes. Cost is mentioned as a constraint but no formal sample size justification." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Temperature is set to zero, making each run deterministic. The σ values reported (e.g., Tables 1-2) represent variation across models, tasks, or prompts—not across repeated experimental runs. No multiple-seed or multiple-run variance is reported." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Traditional ML baselines included: decision trees (DT), k-nearest neighbours (kNN), and multilayer perceptron (MLP). Section 4.4: 'We tested decision trees, k-nearest neighbours, and a multilayer perceptron in succession, and reported the best.'" 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The baselines (DT, kNN, MLP) are appropriate for the research question of whether ICL constitutes learning compared to traditional learning algorithms. The paper tests 4 contemporary LLMs (GPT-4 Turbo, GPT-4o, Mixtral, Phi-3.5) as the main comparison." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Extensive ablation studies in Section 6: impact of lexical features (word salad, SoT), positionality of exemplars (shuffled vs unshuffled), alternate distributions (imbalanced, random labels), and compliance vs learning separation." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "The primary metric is accuracy (1 - error). While OLS slopes are used to measure trends, this is an analysis technique on the same metric rather than an independent metric. No additional metrics like F1, precision/recall, or calibration are reported." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is irrelevant—tasks are formal language recognition with objectively verifiable correct answers (membership in formal languages)." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Clear separation: training set (2000 entries from P) used only for APO and baselines; five separate test sets (2000 entries each, of which 1000 are evaluated) from distributions Q with varying δ. Section 4.5." 
94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Extensive breakdowns: per-task (9 tasks), per-model (4 LLMs), per-prompt (7 strategies), per-δ (5 distributions). Tables 1-2, Figure 3, and Appendix G provide comprehensive per-category results." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Failure cases discussed extensively: Vending Machine (Sum) had near-zero slope indicating complete inability to learn (Section 7); SoT had lowest average performance (23±4%); CoT brittleness to OOD; inconsistent performance on related tasks." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Many negative results: CoT and APO brittle to OOD (Section 5.2), SoT near-random performance (Section 6.1), ICL underperforms traditional baselines in half the tasks on average (Section 5.1), related tasks show 31% accuracy gaps (Table 1)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims are well-supported: 'accuracy is insensitive to exemplar distribution, model, prompt style' supported by Table 2 convergence; 'distributional sensitivity, especially in CoT' supported by δ slopes in Table 2; 'limited all-purpose generalisability' supported by task inconsistency in Table 1." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims (e.g., 'ICL leverages statistical features from the prompt, as opposed to feature relations within the data') are supported by controlled ablation studies: word salad vs description, shuffled vs unshuffled exemplars, alternate distributions. The experimental design with controlled variables supports these claims." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Generalization explicitly bounded: 'Our findings are constrained to easily-verifiable tasks in a single call' (Section 1.2). Appendix A discusses limitations. Appendix D explicitly discusses impact on conclusions when extended to natural language. Section 4.2 excludes reasoning models and multi-call strategies." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 7 discusses two alternative explanations: (1) contamination (addressed with synthetic data and complex alphabets), and (2) tokenization effects (BPE impact on arithmetic). Appendix D discusses natural language priors as a factor." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures accuracy on formal language recognition tasks and claims about ICL as a learning paradigm. The theoretical framework (PAC learning, Section 3) explicitly connects the measurement (empirical error) to the claim (learning ability). The gap between synthetic tasks and real-world ICL use is explicitly acknowledged in Section 1.2 and Appendix D." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Table 4 specifies versions: GPT-4 Turbo (GPT-4-0125), GPT-4o (GPT-4-0125), Phi-3.5-MoE-Instruct, Mixtral-8x7B instruct v01. Context windows and parameter counts given for open models." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt texts provided in Appendix H (Prompts 1-5) with actual examples for multiple tasks and strategies. Section 10: prompts also in the repository." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix F: temperature=0, max return tokens=3 (except CoT: 1024, APO: 512). 
APO: batch size 1024, beam width 4, search depth 6. Baselines: default scikit-learn params, random seed 13213." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. All evaluations are single-call LLM predictions." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Synthetic data generation fully documented: automata with transition probabilities (Section 4.5, Figure 1), per-task specifications (Appendix E, Table 3), deduplication, 5% mislabelling rate, alphabet specifications. Baseline data preprocessing described in Appendix F.2." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Appendix A is titled 'Limitations' with substantive discussion of model updates, cost, limited testing paradigms, and baseline interpretation nuances." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Specific threats discussed: LLMs continuously updated making reproducibility difficult (Appendix A); synthetic data may not translate to natural language (Appendix D); input representation length affects baseline performance (Appendix A); BPE tokenization impact on arithmetic (Section 7)." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Explicit scope boundaries: 'constrained to non-natural language tasks' (Section 1.2); 'single call' only (Section 4.2); reasoning models excluded; multi-step strategies excluded; 'results are limited to the ability of ICL to draw conclusions from the data's features alone' (Appendix D)." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Code to generate all data and run all experiments is in the public repository (Section 1, Section 10). 
The synthetic data can be regenerated from the released code, and LLM calls are made at temperature=0 for reproducibility." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data generation described in detail: Section 4.5 describes automata with transition probabilities, Figure 1 shows PARITY automaton, Appendix E provides full task descriptions with alphabets, ID/OOD characterizations, and string lengths (Table 3)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All data is synthetically generated." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Full pipeline documented: automata generate strings → 2000 training entries from P → 5 test sets of 2000 from Q at different δ → 1000 evaluated per test set → 5% mislabelling → up to 5 retries per call → compliance/learning separation. Section 4.5 and Appendix F." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is disclosed anywhere in the paper. No acknowledgments section mentioning grants or sponsors." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliation clearly stated: 'Microsoft and the University of York.' This is relevant as Microsoft's Phi-3.5 is one of the four evaluated models." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding disclosed, so independence cannot be assessed. The author works at Microsoft, which produces one of the evaluated models (Phi-3.5), creating a potential conflict." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement present in the paper." 
224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates stated for any of the four models. While the synthetic data mitigates contamination risk, the cutoff dates are not provided." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": true, 235 "justification": "Section 7 discusses contamination as an alternative explanation. The entire experimental design uses synthetic data with custom alphabets (e.g., '¯\\(ツ)/¯') specifically to prevent overlap with training data. Section 4.5: 'synthetic to account for the task's G and Σ.'" 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Contamination is a central design consideration. Section 7: 'Contamination could explain the accuracy in Pattern Matching, perhaps due to the (easy) Σ, {a, b, c}. Other tasks, like Reversal, used more complex Σ and had lower scores.' Custom alphabets and synthetic data generation are the primary contamination mitigation strategy." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. All experiments use synthetic data evaluated by LLMs." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. Ethics section (Section 9) discusses carbon footprint and potential misuse, not human subjects." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 
268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No per-example or total inference cost reported. Section 9 mentions 'very high carbon footprint' qualitatively. Appendix A mentions cost constraints affected decisions (e.g., evaluating 1000 of 2000 test entries) but no dollar amounts or token counts." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Hardware specified (Appendix F: 'Standard ND40rs v2 instance in Azure, eight NVIDIA Tesla V100 GPUs') but no total GPU hours, wall-clock time, or API spend reported. Appendix A notes 'running synchronously a single task per LLM could and has taken months.'" 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Temperature set to zero for deterministic output. No seed sensitivity analysis across different random seeds. While determinism is a strength for reproducibility, it means results may be specific to this one configuration." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Appendix F: 'Every model was called up to five times to account for any potential parsing errors or rate limitations from APIs.' Temperature=0 means deterministic output, so one effective run per configuration." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "APO parameters (batch 1024, beam 4, depth 6) are stated but no search budget for selecting them. Baseline parameters left as defaults. No discussion of how many configurations were tried." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "All configurations are reported (all 7 prompting strategies × 7 shot counts × 5 δ values × 4 models), not just the best. Tables 1-2 and Appendix G show comprehensive results across all settings." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable. The paper uses OLS slopes and descriptive statistics rather than hypothesis testing." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The author is at Microsoft, and Phi-3.5 (a Microsoft model) is one of four evaluated models. This potential bias is not acknowledged. While the study is not proposing a new system, the author's employer's model could receive favorable treatment in analysis." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No analysis of performance as a function of compute. GPT-4 Turbo and GPT-4o (128k context, unknown parameters) are compared with Mixtral (12.9B active) and Phi-3.5 (6.6B active) without discussing compute differences." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": true, 331 "justification": "Extensive discussion of what synthetic formal language tasks measure vs ICL claims. Section 3 provides formal framework linking tasks to learning theory. Appendix D explicitly discusses limitations when extending to natural language. Section 1.2: 'our findings seek to characterise ICL as a learning paradigm, and not as an evaluation of prompt-based problem-solving capabilities.'" 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. All evaluations are single-call LLM predictions." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": true, 343 "justification": "Addressed by experimental design: all data is synthetically generated at experiment time, so it cannot have appeared in training data. Section 7 discusses contamination. Custom alphabets further mitigate leakage risk." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No explicit discussion of whether evaluation setup leaks answer information through context. While the synthetic design mitigates many leakage types, the paper does not discuss whether prompt structure or formatting provides hints." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": true, 353 "justification": "Training and test sets explicitly drawn from different distributions (P vs Q with controlled δ). Section 4.5: test sets are 'balanced, deduplicated.' Independence is structural by design." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No formal leakage detection method (canary strings, membership inference, n-gram overlap). The approach relies on contamination avoidance through synthetic data generation rather than detection." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "In the limit (50-100 exemplars), the average accuracy gap narrows between LLMs and all prompting strategies steadily improve accuracy.", 365 "evidence": "Table 2 shows positive shot slopes for all prompts (3.3-8.3 average). Section 5.3: per-shot σ OLS fit of -2.6±0.5 indicates narrowing model gap. Figure 2 and Figure 4 visualize convergence.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "ICL is brittle to altered test distributions (OOD), especially in CoT and APO.", 370 "evidence": "Table 2: CoT has largest negative δ slope (-1.4), APO at -0.5. Section 5.2: all δ slopes were negative. GPT-4o most sensitive at -1.2 average. 
Figure 2 shows CoT brittleness visually.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Closely-related tasks do not necessarily have closely-related performances, with accuracy differences as large as 31%.", 375 "evidence": "Table 1: Pattern Matching (94±1%) vs Maze Solve (63±5%) = 31% gap, both FSA-class. Reversal vs Stack = 12% gap, both PDA-class.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Traditional baselines (DT, kNN, MLP) outperform ICL average performance in half of the tasks evaluated.", 380 "evidence": "Table 1 rightmost column: MLP beats ICL average in PARITY (95 vs 80), kNN in Pattern Matching (87 vs 94 best-of but context unclear), kNN in Reversal (72 vs 61), kNN in Stack (72 vs 73), DT in V.M. Verification (84 vs 81). Note: as recorded here, the Pattern Matching and Stack figures do not show the baseline ahead (87 is below 94, where 94 may be a best-of rather than an average ICL score, and 72 is below 73), so these two comparisons should be re-checked against Table 1.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Word salad prompts eventually reach equivalence with description-based prompts in the limit.", 385 "evidence": "Table 6 and Section 6.1: word salad matched non-salad best-of to within σ or σ/2 in most tasks. Slopes of 11±4.6 for word salad vs 4.4±2.2 for description. Figure 5 shows convergence.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Autoregression can distinguish data features from lexical relations, but cannot fully identify feature relations within the data.", 390 "evidence": "Section 7: word salad reaches equivalence (data features learned despite random words); SoT still achieves above-random scores (data features distinguishable); but task inconsistency and OOD brittleness show inability to learn feature relations.", 391 "supported": "moderate" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No statistical significance tests", 397 "detail": "All comparative claims ('CoT is most sensitive to OOD', 'Mixtral improved the most') are based on comparing raw numbers and OLS slopes without formal significance testing. Given the large scale (1.89M predictions), tests would be easy to compute and would strengthen the claims." 
398 }, 399 { 400 "flag": "Single deterministic run per configuration", 401 "detail": "Temperature=0 makes each call nominally deterministic, but it also means every result comes from a single run per configuration. No analysis of whether results change with different random states (e.g., different exemplar orderings beyond the one ablation, API non-determinism despite temp=0)." 402 }, 403 { 404 "flag": "Undisclosed conflict of interest", 405 "detail": "Author is affiliated with Microsoft while evaluating Microsoft's Phi-3.5 model alongside competitors. This conflict is not acknowledged in the paper." 406 }, 407 { 408 "flag": "Possible version typo", 409 "detail": "Table 4 lists both GPT-4 Turbo and GPT-4o with version 'GPT-4-0125', which appears to be the same version string. This may be a typo that could affect reproducibility." 410 } 411 ], 412 "cited_papers": [ 413 { 414 "title": "Language models are few-shot learners", 415 "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"], 416 "year": 2020, 417 "relevance": "Foundational paper on in-context learning capabilities of LLMs, central reference for the claims being evaluated." 418 }, 419 { 420 "title": "Chain-of-thought prompting elicits reasoning in large language models", 421 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 422 "year": 2022, 423 "relevance": "Key prompting technique evaluated in this study; CoT shown to be brittle to OOD." 424 }, 425 { 426 "title": "Quantifying memorization across neural language models", 427 "authors": ["Nicholas Carlini", "Daphne Ippolito", "Matthew Jagielski"], 428 "year": 2023, 429 "relevance": "Data contamination in LLMs—relevant to the survey's contamination assessment category." 430 }, 431 { 432 "title": "Awes, laws, and flaws from today's LLM research", 433 "authors": ["Adrian de Wynter"], 434 "year": 2025, 435 "relevance": "Meta-research on LLM evaluation methodology quality—directly relevant to the survey's scope." 
436 }, 437 { 438 "title": "Quantifying language models' sensitivity to spurious features in prompt design", 439 "authors": ["Melanie Sclar", "Yejin Choi", "Yulia Tsvetkov", "Alane Suhr"], 440 "year": 2024, 441 "relevance": "Prompt sensitivity evaluation for LLMs—key related work on evaluation methodology." 442 }, 443 { 444 "title": "Many-shot in-context learning", 445 "authors": ["Rishabh Agarwal", "Avi Singh", "Lei M Zhang"], 446 "year": 2024, 447 "relevance": "Expanding shots improves ICL performance; independently confirms findings of this paper." 448 }, 449 { 450 "title": "Faith and fate: Limits of transformers on compositionality", 451 "authors": ["Nouha Dziri", "Ximing Lu", "Melanie Sclar"], 452 "year": 2023, 453 "relevance": "LLM accuracy decay with task complexity—key finding expanded by this paper." 454 }, 455 { 456 "title": "Are emergent abilities of large language models a mirage?", 457 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 458 "year": 2023, 459 "relevance": "Challenges emergent ability claims in LLMs—relevant to evaluation methodology and overclaiming." 460 }, 461 { 462 "title": "Neural networks and the Chomsky Hierarchy", 463 "authors": ["Grégoire Délétang", "Anian Ruoss", "Jordi Grau-Moya"], 464 "year": 2023, 465 "relevance": "Evaluates neural networks on formal language tasks from Chomsky hierarchy—directly comparable methodology." 466 }, 467 { 468 "title": "The expressive power of transformers with chain of thought", 469 "authors": ["William Merrill", "Ashish Sabharwal"], 470 "year": 2024, 471 "relevance": "Theoretical analysis of transformer capabilities with CoT—relevant to understanding ICL limitations." 472 }, 473 { 474 "title": "Position: Stop making unscientific AGI performance claims", 475 "authors": ["Patrick Altmeyer", "Andrew M. Demetriou", "Antony Bartlett", "Cynthia C. S. 
Liem"], 476 "year": 2024, 477 "relevance": "Meta-research critique of LLM evaluation claims—directly relevant to survey's methodology assessment." 478 }, 479 { 480 "title": "GSM-symbolic: Understanding the limitations of mathematical reasoning in large language models", 481 "authors": ["Seyed Iman Mirzadeh", "Keivan Alizadeh", "Hooman Shahrokhi"], 482 "year": 2025, 483 "relevance": "Benchmark evaluating LLM reasoning limitations—relevant to understanding ICL capability boundaries." 484 } 485 ] 486 }