scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23152B)
      1 {
      2   "scan_version": 3,
      3   "active_modules": [
      4     "experimental_rigor"
      5   ],
      6   "paper": {
      7     "title": "Data Distributional Properties Drive Emergent In-Context Learning in Transformers",
      8     "authors": [
      9       "Stephanie C.Y. Chan",
     10       "Adam Santoro",
     11       "Andrew K. Lampinen",
     12       "Jane X. Wang",
     13       "Aaditya K. Singh",
     14       "Pierre H. Richemond",
     15       "James L. McClelland",
     16       "Felix Hill"
     17     ],
     18     "year": 2022,
     19     "venue": "NeurIPS 2022",
     20     "arxiv_id": "2205.05055"
     21   },
     22   "methodology_tags": [
     23     "benchmark-eval"
     24   ],
     25   "checklist": {
     26     "artifacts": {
     27       "code_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Code released at https://github.com/deepmind/emergent_in_context_learning, stated in footnote 1."
     31       },
     32       "data_released": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Uses the publicly available Omniglot dataset (Lake et al., 2019, MIT License)."
     36       },
     37       "environment_specified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No requirements.txt, Dockerfile, or detailed environment specification provided. Hardware mentioned (TPU v2/v3) but no software environment details."
     41       },
     42       "reproduction_instructions": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Paper states 'code will be released with the camera-ready version' and provides architectural details in text, but no step-by-step reproduction instructions are included in the paper itself."
     46       }
     47     },
     48     "statistical_methodology": {
     49       "confidence_intervals_or_error_bars": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Appendix A states 'error bars indicate standard deviation around the mean' and shaded regions are shown in all figures."
     53       },
     54       "significance_tests": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No statistical significance tests are reported. Claims of difference between conditions (e.g., burstiness levels, architectures) are based on visual comparison of curves without formal tests."
     58       },
     59       "effect_sizes_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Results are presented as accuracy curves and bar plots, but no formal effect sizes (Cohen's d, etc.) are reported. Accuracy values are shown, but they are not framed as effect sizes relative to a baseline."
     63       },
     64       "sample_size_justified": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Number of seeds (3 or 5 depending on experiment) is stated but not justified. No power analysis or discussion of whether this is sufficient."
     68       },
     69       "variance_reported": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Standard deviation across runs shown as shaded error bars in all figures. Appendix A: 'experiments were run with 3 seeds each... all other experiments were run with 5 runs each.'"
     73       }
     74     },
     75     "evaluation_design": {
     76       "baselines_included": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Multiple baseline conditions included: non-bursty training, different numbers of classes, uniform vs Zipfian distributions, and RNN/LSTM architectures as comparison points."
     80       },
     81       "baselines_contemporary": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Comparisons are against standard architectures (transformer, LSTM, vanilla RNN) matched on parameters. The paper's contribution is mechanistic understanding, not outperformance, so these are appropriate."
     85       },
     86       "ablation_study": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The entire paper is essentially a series of ablations: varying burstiness (Fig 2), number of classes (Fig 3), label multiplicity (Fig 4), within-class variation (Fig 5), and Zipf exponent (Fig 6)."
     90       },
     91       "multiple_metrics": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Two complementary metrics: in-context learning accuracy on holdout classes and in-weights learning accuracy on trained classes. Also multi-class vs two-way evaluation (Appendix C.4)."
     95       },
     96       "human_evaluation": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "This is a controlled experiment on synthetic training regimes for transformers; human evaluation is not relevant."
    100       },
    101       "held_out_test_set": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "In-context learning evaluated on holdout image classes never seen in training (Section 2.3). Training/holdout split explicitly described."
    105       },
    106       "per_category_breakdown": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Results broken down by condition (burstiness level, number of classes, Zipf exponent, architecture type) across multiple figures. Zipfian experiments separate common vs rare class performance (Fig 6d-e)."
    110       },
    111       "failure_cases_discussed": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Discusses cases where in-context learning fails: non-bursty training, too few classes, extreme Zipf exponents. Also discusses RNN/LSTM failure to achieve in-context learning."
    115       },
    116       "negative_results_reported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Reports tradeoff between in-context and in-weights learning (most conditions cannot achieve both). RNN/LSTM completely fail at in-context learning. High Zipf exponents destroy in-context learning."
    120       }
    121     },
    122     "claims_and_evidence": {
    123       "abstract_claims_supported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Abstract claims about burstiness, rare classes, dynamic meanings, Zipfian sweet spot, and transformer-specificity are all supported by corresponding experiments (Figs 2-7)."
    127       },
    128       "causal_claims_justified": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "Causal claims ('drive', 'promote') are supported by controlled experiments that manipulate single variables (burstiness, class count, etc.) while holding others constant. This is adequate causal design."
    132       },
    133       "generalization_bounded": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "Title claims 'in Transformers' broadly but experiments use only Omniglot image classification with small transformers (831K params). Discussion section speculates about implications for LLMs and cognition without bounding these generalizations."
    137       },
    138       "alternative_explanations_discussed": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Discussion considers whether recurrent models simply have a bias towards in-weights learning (ruled out by Fig 8). Addresses the narrative that LLMs just memorize training data (Section 4). Discusses the ambiguity between in-context and in-weights strategies."
    142       },
    143       "proxy_outcome_distinction": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The paper measures in-context learning accuracy on Omniglot classification tasks with controlled distributional properties (burstiness, number of classes, Zipfian distribution). Claims match the measurement granularity: 'in-context learning emerges when training data exhibits particular distributional properties.' The paper does not overclaim to broader 'intelligence' or 'reasoning' — it stays within the measured construct."
    147       }
    148     },
    149     "setup_transparency": {
    150       "model_versions_specified": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "Paper trains its own models from scratch; no pre-trained model versions to specify."
    154       },
    155       "prompts_provided": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "No prompting used. Models are trained from scratch on image-label sequences."
    159       },
    160       "hyperparameters_reported": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "Appendix A details: 12 layers, embedding dim 64, 8 heads, ResNet architecture, Adam optimizer, learning rate 3e-4 with warmup to 4000 steps, 500k training steps. Hyperparameter sweep details in Appendix C.1."
    164       },
    165       "scaffolding_described": {
    166         "applies": false,
    167         "answer": false,
    168         "justification": "No agentic scaffolding used."
    169       },
    170       "data_preprocessing_documented": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 2.1 describes sequence construction in detail: context structure (8 image-label pairs), bursty vs non-bursty sequence construction, how rotated/flipped images were generated for 12800 classes."
    174       }
    175     },
    176     "limitations_and_scope": {
    177       "limitations_section_present": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No dedicated limitations or threats-to-validity section. The Discussion mentions future directions but does not systematically discuss limitations."
    181       },
    182       "threats_to_validity_specific": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No specific threats to validity discussed. The paper does not address whether Omniglot results generalize to language, or limitations of the small model scale."
    186       },
    187       "scope_boundaries_stated": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No explicit statements about what the results do NOT show. Discussion section speculates broadly about implications for language models, cognition, and neuroscience without bounding the scope."
    191       }
    192     },
    193     "data_integrity": {
    194       "raw_data_available": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Omniglot dataset is publicly available. Code for generating training sequences is released."
    198       },
    199       "data_collection_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "Section 2.1 describes how training sequences are constructed from Omniglot: image-label pair sequences, bursty/non-bursty composition, class sampling procedures."
    203       },
    204       "recruitment_methods_described": {
    205         "applies": false,
    206         "answer": false,
    207         "justification": "No human participants. Data is a standard public benchmark (Omniglot)."
    208       },
    209       "data_pipeline_documented": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Full pipeline from Omniglot images to training sequences documented: embedding (ResNet for images, standard embedding for labels), sequence construction, evaluation protocol."
    213       }
    214     },
    215     "conflicts_of_interest": {
    216       "funding_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Acknowledgments state: 'This work was funded by DeepMind.'"
    220       },
    221       "affiliations_disclosed": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "All author affiliations listed: 7 of 8 authors are at DeepMind (1 of them jointly with Stanford) and 1 is at UCL."
    225       },
    226       "funder_independent_of_outcome": {
    227         "applies": true,
    228         "answer": true,
    229         "justification": "DeepMind funded the work. The paper studies basic science of in-context learning mechanisms; DeepMind does not have a direct financial stake in whether burstiness or Zipfian distributions drive in-context learning."
    230       },
    231       "financial_interests_declared": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No competing interests or financial interests statement found in the paper."
    235       }
    236     },
    237     "contamination": {
    238       "training_cutoff_stated": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "Models are trained from scratch on controlled synthetic data distributions; no pre-trained model evaluated on a benchmark."
    242       },
    243       "train_test_overlap_discussed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "Models trained from scratch on Omniglot subsets with explicit train/holdout splits. Not evaluating a pre-trained model's knowledge."
    247       },
    248       "benchmark_contamination_addressed": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No pre-trained model evaluated on an external benchmark. Contamination is structurally impossible in this experimental design."
    252       }
    253     },
    254     "human_studies": {
    255       "pre_registered": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants."
    259       },
    260       "irb_or_ethics_approval": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants."
    264       },
    265       "demographics_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       },
    270       "inclusion_exclusion_criteria": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants."
    274       },
    275       "randomization_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants."
    279       },
    280       "blinding_described": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants."
    284       },
    285       "attrition_reported": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "No human participants."
    289       }
    290     },
    291     "cost_and_practicality": {
    292       "inference_cost_reported": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "This is a basic science paper studying emergent properties, not proposing a practical method with inference costs."
    296       },
    297       "compute_budget_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Appendix A: '500k training steps on 16 TPU v2 or v3 cores.' Appendix C.1: 90 runs total for architecture comparison."
    301       }
    302     },
    303     "experimental_rigor": {
    304       "seed_sensitivity_reported": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "Appendix A: experiments run with 3 or 5 seeds. Error bars (std dev) shown across seeds in all figures."
    308       },
    309       "number_of_runs_stated": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Appendix A: 'experiments shown in Figs 5 and 6 were run with 3 seeds each... all other experiments were run with 5 runs each.' Architecture comparison: 15 runs per architecture (90 total)."
    313       },
    314       "hyperparameter_search_budget": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Appendix C.1: hyperparameter sweep over 15 samples of learning rate (log-uniform [1e-5, 0.1]) and 15 samples of warmup steps (log-uniform [1, 10000]), 15 runs per architecture."
    318       },
    319       "best_config_selection_justified": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "For the architecture comparison, all hyperparameter sweep runs are shown (each color = one run), which is transparent. However, for main experiments the fixed hyperparameters are not justified as optimal."
    323       },
    324       "multiple_comparison_correction": {
    325         "applies": false,
    326         "answer": false,
    327         "justification": "No statistical significance tests performed, so multiple comparison correction is not applicable."
    328       },
    329       "self_comparison_bias_addressed": {
    330         "applies": false,
    331         "answer": false,
    332         "justification": "The paper does not compare against other researchers' systems. It trains and evaluates its own models under different conditions, so self-comparison bias in the Lucic et al. sense does not apply."
    333       },
    334       "compute_budget_vs_performance": {
    335         "applies": true,
    336         "answer": true,
    337         "justification": "Architecture comparisons match transformer, RNN, and LSTM on number of parameters, depth, and hidden size (Appendix C.1). Training steps are equalized across conditions."
    338       },
    339       "benchmark_construct_validity": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "The paper carefully defines what in-context learning means (holdout classes with re-assigned labels) vs in-weights learning (trained classes without context support), providing clear construct validity for their evaluation measures."
    343       },
    344       "scaffold_confound_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "The paper trains models from scratch and evaluates them directly. No scaffolding or multi-model comparison through different tool frameworks is involved. The architectural comparison (transformer vs RNN) is a controlled experiment, not a scaffolding confound."
    348       }
    349     }
    350   },
    351   "claims": [
    352     {
    353       "claim": "In-context learning emerges when training data exhibits burstiness — higher proportion of bursty sequences leads to stronger in-context learning.",
    354       "evidence": "Figure 2a shows accuracy on holdout classes increasing with P(bursty) from 0.0 to 1.0 across training steps.",
    355       "supported": "strong"
    356     },
    357     {
    358       "claim": "In-context learning and in-weights learning exhibit a tradeoff in most training conditions.",
    359       "evidence": "Figures 2-5 consistently show that conditions promoting in-context learning reduce in-weights learning and vice versa.",
    360       "supported": "strong"
    361     },
    362     {
    363       "claim": "A Zipfian distribution with exponent ~1 allows both in-context and in-weights learning to coexist.",
    364       "evidence": "Figure 6c-d shows Zipf exponent = 1 maintains high accuracy on both holdout (in-context) and common trained classes (in-weights).",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Recurrent architectures (RNN, LSTM) cannot achieve in-context learning even with identical training conditions.",
    369       "evidence": "Figure 7 shows RNN and LSTM never exceed chance on in-context evaluation across all hyperparameter sweep runs, while transformers do.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "These distributional properties may explain why in-context learning emerges in large language models.",
    374       "evidence": "Discussion section draws analogy between Omniglot experimental properties and natural language properties (burstiness, Zipfian distribution, polysemy).",
    375       "supported": "weak"
    376     }
    377   ],
    378   "key_findings": "In-context learning in transformers emerges from specific distributional properties of training data: burstiness, large numbers of rare classes, and dynamic item meanings. These properties trade off in-context against in-weights learning, but a Zipfian class distribution (exponent ~1, matching natural language) enables both simultaneously. Recurrent architectures (RNN, LSTM) fail to develop in-context learning under identical conditions, indicating architecture and data distribution jointly drive this capability.",
    379   "red_flags": [
    380     {
    381       "flag": "Generalization gap",
    382       "detail": "Experiments use small transformers (~831K params) on Omniglot image classification, but discussion speculates broadly about implications for large language models, cognition, and neuroscience without acknowledging the gap between these controlled experiments and real LLM training."
    383     },
    384     {
    385       "flag": "No statistical tests",
    386       "detail": "All comparisons between conditions rely on visual inspection of curves with error bars. No formal significance tests are reported for any of the claims about one condition outperforming another."
    387     },
    388     {
    389       "flag": "No limitations section",
    390       "detail": "Paper lacks any dedicated limitations or threats-to-validity discussion despite making broad mechanistic claims."
    391     }
    392   ],
    393   "cited_papers": [
    394     {
    395       "title": "Language Models are Few-Shot Learners",
    396       "authors": [
    397         "Tom B. Brown"
    398       ],
    399       "year": 2020,
    400       "arxiv_id": "2005.14165",
    401       "relevance": "Foundational demonstration of in-context learning in GPT-3, the phenomenon this paper mechanistically explains."
    402     },
    403     {
    404       "title": "Attention is All you Need",
    405       "authors": [
    406         "Ashish Vaswani"
    407       ],
    408       "year": 2017,
    409       "relevance": "Introduced the transformer architecture whose in-context learning properties this paper studies."
    410     },
    411     {
    412       "title": "Rethinking the Role of Demonstrations: What Makes In-Context Learning Work?",
    413       "authors": [
    414         "Sewon Min"
    415       ],
    416       "year": 2022,
    417       "arxiv_id": "2202.12837",
    418       "relevance": "Challenges whether LLMs genuinely perform in-context learning; this paper provides counter-evidence."
    419     },
    420     {
    421       "title": "An Explanation of In-context Learning as Implicit Bayesian Inference",
    422       "authors": [
    423         "Sang Michael Xie"
    424       ],
    425       "year": 2021,
    426       "arxiv_id": "2111.02080",
    427       "relevance": "Theoretical framework for understanding in-context learning that this paper's findings complement."
    428     },
    429     {
    430       "title": "Impact of Pretraining Term Frequencies on Few-Shot Reasoning",
    431       "authors": [
    432         "Yasaman Razeghi"
    433       ],
    434       "year": 2022,
    435       "relevance": "Studies relationship between training data frequency and in-context performance, directly related to this paper's distributional analysis."
    436     },
    437     {
    438       "title": "Meta-Learning with Memory-Augmented Neural Networks",
    439       "authors": [
    440         "Adam Santoro"
    441       ],
    442       "year": 2016,
    443       "relevance": "Established meta-training paradigm for few-shot learning that this paper contrasts with emergent in-context learning."
    444     },
    445     {
    446       "title": "Deep Residual Learning for Image Recognition",
    447       "authors": [
    448         "Kaiming He"
    449       ],
    450       "year": 2015,
    451       "arxiv_id": "1512.03385",
    452       "relevance": "ResNet architecture used as the image encoder in this paper's experimental setup."
    453     }
    454   ],
    455   "engagement_factors": {
    456     "practical_relevance": {
    457       "score": 1,
    458       "justification": "Offers theoretical insight into why in-context learning works that could inform dataset curation, but no immediately usable tool or technique."
    459     },
    460     "surprise_contrarian": {
    461       "score": 2,
    462       "justification": "The finding that data distribution (not just scale or architecture) drives in-context learning, and that Zipf exponent ~1 is a sweet spot, challenges the 'just scale up' narrative."
    463     },
    464     "fear_safety": {
    465       "score": 0,
    466       "justification": "No safety, security, or risk angle discussed."
    467     },
    468     "drama_conflict": {
    469       "score": 0,
    470       "justification": "No controversy, company rivalry, or challenge to specific claims — a constructive mechanistic study."
    471     },
    472     "demo_ability": {
    473       "score": 1,
    474       "justification": "Code released on GitHub but requires TPU training runs on Omniglot, not a quick demo."
    475     },
    476     "brand_recognition": {
    477       "score": 2,
    478       "justification": "From DeepMind with a Stanford co-author (McClelland), published at NeurIPS 2022."
    479     }
    480   }
    481 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs