scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25732B)
      1 {
      2   "paper": {
      3     "title": "Meta-Learning Transformers to Improve In-Context Generalization",
      4     "authors": ["Lorenzo Braccaioli", "Anna Vettoruzzo", "Prabhant Singh", "Joaquin Vanschoren", "Mohamed-Rafik Bouguelia", "Nicola Conci"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2507.05019",
      8     "doi": "10.48550/arXiv.2507.05019"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "The paper states 'code is provided as supplementary material of this submission and will be made publicly available upon acceptance of the paper' (Sect. 3.3). No working URL is provided — a promise of future release counts as NO."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper uses Meta-Album, a publicly available benchmark downloaded via the OpenML library (openml==0.14.2), and standard public datasets (CIFAR-fs, CUB, Aircraft, etc.). All datasets are publicly accessible."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper mentions Python and an NVIDIA A100-SXM4 GPU (Appendix B.3) but does not provide a requirements.txt, Dockerfile, or detailed library versions beyond openml==0.14.2."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions are provided. Training details are spread across Sect. 3.3 and Appendix B.3, but there are no runnable commands or scripts described."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Results throughout the paper report ± notation (e.g., '73.34 ± 1.34' in Tab. 7), indicating uncertainty from multiple runs."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper makes numerous comparative claims (GEOM vs GEOM-M vs GEOM-IN) but relies solely on comparing point estimates with standard deviations. No statistical significance tests are reported."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports absolute accuracy differences relative to its baselines throughout (e.g., 'accuracy increases by 26.1%, 9.4%, and 10.9%' in Sect. 5.1 for Manufacturing domain). Reporting these percentage improvements alongside the baseline results provides sufficient context."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "No justification is given for why 3 runs were chosen, nor is there any power analysis or discussion of whether 3 runs provide sufficient statistical power for the claims made."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Results are reported as 'average across three complete runs of the algorithms' with standard deviations (± notation) throughout all tables."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Multiple baselines are compared: GEOM vs GEOM-M (merged datasets) vs GEOM-IN (ImageNet-1k training), plus offline baselines and CAMeLU for the unsupervised scenario."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "CAML (2024) and CAMeLU (2025) are the primary baselines, representing the current state of the art in non-causal in-context learning for few-shot classification."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Multiple ablation-style experiments: number of datasets (Sect. 5.2), classes vs images (Sect. 5.3), dataset sizes Micro/Mini/Extended (Tab. 2), curriculum orderings (Sect. 6.3), label noise robustness (Appendix D.1)."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "Only classification accuracy is used as the evaluation metric throughout the paper. No other metrics (e.g., F1, precision, recall, calibration) are reported."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Human evaluation is irrelevant for few-shot image classification accuracy on standard benchmarks."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Clear separation between training and test data: LOO excludes entire domains, class splits ensure no overlap ('{ytrain} ∩ {ytest} = ∅', Sect. 3.2), and external datasets (CIFAR-fs, CUB, etc.) are used for evaluation."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results are reported per-dataset and per-domain throughout the paper (e.g., Fig. 2, Tab. 7), with all 30 Meta-Album datasets individually reported."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "The paper discusses domains where GEOM underperforms GEOM-IN (e.g., Large Animals due to overlap, Manufacturing due to low-level features, Sect. 5.1) and the E2H curriculum failure pattern (Sect. 6.3.1)."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Several negative results: GEOM-IN outperforms GEOM in overlapping domains, E2H curriculum performance deteriorates over time (Fig. 8), overfitting observed on Mini size (Sect. 5.3, Fig. 4), and early training stages show negative BWT (forgetting, Fig. 6)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims about comparable/improved generalization from small-scale datasets are supported by Fig. 2 and Tab. 7. Claims about sequential learning resilience to forgetting are supported by BWT analysis in Sect. 6.2. Unsupervised generalization is supported by Fig. 10."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper makes causal claims via ablation design: single-variable manipulations (number of datasets, classes vs images, dataset ordering). The LOO design with controlled single-variable changes is adequate for the claims made."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The paper bounds claims to the Meta-Album benchmark and tested external datasets. The title says 'In-Context Generalization' not general AI capability. Claims are specific to the few-shot image classification setting."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The paper discusses ImageNet overlap as an alternative explanation for GEOM-IN performance (Sect. 5.1, Appendix B.2), feature extractor influence (tested with CLIP in Tab. 8, and from-scratch training in Tab. 14), and low-level features vs class diversity (Sect. 5.2-5.3)."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper measures classification accuracy on few-shot tasks and claims generalization in terms of that same metric. The measurements match the claims — no proxy gap exists."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The model architecture is fully specified: ResNet-50 pre-trained on ImageNet-1k as feature extractor, 8-layer transformer encoder with 8 attention heads, MLP with 3072 reverse bottleneck, feature size 2304 (Sect. 3.3, Appendix B.3). This is not an API-based model where version drift is a concern."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "The paper does not use prompting. GEOM is a trained transformer architecture, not an LLM prompted with text."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix B.3 reports: 300,000 iterations, Adam optimizer, learning rate 10^-5, warmup cosine scheduler, 500 iterations per epoch, 5-way 5-shot, class encoder dimension 256, and unsupervised augmentation parameters (λ ∼ Beta(1,1), λ ∈ (0, 0.5))."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used. GEOM is a standard meta-learning framework."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix B.1 documents: images upscaled from 128×128 to 224×224 for ResNet-50, dataset splits (80%/20% for streaming, class-disjoint), and the OpenML library version used for downloading (openml==0.14.2). Dataset sizes and class distributions are detailed in Tab. 4-5."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "There is no dedicated limitations section. Sect. 8 (Conclusions and future work) mentions some future directions but does not substantively discuss limitations of the current work."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No threats to validity are discussed. The paper does not address potential issues such as dependence on the ImageNet-pretrained feature extractor, restriction to image classification, or the small number of runs (3)."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what the results do NOT show. Future work mentions extending to causal transformers but does not frame current limitations as explicit scope boundaries."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "All datasets are publicly available via OpenML (Meta-Album) and standard benchmarks (CIFAR-fs, CUB, Aircraft, etc.). Raw data can be independently obtained."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Data sources are thoroughly described: Meta-Album benchmark (Ullah et al., 2022) with 30 datasets across 10 domains, downloaded via openml==0.14.2. External datasets referenced with standard splits from prior work (Appendix B.1)."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data sources are standard public benchmarks."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The pipeline is documented: download via OpenML → upscale to 224×224 → class-disjoint train/test split → episodic task sampling with probability proportional to dataset size. For unsupervised: augmentation pipeline and mixup procedure described in Appendix B.3."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding information or acknowledgments section is present in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: University of Trento, Eindhoven University, and University of Doha for Science and Technology."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding is disclosed, so independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests or financial interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "The paper trains its own in-context learner on Meta-Album (on top of an ImageNet-pretrained ResNet-50 feature extractor) — it does not evaluate a pre-trained model's knowledge on a benchmark. Contamination in the LLM sense does not apply."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Same as above — no pre-trained model being evaluated for benchmark knowledge. The paper does discuss ImageNet/Meta-Album class overlap (Appendix B.2), but this is about the feature extractor, not training data contamination."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Not applicable — the paper trains its own model and controls the train/test split explicitly via LOO and class-disjoint splits."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "No inference cost or latency is reported. The paper does not state how long inference takes per task or per episode."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "The paper mentions using an NVIDIA A100-SXM4 40GB GPU (Appendix B.3) but does not quantify total GPU hours, training time, or computational budget for any of the many experiments."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Results are reported as 'average across three complete runs of the algorithms' with standard deviations throughout all tables, showing sensitivity to random initialization."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Explicitly stated: 'Results show the average across three complete runs of the algorithms' in table captions throughout the paper."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search budget is reported. The paper uses fixed hyperparameters without stating whether these were tuned or how many configurations were tried."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Appendix B.3 states: 'the best-performing model is saved as the one resulting in the highest validation accuracy across 50,000 new tasks.' Selection criterion is clear and uses validation data."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The paper makes many comparisons across 30 datasets, multiple baselines, and multiple curricula without any correction for multiple comparisons. No statistical tests are performed at all."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors implement all baselines (GEOM-M, GEOM-IN, CAMeLU) themselves without acknowledging potential author-evaluation bias."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "GEOM-IN trains on ImageNet-1k (1.28M images) while GEOM uses Meta-Album Mini (163K images) — a ~8x difference in training data size. Compute implications are not discussed or controlled for."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "The paper discusses what Meta-Album measures and why it was chosen (Sect. 4): 'diverse and comprehensive suite of datasets tailored for few-shot learning, transfer learning, and meta-learning research.' The paper also validates with external datasets (Tab. 2)."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No scaffolding is involved — this is a direct model architecture comparison."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "The paper explicitly addresses ImageNet-1k overlap with Meta-Album (Sect. 5.1, Appendix B.2), including both exact label matching and CLIP concept similarity analysis. This is the primary leakage concern for the feature extractor."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "The paper addresses feature leakage via the frozen ImageNet-pretrained feature extractor by testing with CLIP (Tab. 8) and training from scratch (Tab. 14) to confirm results are not artifacts of the feature extractor."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": true,
    351         "justification": "The LOO design ensures entire domains are excluded from training, and class-disjoint splits ensure '{ytrain} ∩ {ytest} = ∅' (Sect. 3.2). Meta-Album's design ensures minimal cross-domain overlap."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "Appendix B.2 applies two concrete detection methods: (1) exact label matching between ImageNet-1k and Meta-Album, and (2) CLIP embedding cosine similarity with a threshold at the median 90th percentile (0.83). Results visualized in Fig. 11-12."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Training on multiple small-scale domain-specific datasets (GEOM) achieves comparable or better cross-domain generalization than training on a single merged dataset (GEOM-M) or large-scale ImageNet-1k (GEOM-IN).",
    363       "evidence": "Fig. 2 and Tab. 7 show GEOM performs comparably or better than GEOM-M across Meta-Album, and outperforms GEOM-IN in domains with minimal class overlap (Sect. 5.1).",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "Increasing the number of classes (not images) is the primary driver of improved in-context generalization.",
    368       "evidence": "Tab. 2 shows a larger improvement from Micro→Mini (more classes, same images/class) than from Mini→Extended (more images, same classes) across 6 external datasets (Sect. 5.3).",
    369       "supported": "moderate"
    370     },
    371     {
    372       "claim": "Sequential training does not cause catastrophic forgetting and can even improve performance on previously seen domains.",
    373       "evidence": "BWT analysis (Fig. 6, Tab. 3) shows positive BWT values in later training stages, indicating improved performance on earlier domains as new domains are introduced (Sect. 6.2).",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Hard-to-easy (H2E) curriculum ordering outperforms easy-to-hard (E2H) and domain-based ordering in the sequential setting.",
    378       "evidence": "Fig. 7 and Tab. 13 show H2E achieves the highest average accuracy across all 30 datasets. The learning trend in Fig. 8 illustrates the mechanism.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Unsupervised training on diverse small-scale datasets (GEOM-U) outperforms unsupervised training on ImageNet-1k (CAMeLU).",
    383       "evidence": "Fig. 10 and Tab. 17 show GEOM-U outperforms CAMeLU on most Meta-Album datasets, with CAMeLU only winning in Large Animals (high ImageNet overlap).",
    384       "supported": "moderate"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval"],
    388   "key_findings": "Training in-context learners on collections of small-scale, domain-specific datasets (GEOM) achieves comparable or better cross-domain generalization than training on a single large merged dataset, with class diversity being more important than dataset size. In sequential learning, meta-trained transformers show positive backward transfer rather than catastrophic forgetting, and hard-to-easy curriculum ordering yields the best results. Even in fully unsupervised settings, diverse small-scale datasets outperform large-scale ImageNet-1k training for in-context few-shot classification.",
    389   "red_flags": [
    390     {
    391       "flag": "No statistical significance tests",
    392       "detail": "The paper makes numerous comparative claims across 30 datasets and multiple baselines but never performs any statistical significance test. Differences are assessed by visually comparing means ± std, which is insufficient given that the mean ± std ranges overlap in many comparisons."
    393     },
    394     {
    395       "flag": "Only 3 runs",
    396       "detail": "All results are averaged over only 3 runs. Given the variance visible in some results (e.g., '53.50 ± 2.97'), 3 runs may be insufficient to draw reliable conclusions, especially without significance tests."
    397     },
    398     {
    399       "flag": "Feature extractor confound",
    400       "detail": "The ImageNet-pretrained ResNet-50 feature extractor is frozen in most experiments. This creates a confound for GEOM-IN, which benefits from matching the feature extractor's training distribution. The paper partially addresses this with CLIP and from-scratch experiments, but the from-scratch results are substantially lower and not the primary basis for claims."
    401     },
    402     {
    403       "flag": "No limitations section",
    404       "detail": "The paper lacks any dedicated limitations or threats-to-validity discussion, which is a significant omission for a paper making broad generalization claims."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Language models are few-shot learners",
    410       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    411       "year": 2020,
    412       "relevance": "Foundational paper on in-context learning in large language models, directly relevant to understanding ICL capabilities."
    413     },
    414     {
    415       "title": "Data distributional properties drive emergent in-context learning in transformers",
    416       "authors": ["Stephanie Chan", "Adam Santoro", "Andrew Lampinen"],
    417       "year": 2022,
    418       "relevance": "Studies what data properties enable in-context learning emergence, relevant to understanding training data requirements for LLM capabilities."
    419     },
    420     {
    421       "title": "Leak, cheat, repeat: Data contamination and evaluation malpractices in closed-source LLMs",
    422       "authors": ["Simone Balloccu", "Patrícia Schmidtová", "Mateusz Lango", "Ondřej Dušek"],
    423       "year": 2024,
    424       "relevance": "Directly relevant to benchmark contamination and evaluation integrity concerns in LLM research."
    425     },
    426     {
    427       "title": "Are emergent abilities of large language models a mirage?",
    428       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    429       "year": 2023,
    430       "relevance": "Challenges emergent abilities narrative in LLMs, relevant to understanding LLM capability claims."
    431     },
    432     {
    433       "title": "MetaICL: Learning to learn in context",
    434       "authors": ["Sewon Min", "Mike Lewis", "Luke Zettlemoyer", "Hannaneh Hajishirzi"],
    435       "year": 2022,
    436       "relevance": "Combines meta-learning with in-context learning for NLP, directly relevant to the LLM meta-learning paradigm."
    437     },
    438     {
    439       "title": "PaLM: Scaling language modeling with pathways",
    440       "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
    441       "year": 2023,
    442       "relevance": "Major LLM scaling study with data mixture strategies relevant to understanding training data quality effects."
    443     },
    444     {
    445       "title": "The Llama 3 herd of models",
    446       "authors": ["Abhimanyu Dubey", "Abhinav Jauhri"],
    447       "year": 2024,
    448       "arxiv_id": "2407.21783",
    449       "relevance": "State-of-the-art open LLM with large-scale pretraining, relevant to understanding current LLM training paradigms."
    450     },
    451     {
    452       "title": "Rethinking the role of demonstrations: What makes in-context learning work?",
    453       "authors": ["Sewon Min", "Xinxi Lyu", "Ari Holtzman"],
    454       "year": 2022,
    455       "relevance": "Analyzes what aspects of demonstrations matter for in-context learning, directly relevant to ICL capability research."
    456     }
    457   ]
    458 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs