scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26229B)
      1 {
      2   "paper": {
      3     "title": "The Mechanistic Emergence of Symbol Grounding in Language Models",
      4     "authors": ["Shuyu Wu", "Ziqiao Ma", "Xiaoxi Luo", "Yidong Huang", "Josue Torres-Fonseca", "Freda Shi", "Joyce Chai"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2510.13796",
      8     "doi": "10.48550/arXiv.2510.13796"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Symbol grounding emerges in Transformers and Mamba-2 (state-space models) trained from scratch with next-token prediction, but not in unidirectional LSTMs. Grounding concentrates in middle-layer computations and is implemented through aggregate attention heads that retrieve environmental tokens to predict linguistic forms. The grounding effect goes beyond simple co-occurrence statistics, as R² between co-occurrence and grounding information gain declines while grounding continues to increase. These findings replicate across child-directed speech, caption-grounded dialogue, and image-grounded dialogue settings.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Two GitHub repositories are provided: https://github.com/Mars-tin/TraBank (model training) and https://github.com/Mars-tin/PyChildes (CHILDES processing), listed in Appendix B."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets: CHILDES corpus (MacWhinney, 2000) and Visual Dialog (Das et al., 2017) with MSCOCO images. These are standard public datasets."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Appendix B lists hyperparameters but not software dependencies or library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While hyperparameters and training details are provided in Appendix B, there are no step-by-step reproduction instructions or README with commands to run experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results are averaged over 5 random seeds but no confidence intervals or error bars are shown on the main figures (Figures 2-4). Only point estimates of surprisal and information gain are plotted."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Statistical significance tests are used for causal intervention experiments, with p < 0.001 (***) reported in Table 2 and Figure 7 comparing intervention vs. control surprisal."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported as surprisal differences between intervention and control conditions (e.g., Table 2: 5.62 vs 4.76 at step 20000) and as grounding information gain values, providing magnitude context."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The choice of 100 target nouns, 10 context templates per word, and 5 random seeds is stated but not justified with power analysis or explanation of why these numbers are sufficient."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Results are stated as averaged over 5 seeds but no standard deviation, IQR, or any spread measure is reported. The reader cannot assess result stability across seeds."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper includes comparison across architectures (Transformer, Mamba-2, LSTM) and uses match vs. mismatch conditions as experimental vs. control comparisons. Random head zeroing serves as control for causal interventions."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Mamba-2 (Dao & Gu, 2024) is a recent state-space model architecture. The Transformer baseline follows GPT-2 style. LSTM serves as a negative control. Architecture choices are appropriate."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Causal interventions (zeroing out gather vs. aggregate heads) serve as ablation, showing that aggregate heads are causally important while gather heads are not (Table 2, Section 5.3)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: surprisal, grounding information gain, R² correlation with co-occurrence, saliency scores, and tuned lens probing accuracy."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant to this mechanistic interpretability study. The claims are about internal model computations, not output quality."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Test examples are explicitly constructed separately from training data using context templates (Table 1, Section 3.2). The evaluation protocol uses distinct match/mismatch test conditions."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by architecture (4/12/18-layer Transformer, 4/12-layer Mamba-2, LSTM), by dataset (CHILDES, caption-grounded, image-grounded), and by head type (gather vs. aggregate)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "LSTM is discussed as a failure case where grounding does not emerge (Section 4.1, Figure 2d). The image-grounded setting shows smaller effects than caption-grounded (Section 4.3). Gather head interventions show no significant effect."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Negative results include: LSTM fails to ground (Section 4.1), gather head interventions have no significant effect (Table 2), and image-grounded dialogue shows weaker grounding than caption-based (Section 4.3)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about grounding concentrating in middle layers (Figure 5), aggregate mechanism (Figure 6, Table 2), replication across architectures (Figures 2-3), and failure in LSTMs (Figure 2d) are all supported by results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims are supported by causal interventions (zeroing out attention heads) with matched random-head controls and significance tests (Section 5.3, Table 2). The controlled experimental design with match/mismatch conditions also supports causal inference."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The Discussion section (Section 6) explicitly discusses limitations of generalization to full-scale VLMs, noting 'systematic detection and causal evaluation of such heads at scale remains an open challenge.' Findings are bounded to the tested architectures and settings."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The co-occurrence alternative explanation is substantively addressed in Section 4.2 with R² analysis showing grounding diverges from co-occurrence statistics. Section 6 discusses philosophical alternatives to their grounding interpretation."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper carefully defines grounding information gain as a proxy metric (Section 3.2), with explicit formalization distinguishing the measurement (surprisal difference) from the broader claim (symbol grounding). The paper discusses what grounding means philosophically in Section 6."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Models are trained from scratch, so versions are not applicable in the API sense. Architecture details are fully specified: GPT-2 style Transformer (4/12/18 layers), Mamba-2, LSTM. DINOv2 is specified as the vision encoder. LLaVA-1.5-7B is named for the case study."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use LLM prompting. Models are trained from scratch with standard causal language modeling. The context templates (Appendix A) are the evaluation inputs, which are fully provided."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Comprehensive hyperparameters are reported in Appendix B.2: learning rate, schedule, warmup steps, hidden size, batch size, weight decay, gradient clipping, betas for all three model types."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. This is a mechanistic interpretability study with models trained from scratch."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data preprocessing is documented: token selection procedure (Appendix A.1), word list construction from CDI intersection with CHILDES frequency (Section 3.1), context template construction (Appendix A), and the ENV/LAN tokenization scheme (Section 3.1)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 (Discussions) contains substantive discussion of limitations, particularly regarding generalization to full-scale VLMs and the challenges of systematic detection at scale."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats are discussed: full-scale VLMs use CLIP embeddings with language priors (confounding visual grounding), redundant artifact tokens may store global information rather than object-centric features, and computational cost makes systematic intervention at scale difficult (Section 6)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states that 'systematic detection and causal evaluation of such heads at scale remains an open challenge' and that full-scale VLM analysis is 'anecdotal case studies' rather than principled understanding (Section 6)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "CHILDES data is publicly available via TalkBank, Visual Dialog and MSCOCO are public datasets. Code repositories are provided for data processing."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data sources are described in detail: CHILDES annotation types (local events, action tiers, situational tiers) in Section 3.1, Visual Dialog structure, and MSCOCO image usage. Word selection procedure documented in Appendix A."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The study uses public datasets (CHILDES, Visual Dialog, MSCOCO) and trains models from scratch."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The data pipeline is documented: source corpora → ENV/LAN token separation → custom word-level tokenizer → training chunks of 512 tokens → match/mismatch test construction. Mismatch generation for images uses Stable Diffusion 2 inpainting (Section 4.3)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is disclosed in the Acknowledgement section: NSF IIS-1949634, NSF SES-2128623, NSERC RGPIN-2024-04395, Weinberg Cognitive Science Fellowship, Vector Scholarship, Canada CIFAR AI Chair."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All author affiliations are disclosed: University of Michigan, University of Waterloo, Vector Institute, UNC at Chapel Hill."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funders are government agencies (NSF, NSERC) and academic institutions (CIFAR, Vector Institute) with no financial stake in the specific outcomes about symbol grounding emergence."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Models are trained from scratch on specific corpora (CHILDES, Visual Dialog). There is no pre-trained model whose training cutoff is relevant. The LLaVA case study is supplementary."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Models are trained from scratch with controlled data, and test examples are explicitly constructed separately. No pre-trained model benchmark evaluation where contamination is a concern."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — the evaluation uses custom-constructed test conditions, not public benchmarks that could have been seen during pre-training."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost or evaluation time is reported for the saliency analysis, tuned lens probing, or causal interventions."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Appendix B.3 states: 'Each Transformer, Mamba2, and LSTM model is trained on a single A40 GPU within 5 hours. For VLM models, training is conducted on 2 A40 GPUs over 15 hours.'"
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "All experiments are repeated with 5 random seeds (42, 142, 242, 342, 442) as stated in Section 3.3 and Appendix B.2. However, no spread measures across seeds are reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Explicitly stated: '5 random seeds, randomizing both model initialization and corpus shuffle order' (Section 3.3)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. Hyperparameters are listed but it's unclear how they were selected or whether any search was conducted."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No justification for how the specific hyperparameter configurations were selected. Different hyperparameters are used for Transformer vs. Mamba-2 (e.g., learning rate 5e-5 vs 4e-4) without explanation."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Multiple significance tests are performed across checkpoints and head types (Table 2, Figure 7) without any correction for multiple comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "The paper does not compare against external baselines or re-implementations. All models are trained from scratch by the authors for a novel task, so the Lucic et al. bias concern doesn't apply."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "Compute differences between architectures are negligible (all trained on same hardware for similar time). The comparison is about architectural mechanisms, not compute scaling."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 6 discusses construct validity: the paper addresses what 'grounding' means relative to Harnad's definition, distinguishes their causal approach from correlational measures in prior work, and discusses whether their proxy (surprisal reduction) captures genuine grounding."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. Models are evaluated directly without agentic scaffolding."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Models are trained from scratch on controlled corpora, and test examples are explicitly constructed separately from training data. The temporal separation is inherent in the experimental design."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "The ENV/LAN tokenization explicitly prevents surface-form leakage: 'book⟨ENV⟩ and book⟨LAN⟩ are treated as distinct tokens with separate integer indices' (Section 3.1). The paper also tests whether co-occurrence statistics explain results (Section 4.2)."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The paper does not explicitly discuss whether test context templates share structural similarities with training data or whether independence between training and test examples is ensured beyond the template construction."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "The mismatch/control condition serves as a built-in leakage detection: if grounding were due to memorization or leakage, the mismatch condition would also show low surprisal. The co-occurrence R² analysis (Section 4.2) is a concrete detection method."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Symbol grounding emerges in Transformers and Mamba-2 trained from scratch with next-token prediction, but not in unidirectional LSTMs.",
    365       "evidence": "Figures 2a-2d show surprisal separation between match and mismatch conditions for Transformers and Mamba-2 but not LSTM. Grounding information gain increases for Transformers and Mamba-2 but remains near zero for LSTM (Figures 3a-3d).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Grounding goes beyond simple co-occurrence statistics.",
    370       "evidence": "R² between co-occurrence and grounding information gain peaks early then declines while grounding continues to increase (Figures 3a-3c). LSTM shows increasing R² but no grounding gain (Figure 3d).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Grounding is concentrated in middle-layer computations and implemented through aggregate attention heads.",
    375       "evidence": "Saliency analysis shows peak ground-to-symbol flow in layers 7-9 (Figure 5a). Tuned lens shows surprisal drops from layer 7 (Figure 5b). Causal intervention: zeroing aggregate heads significantly increases surprisal (p < 0.001) while zeroing gather heads does not (Table 2).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "The grounding mechanism generalizes to multimodal dialogue settings with captions and images.",
    380       "evidence": "Caption-grounded and image-grounded dialogue show similar surprisal gaps and R² patterns (Figure 4). Causal interventions on VLM aggregate heads show significant surprisal increase (Figure 7).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Aggregate heads in LLaVA-1.5-7B also exhibit grounding behavior consistent with the findings.",
    385       "evidence": "Figure 1b shows an aggregate head in LLaVA-1.5-7B, but Section 6 acknowledges this is anecdotal and systematic detection at scale remains an open challenge.",
    386       "supported": "weak"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No variance across seeds reported",
    392       "detail": "Despite running 5 seeds, the paper does not report standard deviation, confidence intervals, or any spread measure. The reader cannot assess how stable the grounding emergence pattern is across runs."
    393     },
    394     {
    395       "flag": "GPT-4o-mini used for context template generation",
    396       "detail": "Context templates for CHILDES evaluation were generated by gpt-4o-mini (Appendix A.1), introducing potential bias in the evaluation setup. While human verification is mentioned, this may not fully mitigate systematic patterns in LLM-generated templates."
    397     }
    398   ],
    399   "cited_papers": [
    400     {
    401       "title": "Emergent abilities of large language models",
    402       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    403       "year": 2022,
    404       "relevance": "Foundational work on emergent capabilities in LLMs, directly relevant to understanding what capabilities arise from scale."
    405     },
    406     {
    407       "title": "Are emergent abilities of large language models a mirage?",
    408       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    409       "year": 2023,
    410       "relevance": "Challenges the emergence narrative, arguing apparent emergent abilities are measurement artifacts."
    411     },
    412     {
    413       "title": "In-context learning and induction heads",
    414       "authors": ["Catherine Olsson", "Nelson Elhage"],
    415       "year": 2022,
    416       "relevance": "Mechanistic interpretability work on attention head specialization in Transformers, directly related to the aggregate head finding."
    417     },
    418     {
    419       "title": "A mathematical framework for transformer circuits",
    420       "authors": ["Nelson Elhage", "Neel Nanda", "Catherine Olsson"],
    421       "year": 2021,
    422       "relevance": "Foundational mechanistic interpretability framework for understanding Transformer computations."
    423     },
    424     {
    425       "title": "Understanding the skill gap in recurrent models: The role of the gather-and-aggregate mechanism",
    426       "authors": ["Aviv Bick", "Eric P. Xing", "Albert Gu"],
    427       "year": 2025,
    428       "relevance": "Proposes the gather-and-aggregate mechanism that this paper identifies as implementing symbol grounding."
    429     },
    430     {
    431       "title": "Retrieval head mechanistically explains long-context factuality",
    432       "authors": ["Wenhao Wu", "Yizhong Wang", "Guangxuan Xiao"],
    433       "year": 2025,
    434       "arxiv_id": "2505.15105",
    435       "relevance": "Shows retrieval heads are critical for reasoning and long-context performance in Transformers."
    436     },
    437     {
    438       "title": "Locating and editing factual associations in GPT",
    439       "authors": ["Kevin Meng", "David Bau", "Alex J Andonian"],
    440       "year": 2022,
    441       "relevance": "Mechanistic interpretability work on factual recall circuits in LLMs."
    442     },
    443     {
    444       "title": "Label words are anchors: An information flow perspective for understanding in-context learning",
    445       "authors": ["Lean Wang", "Lei Li", "Damai Dai"],
    446       "year": 2023,
    447       "relevance": "Shows how attention mechanisms aggregate information for in-context learning predictions."
    448     },
    449     {
    450       "title": "Transformers are SSMs: Generalized models and efficient algorithms through structured state space duality",
    451       "authors": ["Tri Dao", "Albert Gu"],
    452       "year": 2024,
    453       "relevance": "Mamba-2 architecture used as a comparison architecture to test whether grounding emerges in non-Transformer models."
    454     },
    455     {
    456       "title": "Eliciting latent predictions from transformers with the tuned lens",
    457       "authors": ["Nora Belrose", "Zach Furman", "Logan Smith"],
    458       "year": 2023,
    459       "arxiv_id": "2303.08112",
    460       "relevance": "Interpretability tool used in this paper to probe layer-wise representations for grounding signals."
    461     }
    462   ]
    463 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs