scan.json (26888B)
1 { 2 "paper": { 3 "title": "Disaggregation Reveals Hidden Training Dynamics: The Case of Agreement Attraction", 4 "authors": [ 5 "James A. Michaelov", 6 "Catherine Arnett" 7 ], 8 "year": 2025, 9 "venue": "NeurIPS 2025", 10 "arxiv_id": "2510.24934", 11 "doi": "10.48550/arXiv.2510.24934" 12 }, 13 "scan_version": 3, 14 "active_modules": [ 15 "experimental_rigor", 16 "data_leakage" 17 ], 18 "methodology_tags": [ 19 "benchmark-eval" 20 ], 21 "key_findings": "Disaggregating language model performance on subject-verb agreement benchmarks by condition reveals hidden 'breakthroughs' in training that are invisible in aggregate metrics. Models first learn frequency-based heuristics (preferring more common verb forms), then become sensitive to local context (bigram-like behavior producing agreement attraction effects), and finally improve overall. These phases proceed through rapid non-monotonic transitions rather than the gradual improvement suggested by aggregate scores.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "Section 2 (Procedure) states: 'We release all code in the following repository: https://github.com/jmichaelov/sv-disaggregation-cognitive-interpretability.'" 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper uses publicly available datasets: BIG-bench Subject-Verb Agreement task subsets and stimuli from Bock and Cutting (1992) as preprocessed by Arehalli and Linzen (2020). The PolyPythia models are also publicly released." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No environment specifications, requirements.txt, or dependency versions are mentioned in the paper." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "While code is released, the paper does not include step-by-step reproduction instructions. 
The procedure section describes the method conceptually but not as a reproducible recipe." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": true, 49 "justification": "Figures 1-6 include shading reflecting '95% confidence intervals' as stated in the figure captions." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper makes comparative claims about performance differences across conditions and training phases but does not report any statistical significance tests. Differences are described qualitatively from visual inspection of plots." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "No effect sizes are reported. Performance differences are described qualitatively (e.g., 'sharp increase', 'drop in accuracy') without quantifying magnitudes." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "No justification for why 10 random seeds per model size is sufficient, nor any power analysis. The choice of 5 model sizes is not justified." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": true, 69 "justification": "The PolyPythia suite provides 10 random seeds per model size, and the 95% confidence interval shading in all figures reflects variance across seeds." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "The aggregate (mean across conditions) score serves as the baseline comparison point. The paper's core contribution is showing how disaggregated condition-level results differ from this aggregate." 77 }, 78 "baselines_contemporary": { 79 "applies": false, 80 "answer": false, 81 "justification": "This is not a system comparison paper proposing a new method against prior methods. It analyzes training dynamics of existing models on existing benchmarks." 
82 }, 83 "ablation_study": { 84 "applies": false, 85 "answer": false, 86 "justification": "No system with components to ablate. The paper is an analysis study, not a system proposal." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": false, 91 "justification": "Only accuracy (whether the model assigns higher log-probability to the correct verb form) is used. The paper explicitly notes in the conclusion it uses 'a simplistic evaluation metric such as accuracy.'" 92 }, 93 "human_evaluation": { 94 "applies": false, 95 "answer": false, 96 "justification": "Human evaluation is irrelevant to this paper's claims about model training dynamics." 97 }, 98 "held_out_test_set": { 99 "applies": false, 100 "answer": false, 101 "justification": "No model training or tuning is performed. The study evaluates pre-existing model checkpoints on fixed datasets." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "This is the core contribution. Results are broken down by verb type (be vs. single-token vs. multi-token), by individual verb (Appendix B), by random seed (Appendix C), and by experimental condition (singular/plural target × attractor type)." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "The paper discusses where models fail: the agreement attraction effect (mismatched attractor conditions), the initial frequency bias phase, and the non-monotonic accuracy drops during training. Smaller models show less stable patterns." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper reports that disaggregation reveals performance decreases on certain conditions during training (e.g., mismatched attractor conditions get worse before they get better), and that smaller models show less stable/reliable patterns." 
117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims that disaggregation reveals distinct training phases corresponding to specific heuristics. Figures 1-6 clearly show these phases: frequency bias → local context sensitivity → generalization." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": false, 128 "justification": "The paper makes causal-style claims ('models appear to become sensitive to the preceding word', 'transformers overfit their predictions to token unigram probability, then bigram probability') based on observational analysis of training curves. These are presented as explanations, not rigorously tested causal hypotheses. The paper acknowledges this: 'it may be premature to draw any strong conclusions... without further confirmatory analyses.'" 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": true, 133 "justification": "The Limitations section explicitly states: 'we only investigate language model performance on English subject-verb agreement, and only consider attractors occurring within prepositional phrases' and acknowledges the models used (PolyPythia only). The paper explicitly bounds its scope." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": true, 138 "justification": "Section 4 discusses whether n-gram-like behavior is the explanation or whether models develop 'a more general ability to make predictions based on an increasingly long context,' notes this 'is a question for future work,' and discusses the construct validity implications of bigram-solvable tasks." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper measures accuracy on subject-verb agreement and discusses what this does and does not tell us about grammatical knowledge. 
Section 4 explicitly discusses whether n-gram statistics vs. generalized grammatical rules explain the behavior, acknowledging the gap between the metric and the broader claim." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "The paper specifies PolyPythia models (van der Wal et al., 2024), which are specific re-releases of Pythia models (Biderman et al., 2023) at sizes 14M to 410M parameters with 10 random seeds each. These are precisely identified model artifacts." 151 }, 152 "prompts_provided": { 153 "applies": false, 154 "answer": false, 155 "justification": "No prompting is used. The paper calculates log-probabilities of verb forms given their context, which is a direct model probability query, not a prompting task." 156 }, 157 "hyperparameters_reported": { 158 "applies": false, 159 "answer": false, 160 "justification": "No training or fine-tuning is performed, and no generation hyperparameters (temperature, etc.) are relevant since the paper only computes log-probabilities." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. This is a direct model evaluation study." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 2 describes which dataset subsets are used, how the Bock and Cutting stimuli were preprocessed by Arehalli and Linzen (2020), how simple agreement sentences were created by removing prepositional phrases, and how multi-token verbs are handled (sum vs. normalized log-probabilities)." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "A dedicated 'Limitations' section appears between Section 4 (Discussion) and Section 5 (Conclusions), discussing three specific limitations." 
178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "The Limitations section identifies study-specific threats: (1) only English subject-verb agreement with PP attractors, (2) only PolyPythia models because no other suite has the required combination of sizes, seeds, and checkpoints, (3) the work is exploratory and conclusions may be premature without confirmatory analysis." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "The Limitations section explicitly states what was NOT tested: other languages, other attractor types, other model families. It also acknowledges the exploratory nature of the work." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "Code is released, datasets used are public (BIG-bench, Arehalli and Linzen stimuli), and model checkpoints (PolyPythia) are publicly available. All raw inputs are independently accessible." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 2 (Datasets) describes exactly which datasets are used, their sources, how stimuli were constructed, and what modifications were made (e.g., removing prepositional phrases to create simple agreement sentences)." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data sources are standard public benchmarks and pre-trained model checkpoints." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "Section 2 (Procedure) documents the pipeline: calculate log-probability of each verb following its context, compare correct vs. incorrect forms, separate single-token and multi-token verbs, define multi-token probability as sum of token log-probabilities." 
210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Acknowledgments section states: 'James Michaelov was supported by a grant from the Andrew W. Mellon foundation (#2210-13947) during the writing of this paper.'" 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are listed: James A. Michaelov (MIT), Catherine Arnett (EleutherAI). EleutherAI produced the Pythia/PolyPythia models used in the study." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": true, 226 "justification": "The Andrew W. Mellon Foundation is a general research funder with no stake in the specific outcomes of this NLP study." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests statement is present. Catherine Arnett is affiliated with EleutherAI which produced the models being evaluated, but no conflict-of-interest disclosure addresses this." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "The paper does not state the training data cutoff for the Pythia/PolyPythia models, though it references The Pile (Gao et al., 2020) indirectly via the verb frequency table. No explicit cutoff date is given." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of whether the BIG-bench subject-verb agreement stimuli or the Bock and Cutting (1992) stimuli could appear in The Pile training data." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "BIG-bench was published online and could be in The Pile or its derivatives. The paper does not discuss this contamination risk." 
249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No inference cost or compute time is reported, despite evaluating 50 model variants (5 sizes × 10 seeds) across many training checkpoints." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No total compute budget stated. The paper uses pre-trained checkpoints so training cost is not its responsibility, but inference cost across all checkpoints is not reported." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": true, 304 "justification": "The PolyPythia suite provides 10 random seeds per model size, and Appendix C shows seed-level plots. 
Section 3 notes 'There is also some variation by random seed (Section C).'" 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": true, 309 "justification": "Section 2 states: 'These are a set ten random seeds of each Pythia model from 14M to 410M parameters.' The number of seeds (10) per size is explicit." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": false, 313 "answer": false, 314 "justification": "No hyperparameter tuning is performed. The study evaluates pre-trained models with a fixed evaluation procedure." 315 }, 316 "best_config_selection_justified": { 317 "applies": false, 318 "answer": false, 319 "justification": "No configuration selection is performed. All model sizes and seeds are evaluated." 320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "No statistical tests are performed, so no multiple comparison correction is applied, even though the paper makes many implicit comparisons across conditions, model sizes, and training steps." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": false, 328 "answer": false, 329 "justification": "The paper does not propose a new system to compare against baselines. It analyzes existing models on existing benchmarks." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": false, 333 "answer": false, 334 "justification": "Compute budget differences are not relevant — the paper analyzes training dynamics rather than proposing a more efficient method." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": true, 339 "justification": "Section 4 explicitly discusses construct validity: 'if a task is solvable based on bigram statistics, it may indicate that the task may not have sufficient construct validity' and discusses the implications of BLiMP subtasks being solvable by 5-grams."
340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "No scaffolding is involved in this study." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of whether BIG-bench stimuli or the Bock and Cutting stimuli were available during training data collection for The Pile." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the evaluation setup leaks information. The minimal pair design provides both correct and incorrect forms, but the paper does not discuss whether this introduces any evaluation artifacts." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether training data contains similar subject-verb agreement patterns to the test stimuli (which it almost certainly does, being drawn from English text)." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No leakage detection or prevention method is used." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Aggregated metrics of performance hide interpretable patterns in the trajectory of language models' grammatical knowledge over the course of training.", 373 "evidence": "Figure 1 shows that while aggregate accuracy shows slow gradual improvement, condition-level accuracy reveals rapid non-monotonic changes beginning far earlier in training (Sections 3-4).", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Models initially learn to assign higher probability to the more frequent form of the verb (frequency heuristic).", 378 "evidence": "Sections 3-4: 'is' is preferred over 'are' initially ('is' is more frequent); plural bare forms are preferred over singular inflected forms for other verbs.
Frequency data from The Pile provided in Table 1 (Appendix A) confirms the frequency ordering matches the observed bias.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "After the frequency phase, models become sensitive to local context (preceding word), producing agreement attraction effects.", 383 "evidence": "Section 3: sharp increase in accuracy for simple plural conditions and conditions with matching attractors, with a corresponding decrease for mismatched attractor conditions around steps 128-512 (Figure 1).", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "These training phases may correspond to models learning unigram, then bigram, then trigram statistics.", 388 "evidence": "Section 4 proposes this as a 'possible explanation,' citing Chang et al. (2024) on transformers overfitting to n-gram probabilities. The multi-token verb delay (requiring trigram sensitivity) is consistent with this account, but the account is explicitly not confirmed.", 389 "supported": "weak" 390 }, 391 { 392 "claim": "Learning of grammatical rules proceeds in a sequence of 'hidden breakthroughs' rather than being sudden or gradual.", 393 "evidence": "Section 5 (Conclusions): the disaggregated results show rapid phase transitions at the condition level that are invisible in aggregate metrics. Supported by Figures 1-6 across model sizes and seeds.", 394 "supported": "moderate" 395 } 396 ], 397 "red_flags": [ 398 { 399 "flag": "No statistical tests", 400 "detail": "All claims about differences between conditions and training phases are based on visual inspection of plots. No statistical tests are used to confirm that observed differences are significant, despite having 10 seeds to compute statistics." 401 }, 402 { 403 "flag": "Contamination risk unaddressed", 404 "detail": "BIG-bench subject-verb agreement stimuli were publicly available online and could appear in The Pile training corpus. This is never discussed."
405 }, 406 { 407 "flag": "EleutherAI affiliation undisclosed as conflict", 408 "detail": "Co-author is affiliated with EleutherAI, which produced the Pythia/PolyPythia models being evaluated. While this is disclosed as an affiliation, it is not discussed as a potential conflict of interest." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models", 414 "authors": [ 415 "Aarohi Srivastava et al." 416 ], 417 "year": 2023, 418 "relevance": "BIG-bench benchmark used as the primary dataset; major LLM capability evaluation suite." 419 }, 420 { 421 "title": "Emergent Abilities of Large Language Models", 422 "authors": [ 423 "Jason Wei", 424 "Yi Tay", 425 "Rishi Bommasani" 426 ], 427 "year": 2022, 428 "relevance": "Key paper on emergent abilities debate that this paper directly engages with." 429 }, 430 { 431 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 432 "authors": [ 433 "Rylan Schaeffer", 434 "Brando Miranda", 435 "Sanmi Koyejo" 436 ], 437 "year": 2023, 438 "relevance": "Counter-argument on emergent abilities; this paper provides new evidence relevant to the sudden vs. gradual learning debate." 439 }, 440 { 441 "title": "Embers of autoregression show how large language models are shaped by the problem they are trained to solve", 442 "authors": [ 443 "R. Thomas McCoy", 444 "Shunyu Yao", 445 "Dan Friedman" 446 ], 447 "year": 2024, 448 "relevance": "Argues LLMs are shaped by surface-level heuristics; directly relevant to this paper's findings about frequency and n-gram heuristics." 449 }, 450 { 451 "title": "Dissociating language and thought in large language models", 452 "authors": [ 453 "Kyle Mahowald", 454 "Anna A. Ivanova", 455 "Idan A. Blank" 456 ], 457 "year": 2024, 458 "relevance": "Argues contemporary LLMs show linguistic competence; this paper investigates how that competence is acquired." 
459 }, 460 { 461 "title": "Can Language Models Handle Recursively Nested Grammatical Structures? A Case Study on Comparing Models and Humans", 462 "authors": [ 463 "Andrew Lampinen" 464 ], 465 "year": 2024, 466 "relevance": "Shows even large models like Chinchilla fail at difficult grammatical tasks; directly motivates this study." 467 }, 468 { 469 "title": "PolyPythias: Stability and Outliers across Fifty Language Model Pre-Training Runs", 470 "authors": [ 471 "Oskar van der Wal" 472 ], 473 "year": 2024, 474 "relevance": "Provides the PolyPythia model suite used in this study; relevant to understanding training stability and reproducibility." 475 }, 476 { 477 "title": "Hidden Breakthroughs in Language Model Training", 478 "authors": [ 479 "Sophia Kangaslahti", 480 "Elan Rosenfeld", 481 "Naomi Saphra" 482 ], 483 "year": 2025, 484 "arxiv_id": "2506.15872", 485 "relevance": "Introduces the 'hidden breakthroughs' concept that this paper's findings support and extend." 486 }, 487 { 488 "title": "Characterizing Learning Curves During Language Model Pre-Training: Learning, Forgetting, and Stability", 489 "authors": [ 490 "Tyler A. Chang", 491 "Zhuowen Tu", 492 "Benjamin K. Bergen" 493 ], 494 "year": 2024, 495 "relevance": "Documents n-gram overfitting progression during training, providing a possible mechanistic explanation for this paper's findings." 496 } 497 ], 498 "engagement_factors": { 499 "practical_relevance": { 500 "score": 1, 501 "justification": "The disaggregation methodology could inform how practitioners evaluate LM training, but requires significant adaptation beyond this narrow grammatical domain." 502 }, 503 "surprise_contrarian": { 504 "score": 2, 505 "justification": "The main finding that smooth aggregate learning curves hide rapid non-monotonic phase transitions is counterintuitive and challenges the gradual-vs-sudden learning debate." 506 }, 507 "fear_safety": { 508 "score": 0, 509 "justification": "No safety, security, or risk angle whatsoever." 
510 }, 511 "drama_conflict": { 512 "score": 0, 513 "justification": "No controversy, no company challenges; a purely academic contribution to an ongoing scientific debate." 514 }, 515 "demo_ability": { 516 "score": 1, 517 "justification": "Code is released on GitHub but requires setting up PolyPythia models and running evaluation scripts, so it cannot be tried out quickly." 518 }, 519 "brand_recognition": { 520 "score": 1, 521 "justification": "MIT and EleutherAI are recognized in ML circles but are not household names; the NeurIPS venue adds credibility but the topic is niche." 522 } 523 } 524 }