scan-v5.json (24296B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Disaggregation Reveals Hidden Training Dynamics: The Case of Agreement Attraction", 6 "authors": [ 7 "James A. Michaelov", 8 "Catherine Arnett" 9 ], 10 "year": 2025, 11 "venue": "NeurIPS 2025 (arXiv preprint)", 12 "arxiv_id": "2510.24934", 13 "doi": "10.48550/arXiv.2510.24934" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "All abstract claims are supported: language models do struggle with agreement in complex structures (prior work cited), disaggregation over training reveals phased learning, and models show heuristic-based behavior early in training.", 21 "source": "haiku" 22 }, 23 "causal_claims_justified": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper observes temporal sequence in learning phases and relates them to corpus statistics (Appendix A verb frequencies). The inference that models first learn frequent forms, then context-sensitivity, is supported by evidence, though the paper acknowledges this is exploratory not confirmatory.", 27 "source": "haiku" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": false, 32 "justification": "The abstract and discussion claim the approach is 'a powerful tool for understanding language model behavior more generally,' but the paper only evaluates English subject-verb agreement with prepositional phrase attractors. The title's 'case of' qualifier is undercut by broader claims.", 33 "source": "haiku" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": true, 38 "justification": "Paper engages with competing hypotheses: sudden vs. gradual learning debate (Wei et al., Schaeffer et al., Kangaslahti et al.), n-gram vs. 
longer-range dependencies ('is a question for future work'), and positions findings within these frameworks.", 39 "source": "haiku" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "Clear distinction between proxy (log-probability assignment on minimal pairs) and claim (learning of subject-verb agreement rules). The proxy is well-established in psycholinguistics literature cited.", 45 "source": "haiku" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": true, 52 "justification": "Dedicated Limitations section clearly present after Discussion, separate from Conclusions.", 53 "source": "haiku" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": true, 58 "justification": "Three specific threats: (1) only English SVA with PP attractors, (2) only Pythia models due to checkpoint availability, (3) exploratory not confirmatory work. These are concrete, not boilerplate.", 59 "source": "haiku" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": true, 64 "justification": "Explicitly states what the work does NOT show: generalization beyond English SVA, evaluation of other languages or syntactic phenomena, confirmatory findings without further analysis.", 65 "source": "haiku" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": false, 72 "justification": "Only James Michaelov's funding is disclosed (Andrew W. Mellon Foundation). Catherine Arnett's funding status is not mentioned.", 73 "source": "haiku" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Both authors' affiliations clearly listed: MIT and EleutherAI. 
Neither has apparent financial interest in the evaluated product.", 79 "source": "haiku" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": true, 83 "answer": true, 84 "justification": "The Mellon Foundation is an independent funder. No conflict of interest evident.", 85 "source": "haiku" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests statement provided in paper.", 91 "source": "haiku" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "Subject-verb agreement defined with examples (1-2). Agreement attractor explained. Language models specified (Pythia). Terms are defined adequately for target audience.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "Two contributions explicitly stated: (1) methodological approach of disaggregating by condition over training, (2) findings about phased learning in agreement. Reader knows what is being claimed.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": "Extensive engagement with psycholinguistics (Bock & Miller), prior LM agreement work (Marvin & Linzen, Gulordava), and sudden vs. gradual learning debate (Wei et al., Kangaslahti et al.). 
Not just citations but real intellectual positioning.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 "code_released": { 119 "applies": true, 120 "answer": true, 121 "justification": "Code explicitly released: 'https://github.com/jmichaelov/sv-disaggregation-cognitive-interpretability' in Method section.", 122 "source": "haiku" 123 }, 124 "data_released": { 125 "applies": true, 126 "answer": true, 127 "justification": "Evaluation uses public benchmarks (BIG-bench Subject-Verb Agreement task, Bock & Cutting 1992 stimuli preprocessed by Arehalli & Linzen 2020). Standard public datasets.", 128 "source": "haiku" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "No requirements.txt, Dockerfile, or environment specification (Python version, dependencies) provided in the paper.", 134 "source": "haiku" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": true, 139 "justification": "Procedure clearly described: use PolyPythia models, specified datasets, calculate log-probability, compare correct vs incorrect verb form. Sufficient detail to reproduce with code access.", 140 "source": "haiku" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": true, 146 "answer": true, 147 "justification": "Figures 1-5 all explicitly show '95% confidence intervals' as shaded regions. Variance is quantified.", 148 "source": "haiku" 149 }, 150 "significance_tests": { 151 "applies": true, 152 "answer": false, 153 "justification": "No formal statistical tests (t-tests, Mann-Whitney, etc.) reported. Only visual inspection of confidence intervals and curves.", 154 "source": "haiku" 155 }, 156 "effect_sizes_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Accuracy percentages and improvements shown clearly (0-100% on Y-axis). 
Final performance levels (e.g., 75-100% vs 0-25% across conditions) are effect sizes.", 160 "source": "haiku" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "10 random seeds per model × 5 model sizes = 50 runs. No power analysis or justification for why 10 seeds is sufficient.", 166 "source": "haiku" 167 }, 168 "variance_reported": { 169 "applies": true, 170 "answer": true, 171 "justification": "Confidence intervals shown across random seeds (Appendix C), and variance is visible in the shading across all figures.", 172 "source": "haiku" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": false, 179 "justification": "No baseline comparisons or control methods. This is exploratory analysis of a single phenomenon, not a method comparison.", 180 "source": "haiku" 181 }, 182 "baselines_contemporary": { 183 "applies": false, 184 "answer": false, 185 "justification": "Not applicable.", 186 "source": "haiku" 187 }, 188 "ablation_study": { 189 "applies": true, 190 "answer": false, 191 "justification": "No systematic ablations. Analysis examines naturally varying conditions but doesn't manipulate model components.", 192 "source": "haiku" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": false, 197 "justification": "Only metric reported is accuracy (correct vs incorrect verb form). No loss, cross-entropy, confidence, or other metrics.", 198 "source": "haiku" 199 }, 200 "human_evaluation": { 201 "applies": false, 202 "answer": false, 203 "justification": "Not applicable for this analytical study.", 204 "source": "haiku" 205 }, 206 "held_out_test_set": { 207 "applies": true, 208 "answer": true, 209 "justification": "Evaluation uses standard BIG-bench test sets and published stimuli. 
Clear separation between training and evaluation.", 210 "source": "haiku" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": true, 215 "justification": "Extensive breakdowns: per-verb (Appendix B), per-seed (Appendix C), by verb type (be, other single-token, multi-token), by condition (singular, plural, with/without attractors).", 216 "source": "haiku" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Discusses where models fail: performance drops with intervening attractors, low accuracy on plural with singular attractor condition, reversals at step 512 for some verbs.", 222 "source": "haiku" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": true, 227 "justification": "Shows conditions with near-zero accuracy (e.g., plural conditions early in training), unexpected reversals, and variation across seeds indicating instability.", 228 "source": "haiku" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": true, 234 "answer": true, 235 "justification": "Exact models specified: Pythia 14M, 31M, 70M, 160M, 410M from PolyPythia (van der Wal et al. 2024) with 10 random seeds and multiple training checkpoints.", 236 "source": "haiku" 237 }, 238 "prompts_provided": { 239 "applies": false, 240 "answer": false, 241 "justification": "Not applicable. Evaluation is based on log-probability, not prompting.", 242 "source": "haiku" 243 }, 244 "hyperparameters_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "Not applicable. Models are pre-trained checkpoints. Paper references PolyPythia for training hyperparameters.", 248 "source": "haiku" 249 }, 250 "scaffolding_described": { 251 "applies": false, 252 "answer": false, 253 "justification": "Not applicable. 
No agentic scaffolding used.", 254 "source": "haiku" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": true, 259 "justification": "Preprocessing steps documented: dataset selection (BIG-bench subsets + Bock & Cutting), single-token vs multi-token verb handling, log-probability calculation, token normalization discussion (Appendix D).", 260 "source": "haiku" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": true, 267 "justification": "Uses public benchmarks (BIG-bench, Bock & Cutting 1992). Raw stimuli are publicly available from cited sources.", 268 "source": "haiku" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "Data sourced from prior published work (BIG-bench, Arehalli & Linzen 2020). Collection procedures are described in cited papers.", 274 "source": "haiku" 275 }, 276 "recruitment_methods_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "Not applicable. Uses benchmark datasets, no recruitment.", 280 "source": "haiku" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 284 "answer": true, 285 "justification": "Pipeline documented: datasets identified, single vs multi-token handling specified, log-probability calculation method described, token normalization approach discussed (Appendix D).", 286 "source": "haiku" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "Models trained on The Pile (cutoff mid-2020) but this is not explicitly stated in the paper. Training cutoff date for evaluation benchmarks vs training data not discussed.", 294 "source": "haiku" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": true, 298 "answer": false, 299 "justification": "No discussion of potential contamination. 
BIG-bench and Bock & Cutting 1992 stimuli are unlikely to be in The Pile, but this is not addressed.", 300 "source": "haiku" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": true, 304 "answer": false, 305 "justification": "No explicit discussion of benchmark contamination risk, despite The Pile being a broad web corpus.", 306 "source": "haiku" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": false, 312 "answer": false, 313 "justification": "Not applicable.", 314 "source": "haiku" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": false, 318 "answer": false, 319 "justification": "Not applicable.", 320 "source": "haiku" 321 }, 322 "demographics_reported": { 323 "applies": false, 324 "answer": false, 325 "justification": "Not applicable.", 326 "source": "haiku" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": false, 330 "answer": false, 331 "justification": "Not applicable.", 332 "source": "haiku" 333 }, 334 "randomization_described": { 335 "applies": false, 336 "answer": false, 337 "justification": "Not applicable to human studies, though 10 random seeds per model are documented.", 338 "source": "haiku" 339 }, 340 "blinding_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "Not applicable.", 344 "source": "haiku" 345 }, 346 "attrition_reported": { 347 "applies": false, 348 "answer": false, 349 "justification": "Not applicable.", 350 "source": "haiku" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 "applies": true, 356 "answer": false, 357 "justification": "No inference cost, latency, or computational requirements reported.", 358 "source": "haiku" 359 }, 360 "compute_budget_stated": { 361 "applies": true, 362 "answer": false, 363 "justification": "No total computational budget or number of inference calls reported.", 364 "source": "haiku" 365 } 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "Language models learn subject-verb agreement through distinct 
training phases, initially relying on word frequency heuristics", 372 "evidence": "Figure 1 shows models preferring frequent verb form (is over are) early in training, matching The Pile frequency statistics (Appendix A)", 373 "supported": "strong" 374 }, 375 { 376 "claim": "Models become sensitive to local context (preceding noun) in a discrete phase after frequency-based learning", 377 "evidence": "Sharp transitions in agreement attractor effect at steps 128-512 visible in Figures 1-2, with matching attractor conditions improving while mismatched conditions worsen", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Disaggregating performance by condition reveals hidden learning dynamics invisible in aggregate metrics", 382 "evidence": "Aggregate curve shows gradual improvement while condition-level analysis shows rapid non-monotonic changes. Explicitly demonstrated in Figure 1 vs. individual condition traces", 383 "supported": "strong" 384 }, 385 { 386 "claim": "Multi-token verb learning occurs later than single-token verbs due to longer dependency requirements", 387 "evidence": "Figure 1C shows multi-token verb patterns occurring later in training than Figure 1B, explained by requiring trigram sensitivity vs bigram sensitivity", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "The observed learning phases correspond to models learning increasingly complex n-gram statistics", 392 "evidence": "Discussion cites Chang et al. 2024 on transformer learning progression from unigram→bigram→trigram. 
Timing aligns but not directly tested", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Variation across random seeds indicates underlying process is not completely deterministic", 397 "evidence": "Appendix C shows seed-level plots with meaningful divergence, particularly in smaller models (14M, 31M)", 398 "supported": "moderate" 399 } 400 ], 401 "methodology_tags": [ 402 "benchmark-eval", 403 "observational" 404 ], 405 "key_findings": "By disaggregating performance over training, the authors reveal that language models learn subject-verb agreement in roughly three phases: initial preference for frequent verb forms, transition to sensitivity for preceding-word context (with attractor effects), and final improvement. Aggregate metrics hide these interpretable dynamics, and learning patterns vary by verb tokenization, with multi-token verbs requiring longer-range dependencies and learning later. The findings contribute to the debate on sudden vs. gradual learning by showing 'hidden breakthroughs' in specific conditions that underlie apparently gradual overall improvement.", 406 "red_flags": [ 407 { 408 "flag": "No formal statistical significance testing", 409 "detail": "Only confidence intervals shown visually. No t-tests or significance tests comparing conditions or model sizes." 410 }, 411 { 412 "flag": "Single phenomenon tested with broad generalization claims", 413 "detail": "Only English SVA with PP attractors, but abstract and discussion claim approach is 'powerful tool for understanding language model behavior more generally.'" 414 }, 415 { 416 "flag": "Single model family analyzed", 417 "detail": "Only Pythia evaluated. Authors acknowledge lack of comparable checkpoints in other model families, limiting generalization." 418 }, 419 { 420 "flag": "Exploratory not confirmatory", 421 "detail": "Authors explicitly state 'may be premature to draw any strong conclusions... 
without further confirmatory analyses.'" 422 }, 423 { 424 "flag": "Training/evaluation cutoff not discussed", 425 "detail": "No explicit statement of Pile training cutoff or discussion of potential benchmark contamination despite broad web corpus." 426 }, 427 { 428 "flag": "Mechanistic interpretations not directly tested", 429 "detail": "N-gram vs longer-range dependency hypothesis inferred from timing, but not directly manipulated or verified." 430 }, 431 { 432 "flag": "Seed variation not deeply analyzed", 433 "detail": "Appendix C shows meaningful variation across 10 seeds, particularly in smaller models, but this instability is not investigated." 434 }, 435 { 436 "flag": "Brief unexplained reversals at step 512", 437 "detail": "Some verbs show singular/plural preference reversal at step 512 that 'quickly reverses' — suggests possible confound or training artifact not explained." 438 }, 439 { 440 "flag": "Sample size not justified", 441 "detail": "10 random seeds chosen without power analysis or justification for sufficiency." 442 }, 443 { 444 "flag": "Only one evaluation metric", 445 "detail": "Only accuracy reported; no cross-entropy loss, token probability, confidence measures, or other metrics." 
446 } 447 ], 448 "cited_papers": [ 449 { 450 "title": "Colorless Green Recurrent Networks Dream Hierarchically", 451 "relevance": "Prior work on LM agreement errors with attractors; directly compared in this paper's analysis" 452 }, 453 { 454 "title": "Assessing the Ability of LSTMs to Learn Syntax-Sensitive Dependencies", 455 "relevance": "Foundational work on minimal pairs for testing grammatical knowledge; paradigm used in this study" 456 }, 457 { 458 "title": "Targeted Syntactic Evaluation of Language Models", 459 "relevance": "Framework for evaluating syntactic knowledge via controlled datasets; methodological precedent" 460 }, 461 { 462 "title": "BLiMP: The Benchmark of Linguistic Minimal Pairs for English", 463 "relevance": "Standard benchmark containing Subject-Verb Agreement tasks used in this evaluation" 464 }, 465 { 466 "title": "Hidden Breakthroughs in Language Model Training", 467 "relevance": "Concurrent work arguing for phase transitions in learning; directly cited as related framework" 468 }, 469 { 470 "title": "Characterizing Learning Curves During Language Model Pre-Training: Learning, Forgetting, and Stability", 471 "relevance": "Analysis of learning dynamics over training; complementary approach to understanding training phases" 472 }, 473 { 474 "title": "Broken Agreement", 475 "relevance": "Classic psycholinguistics work on agreement attraction in humans; paradigm adapted for LM analysis" 476 }, 477 { 478 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 479 "relevance": "Challenges sudden emergence narrative; paper positioned relative to gradual vs sudden learning debate" 480 } 481 ], 482 "engagement_factors": { 483 "practical_relevance": { 484 "score": 1, 485 "justification": "Academic analysis of model behavior; not directly actionable for practitioners. Understanding agreement learning phases may inform model evaluation but lacks practical application." 
486 }, 487 "surprise_contrarian": { 488 "score": 2, 489 "justification": "Challenges both 'sudden emergence' and 'purely gradual' narratives by showing phase structure with hidden breakthroughs. Modest contrarian value in the ongoing debate." 490 }, 491 "fear_safety": { 492 "score": 0, 493 "justification": "No safety implications or risk concerns raised. Analysis of grammatical learning mechanisms does not touch capability risks." 494 }, 495 "drama_conflict": { 496 "score": 1, 497 "justification": "Engages with sudden vs. gradual learning debate and offers evidence for 'hidden breakthroughs' framework. Moderate intellectual conflict." 498 }, 499 "demo_ability": { 500 "score": 2, 501 "justification": "Code released on GitHub, evaluation uses public benchmarks, procedure clearly described. Could be reproduced and extended by others." 502 }, 503 "brand_recognition": { 504 "score": 1, 505 "justification": "MIT and EleutherAI are respected institutions but not top-tier labs. Limited brand halo compared to FAIR/OpenAI/Anthropic." 506 } 507 }, 508 "hn_data": { 509 "threads": [ 510 { 511 "hn_id": "45783837", 512 "title": "Watermarking for Generative AI", 513 "points": 17, 514 "comments": 0, 515 "url": "https://news.ycombinator.com/item?id=45783837", 516 "created_at": "2025-11-01T18:04:10Z" 517 } 518 ], 519 "top_points": 17, 520 "total_points": 17, 521 "total_comments": 0 522 } 523 }