scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31502B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Disaggregation Reveals Hidden Training Dynamics: The Case of Agreement Attraction",
      6     "authors": [
      7       "James A. Michaelov",
      8       "Catherine Arnett"
      9     ],
     10     "year": 2025,
     11     "venue": "NeurIPS 2025 (arXiv)",
     12     "arxiv_id": "2510.24934",
     13     "doi": "10.48550/arXiv.2510.24934"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The abstract claims that disaggregation reveals distinct training phases corresponding to specific heuristics. Figures 1-6 clearly show these phases: frequency bias → local context sensitivity → generalization.",
     21         "source": "opus"
     22       },
     23       "causal_claims_justified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper makes causal-style claims ('models appear to become sensitive to the preceding word', 'transformers overfit their predictions to token unigram probability, then bigram probability') based on observational analysis of training curves. These are presented as explanations, not rigorously tested causal hypotheses. The paper acknowledges this: 'it may be premature to draw any strong conclusions... without further confirmatory analyses.'",
     27         "source": "opus"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The Limitations section explicitly states: 'we only investigate language model performance on English subject-verb agreement, and only consider attractors occurring within prepositional phrases' and acknowledges the models used (PolyPythia only). The paper explicitly bounds its scope.",
     33         "source": "opus"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 4 discusses whether n-gram-like behavior is the explanation or whether models develop 'a more general ability to make predictions based on an increasingly long context,' notes this 'is a question for future work,' and discusses the construct validity implications of bigram-solvable tasks.",
     39         "source": "opus"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "The paper measures accuracy on subject-verb agreement and discusses what this does and does not tell us about grammatical knowledge. Section 4 explicitly discusses whether n-gram statistics vs. generalized grammatical rules explain the behavior, acknowledging the gap between the metric and the broader claim.",
     45         "source": "opus"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "A dedicated 'Limitations' section appears between Section 4 (Discussion) and Section 5 (Conclusions), discussing three specific limitations.",
     53         "source": "opus"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The Limitations section identifies study-specific threats: (1) only English subject-verb agreement with PP attractors, (2) only PolyPythia models because no other suite has the required combination of sizes, seeds, and checkpoints, (3) the work is exploratory and conclusions may be premature without confirmatory analysis.",
     59         "source": "opus"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The Limitations section explicitly states what was NOT tested: other languages, other attractor types, other model families. It also acknowledges the exploratory nature of the work.",
     65         "source": "opus"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Acknowledgments section states: 'James Michaelov was supported by a grant from the Andrew W. Mellon foundation (#2210-13947) during the writing of this paper.'",
     73         "source": "opus"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Author affiliations are listed: James A. Michaelov (MIT), Catherine Arnett (EleutherAI). EleutherAI produced the Pythia/PolyPythia models used in the study.",
     79         "source": "opus"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The Andrew W. Mellon Foundation is a general research funder with no stake in the specific outcomes of this NLP study.",
     85         "source": "opus"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement is present. Catherine Arnett is affiliated with EleutherAI which produced the models being evaluated, but no conflict-of-interest disclosure addresses this.",
     91         "source": "opus"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms defined precisely: subject-verb agreement (grammatical rule), attractor (intervening noun), agreement attraction (error pattern), log-probability (model likelihood measure). Grounded in psycholinguistics literature.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Contribution explicitly stated: disaggregation method reveals three training phases (frequency → local context → generalized rules) hidden in aggregate metrics. Methodological contribution (analysis approach) + empirical finding (hidden breakthroughs).",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Extensive engagement: cites foundational syntactic evaluation work (Linzen, Marvin), positions relative to sudden vs. gradual learning debate (Wei, Schaeffer, Olsson), draws on psycholinguistics methods (Bock & Miller), relates to recent phase transition work (Kangaslahti et al. 2025).",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "empirical": {
    117       "artifacts": {
    118         "code_released": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Section 2 (Procedure) states: 'We release all code in the following repository: https://github.com/jmichaelov/sv-disaggregation-cognitive-interpretability.'",
    122           "source": "opus"
    123         },
    124         "data_released": {
    125           "applies": true,
    126           "answer": true,
    127           "justification": "The paper uses publicly available datasets: BIG-bench Subject-Verb Agreement task subsets and stimuli from Bock and Cutting (1992) as preprocessed by Arehalli and Linzen (2020). The PolyPythia models are also publicly released.",
    128           "source": "opus"
    129         },
    130         "environment_specified": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "No environment specifications, requirements.txt, or dependency versions are mentioned in the paper.",
    134           "source": "opus"
    135         },
    136         "reproduction_instructions": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "While code is released, the paper does not include step-by-step reproduction instructions. The procedure section describes the method conceptually but not as a reproducible recipe.",
    140           "source": "opus"
    141         }
    142       },
    143       "statistical_methodology": {
    144         "confidence_intervals_or_error_bars": {
    145           "applies": true,
    146           "answer": true,
    147           "justification": "Figures 1-6 include shading reflecting '95% confidence intervals' as stated in the figure captions.",
    148           "source": "opus"
    149         },
    150         "significance_tests": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "The paper makes comparative claims about performance differences across conditions and training phases but does not report any statistical significance tests. Differences are described qualitatively from visual inspection of plots.",
    154           "source": "opus"
    155         },
    156         "effect_sizes_reported": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No effect sizes are reported. Performance differences are described qualitatively (e.g., 'sharp increase', 'drop in accuracy') without quantifying magnitudes.",
    160           "source": "opus"
    161         },
    162         "sample_size_justified": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "No justification for why 10 random seeds per model size is sufficient, nor any power analysis. The choice of 5 model sizes is not justified.",
    166           "source": "opus"
    167         },
    168         "variance_reported": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "The PolyPythia suite provides 10 random seeds per model size, and the 95% confidence interval shading in all figures reflects variance across seeds.",
    172           "source": "opus"
    173         }
    174       },
    175       "evaluation_design": {
    176         "baselines_included": {
    177           "applies": true,
    178           "answer": true,
    179           "justification": "The aggregate (mean across conditions) score serves as the baseline comparison point. The paper's core contribution is showing how disaggregated condition-level results differ from this aggregate.",
    180           "source": "opus"
    181         },
    182         "baselines_contemporary": {
    183           "applies": false,
    184           "answer": false,
    185           "justification": "This is not a system comparison paper proposing a new method against prior methods. It analyzes training dynamics of existing models on existing benchmarks.",
    186           "source": "opus"
    187         },
    188         "ablation_study": {
    189           "applies": false,
    190           "answer": false,
    191           "justification": "No system with components to ablate. The paper is an analysis study, not a system proposal.",
    192           "source": "opus"
    193         },
    194         "multiple_metrics": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "Only accuracy (whether the model assigns higher log-probability to the correct verb form) is used. The paper explicitly notes in the conclusion it uses 'a simplistic evaluation metric such as accuracy.'",
    198           "source": "opus"
    199         },
    200         "human_evaluation": {
    201           "applies": false,
    202           "answer": false,
    203           "justification": "Human evaluation is irrelevant to this paper's claims about model training dynamics.",
    204           "source": "opus"
    205         },
    206         "held_out_test_set": {
    207           "applies": false,
    208           "answer": false,
    209           "justification": "No model training or tuning is performed. The study evaluates pre-existing model checkpoints on fixed datasets.",
    210           "source": "opus"
    211         },
    212         "per_category_breakdown": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "This is the core contribution. Results are broken down by verb type (be vs. single-token vs. multi-token), by individual verb (Appendix B), by random seed (Appendix C), and by experimental condition (singular/plural target × attractor type).",
    216           "source": "opus"
    217         },
    218         "failure_cases_discussed": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "The paper discusses where models fail: the agreement attraction effect (mismatched attractor conditions), the initial frequency bias phase, and the non-monotonic accuracy drops during training. Smaller models show less stable patterns.",
    222           "source": "opus"
    223         },
    224         "negative_results_reported": {
    225           "applies": true,
    226           "answer": true,
    227           "justification": "The paper reports that disaggregation reveals performance decreases on certain conditions during training (e.g., mismatched attractor conditions get worse before they get better), and that smaller models show less stable/reliable patterns.",
    228           "source": "opus"
    229         }
    230       },
    231       "setup_transparency": {
    232         "model_versions_specified": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "The paper specifies PolyPythia models (van der Wal et al., 2024), which are specific re-releases of Pythia models (Biderman et al., 2023) at sizes 14M to 410M parameters with 10 random seeds each. These are precisely identified model artifacts.",
    236           "source": "opus"
    237         },
    238         "prompts_provided": {
    239           "applies": false,
    240           "answer": false,
    241           "justification": "No prompting is used. The paper calculates log-probabilities of verb forms given their context, which is a direct model probability query, not a prompting task.",
    242           "source": "opus"
    243         },
    244         "hyperparameters_reported": {
    245           "applies": false,
    246           "answer": false,
    247           "justification": "No training or fine-tuning is performed, and no generation hyperparameters (temperature, etc.) are relevant since the paper only computes log-probabilities.",
    248           "source": "opus"
    249         },
    250         "scaffolding_described": {
    251           "applies": false,
    252           "answer": false,
    253           "justification": "No agentic scaffolding is used. This is a direct model evaluation study.",
    254           "source": "opus"
    255         },
    256         "data_preprocessing_documented": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Section 2 describes which dataset subsets are used, how the Bock and Cutting stimuli were preprocessed by Arehalli and Linzen (2020), how simple agreement sentences were created by removing prepositional phrases, and how multi-token verbs are handled (sum vs. normalized log-probabilities).",
    260           "source": "opus"
    261         }
    262       },
    263       "data_integrity": {
    264         "raw_data_available": {
    265           "applies": true,
    266           "answer": true,
    267           "justification": "Code is released, datasets used are public (BIG-bench, Arehalli and Linzen stimuli), and model checkpoints (PolyPythia) are publicly available. All raw inputs are independently accessible.",
    268           "source": "opus"
    269         },
    270         "data_collection_described": {
    271           "applies": true,
    272           "answer": true,
    273           "justification": "Section 2 (Datasets) describes exactly which datasets are used, their sources, how stimuli were constructed, and what modifications were made (e.g., removing prepositional phrases to create simple agreement sentences).",
    274           "source": "opus"
    275         },
    276         "recruitment_methods_described": {
    277           "applies": false,
    278           "answer": false,
    279           "justification": "No human participants. Data sources are standard public benchmarks and pre-trained model checkpoints.",
    280           "source": "opus"
    281         },
    282         "data_pipeline_documented": {
    283           "applies": true,
    284           "answer": true,
    285           "justification": "Section 2 (Procedure) documents the pipeline: calculate log-probability of each verb following its context, compare correct vs. incorrect forms, separate single-token and multi-token verbs, define multi-token probability as sum of token log-probabilities.",
    286           "source": "opus"
    287         }
    288       },
    289       "contamination": {
    290         "training_cutoff_stated": {
    291           "applies": true,
    292           "answer": false,
    293           "justification": "The paper does not state the training data cutoff for the Pythia/PolyPythia models, though it references The Pile (Gao et al., 2020) indirectly via the verb frequency table. No explicit cutoff date is given.",
    294           "source": "opus"
    295         },
    296         "train_test_overlap_discussed": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "No discussion of whether the BIG-bench subject-verb agreement stimuli or the Bock and Cutting (1992) stimuli could appear in The Pile training data.",
    300           "source": "opus"
    301         },
    302         "benchmark_contamination_addressed": {
    303           "applies": true,
    304           "answer": false,
    305           "justification": "BIG-bench was published online and could be in The Pile or its derivatives. The paper does not discuss this contamination risk.",
    306           "source": "opus"
    307         }
    308       },
    309       "human_studies": {
    310         "pre_registered": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "No human participants in this study.",
    314           "source": "opus"
    315         },
    316         "irb_or_ethics_approval": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants.",
    320           "source": "opus"
    321         },
    322         "demographics_reported": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants.",
    326           "source": "opus"
    327         },
    328         "inclusion_exclusion_criteria": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "opus"
    333         },
    334         "randomization_described": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "opus"
    339         },
    340         "blinding_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "opus"
    345         },
    346         "attrition_reported": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "opus"
    351         }
    352       },
    353       "cost_and_practicality": {
    354         "inference_cost_reported": {
    355           "applies": true,
    356           "answer": false,
    357           "justification": "No inference cost or compute time is reported, despite evaluating 50 model variants (5 sizes × 10 seeds) across many training checkpoints.",
    358           "source": "opus"
    359         },
    360         "compute_budget_stated": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "No total compute budget stated. The paper uses pre-trained checkpoints so training cost is not its responsibility, but inference cost across all checkpoints is not reported.",
    364           "source": "opus"
    365         }
    366       },
    367       "experimental_rigor": {
    368         "seed_sensitivity_reported": {
    369           "applies": true,
    370           "answer": true,
    371           "justification": "The PolyPythia suite provides 10 random seeds per model size, and Appendix C shows seed-level plots. Section 3 notes 'There is also some variation by random seed (Section C).'",
    372           "source": "opus"
    373         },
    374         "number_of_runs_stated": {
    375           "applies": true,
    376           "answer": true,
    377           "justification": "Section 2 states: 'These are a set ten random seeds of each Pythia model from 14M to 410M parameters.' The number of seeds (10) per size is explicit.",
    378           "source": "opus"
    379         },
    380         "hyperparameter_search_budget": {
    381           "applies": false,
    382           "answer": false,
    383           "justification": "No hyperparameter tuning is performed. The study evaluates pre-trained models with a fixed evaluation procedure.",
    384           "source": "opus"
    385         },
    386         "best_config_selection_justified": {
    387           "applies": false,
    388           "answer": false,
    389           "justification": "No configuration selection is performed. All model sizes and seeds are evaluated.",
    390           "source": "opus"
    391         },
    392         "multiple_comparison_correction": {
    393           "applies": true,
    394           "answer": false,
    395           "justification": "No statistical tests are performed, so no multiple comparison correction is applied, despite making many implicit comparisons across conditions, model sizes, and training steps.",
    396           "source": "opus"
    397         },
    398         "self_comparison_bias_addressed": {
    399           "applies": false,
    400           "answer": false,
    401           "justification": "The paper does not propose a new system to compare against baselines. It analyzes existing models on existing benchmarks.",
    402           "source": "opus"
    403         },
    404         "compute_budget_vs_performance": {
    405           "applies": false,
    406           "answer": false,
    407           "justification": "Compute budget differences are not relevant — the paper analyzes training dynamics, not proposing a more efficient method.",
    408           "source": "opus"
    409         },
    410         "benchmark_construct_validity": {
    411           "applies": true,
    412           "answer": true,
    413           "justification": "Section 4 explicitly discusses construct validity: 'if a task is solvable based on bigram statistics, it may indicate that the task may not have sufficient construct validity' and discusses the implications of BLiMP subtasks being solvable by 5-grams.",
    414           "source": "opus"
    415         },
    416         "scaffold_confound_addressed": {
    417           "applies": false,
    418           "answer": false,
    419           "justification": "No scaffolding is involved in this study.",
    420           "source": "opus"
    421         }
    422       },
    423       "data_leakage": {
    424         "temporal_leakage_addressed": {
    425           "applies": true,
    426           "answer": false,
    427           "justification": "No discussion of whether BIG-bench stimuli or the Bock and Cutting stimuli were available during training data collection for The Pile.",
    428           "source": "opus"
    429         },
    430         "feature_leakage_addressed": {
    431           "applies": true,
    432           "answer": false,
    433           "justification": "No discussion of whether the evaluation setup leaks information. The minimal pair design provides both correct and incorrect forms, but the paper does not discuss whether this introduces any evaluation artifacts.",
    434           "source": "opus"
    435         },
    436         "non_independence_addressed": {
    437           "applies": true,
    438           "answer": false,
    439           "justification": "No discussion of whether training data contains similar subject-verb agreement patterns to the test stimuli (which it almost certainly does, being drawn from English text).",
    440           "source": "opus"
    441         },
    442         "leakage_detection_method": {
    443           "applies": true,
    444           "answer": false,
    445           "justification": "No leakage detection or prevention method is used.",
    446           "source": "opus"
    447         }
    448       }
    449     }
    450   },
    451   "claims": [
    452     {
    453       "claim": "Models initially assign higher probability to more frequent verb forms (frequency-driven phase)",
    454       "evidence": "Figure 1A shows initial preference for 'is' over 'are' (1B shows plural preference for other verbs); explained by verb frequency in The Pile (Table 1: 'is' 2.5B vs 'are' 816M tokens)",
    455       "supported": "strong"
    456     },
    457     {
    458       "claim": "Models then become sensitive to preceding context, improving on less-frequent forms with matching attractors",
    459       "evidence": "Figure 1 shows sharp improvement in plural-with-plural-attractor condition around steps 128–512, suggesting local context sensitivity",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "Models exhibit agreement attraction effects (mismatching attractors cause errors) similar to humans",
    464       "evidence": "Singular-with-plural-attractor condition shows decreased accuracy at step 512+, matching psycholinguistic pattern reported in Bock & Cutting (1992)",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "Disaggregation reveals three distinct learning phases not visible in aggregate metrics",
    469       "evidence": "Aggregate black line (Figure 1) shows gradual increase, but disaggregated conditions reveal sharp phase transitions in steps 128–512 followed by continued improvement",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "Learning phases correspond to increasing context window sensitivity (unigram → bigram → trigram dependencies)",
    474       "evidence": "Authors propose this explanation noting multi-token verbs (requiring trigram sensitivity) show later phase transitions than single-token verbs; labeled as 'possible explanation' and 'question for future work'",
    475       "supported": "moderate"
    476     },
    477     {
    478       "claim": "Patterns generalize across model sizes (14M–410M parameters) and random seeds",
    479       "evidence": "Core patterns visible across all five model sizes in Figure 1; seed-level variation shown in Appendix C with consistent trends",
    480       "supported": "strong"
    481     },
    482     {
    483       "claim": "Smaller models show higher variability and delayed phase transitions",
    484       "evidence": "Figure 1 shows Pythia 14M improvements begin later and are more gradual; Appendix C shows more seed variability in smaller models",
    485       "supported": "strong"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval",
    490     "observational"
    491   ],
    492   "key_findings": "Disaggregating language model performance over experimental conditions reveals three sequential learning phases invisible in aggregate metrics: (1) frequency-driven (preference for high-frequency verb forms), (2) local context-driven (sensitivity to preceding noun), and (3) progressive generalization. These phases emerge in steps 128–512 of training and align with psycholinguistic patterns of agreement attraction in humans. The finding challenges the binary sudden-vs.-gradual learning debate by showing learning proceeds through 'hidden breakthroughs'—discrete phase transitions that compound into apparently gradual aggregate curves.",
    493   "red_flags": [
    494     {
    495       "flag": "No significance testing",
    496       "detail": "95% CIs provided but no formal statistical tests (t-tests, ANOVA) comparing phase differences or condition effects, making it unclear if observed differences are meaningful vs. noise"
    497     },
    498     {
    499       "flag": "Limited generalization scope",
    500       "detail": "Only English subject-verb agreement with prepositional phrase attractors; only PolyPythia models. Results may not transfer to other languages, linguistic structures, or model architectures"
    501     },
    502     {
    503       "flag": "Exploratory, not confirmatory",
    504       "detail": "Authors explicitly state the work is 'exploratory' and that it may be 'premature to draw strong conclusions.' Proposed n-gram explanation is speculative and flagged as 'question for future work'"
    505     },
    506     {
    507       "flag": "Single evaluation metric",
    508       "detail": "Only accuracy reported; no alternative metrics (e.g., calibration, rank probability, per-seed correlation distance) to validate phase consistency"
    509     },
    510     {
    511       "flag": "Anthropomorphic language without mechanistic evidence",
    512       "detail": "Statements like 'models appear to become sensitive' and 'models learn' lack mechanistic grounding; no attention analysis, gradient flow, or probing experiments"
    513     },
    514     {
    515       "flag": "Model availability constraint",
    516       "detail": "PolyPythia chosen because other suites lack sufficient early-stage checkpoints—acknowledged as limitation but may bias results to this specific training regime"
    517     },
    518     {
    519       "flag": "Missing cost/reproducibility details",
    520       "detail": "No environment specification (requirements.txt) or compute budget reported; only code release (link not verified)"
    521     },
    522     {
    523       "flag": "Contamination not addressed",
    524       "detail": "No explicit statement that BIG-bench/Bock & Cutting stimuli were not in Pile training data; assumed safe but unverified"
    525     }
    526   ],
    527   "cited_papers": [
    528     {
    529       "title": "Assessing the Ability of LSTMs to Learn Syntax-Sensitive Dependencies",
    530       "authors": "Linzen et al.",
    531       "year": 2016,
    532       "relevance": "Foundational work on syntax learning in RNNs; methodological inspiration for this study's linguistic evaluation approach"
    533     },
    534     {
    535       "title": "Targeted Syntactic Evaluation of Language Models",
    536       "authors": "Marvin & Linzen",
    537       "year": 2018,
    538       "relevance": "Introduces targeted minimal-pair evaluation; core baseline for agreement attraction experiments"
    539     },
    540     {
    541       "title": "Colorless Green Recurrent Networks Dream Hierarchically",
    542       "authors": "Gulordava et al.",
    543       "year": 2018,
    544       "relevance": "Large-scale agreement task evaluation; establishes agreement as diagnostic of syntactic learning"
    545     },
    546     {
    547       "title": "BLiMP: The Benchmark of Linguistic Minimal Pairs for English",
    548       "authors": "Warstadt et al.",
    549       "year": 2020,
    550       "relevance": "Primary benchmark used in study; establishes acceptability judgment evaluation paradigm"
    551     },
    552     {
    553       "title": "Hidden Breakthroughs in Language Model Training",
    554       "authors": "Kangaslahti et al.",
    555       "year": 2025,
    556       "relevance": "Directly cited for 'hidden breakthroughs' framing; concurrent work on phase structure in LM learning"
    557     },
    558     {
    559       "title": "Emergent Abilities of Large Language Models",
    560       "authors": "Wei et al.",
    561       "year": 2022,
    562       "relevance": "Key paper in 'sudden learning' debate; positions this work relative to emergence paradigm"
    563     },
    564     {
    565       "title": "Are Emergent Abilities of Large Language Models a Mirage?",
    566       "authors": "Schaeffer et al.",
    567       "year": 2023,
    568       "relevance": "Argues against sudden learning; this work's phase structure findings support gradual-with-transitions interpretation"
    569     },
    570     {
    571       "title": "Broken Agreement",
    572       "authors": "Bock & Miller",
    573       "year": 1991,
    574       "relevance": "Psycholinguistic foundation for agreement attraction in humans; motivates human-LM comparison"
    575     },
    576     {
    577       "title": "Characterizing Learning Curves During Language Model Pre-Training: Learning, Forgetting, and Stability",
    578       "authors": "Chang et al.",
    579       "year": 2024,
    580       "relevance": "Analyzes n-gram to higher-order pattern progression in LM training; supports proposed unigram→bigram→trigram explanation"
    581     }
    582   ],
    583   "engagement_factors": {
    584     "practical_relevance": {
    585       "score": 1,
    586       "justification": "The disaggregation methodology could inform how practitioners evaluate LM training, but requires significant adaptation beyond this narrow grammatical domain."
    587     },
    588     "surprise_contrarian": {
    589       "score": 2,
    590       "justification": "The main finding that smooth aggregate learning curves hide rapid non-monotonic phase transitions is counterintuitive and challenges the gradual-vs-sudden learning debate."
    591     },
    592     "fear_safety": {
    593       "score": 0,
    594       "justification": "No safety, security, or risk angle whatsoever."
    595     },
    596     "drama_conflict": {
    597       "score": 0,
    598       "justification": "No controversy, no company challenges, purely academic contribution to an ongoing scientific debate."
    599     },
    600     "demo_ability": {
    601       "score": 1,
    602       "justification": "Code is released on GitHub but requires setting up PolyPythia models and running evaluation scripts, not a quick try."
    603     },
    604     "brand_recognition": {
    605       "score": 1,
    606       "justification": "MIT and EleutherAI are recognized in ML circles but not household names; NeurIPS venue adds credibility but the topic is niche."
    607     }
    608   },
    609   "hn_data": {
    610     "threads": [
    611       {
    612         "hn_id": "45783837",
    613         "title": "Watermarking for Generative AI",
    614         "points": 17,
    615         "comments": 0,
    616         "url": "https://news.ycombinator.com/item?id=45783837",
    617         "created_at": "2025-11-01T18:04:10Z"
    618       }
    619     ],
    620     "top_points": 17,
    621     "total_points": 17,
    622     "total_comments": 0
    623   }
    624 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs