scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28859B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Language Model Behavioral Phases are Consistent Across Architecture, Training Data, and Scale",
      6     "authors": [
      7       "James A. Michaelov",
      8       "Roger P. Levy",
      9       "Benjamin Bergen"
     10     ],
     11     "year": 2025,
     12     "venue": "NeurIPS 2025",
     13     "arxiv_id": "2510.24963",
     14     "doi": "10.48550/arXiv.2510.24963"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims—consistent behavioral phases across architecture/data/scale, up to 98% variance explained by three heuristics, and the n-gram overfitting trajectory—are directly demonstrated in Figures 1-2 and the regression analyses of Experiments 1-2.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper makes causal-leaning claims such as 'the autoregressive language modeling task itself may be the largest factor—and perhaps the decisive one—in shaping the behavioral phases,' but the study design is purely observational with no controlled interventions to establish causality.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Claims are bounded to the tested conditions (3 architectures, 2 training corpora, 14M–12B parameters, English only); Section 6 (Limitations) explicitly notes the analysis is limited to three architectures with Mamba/RWKV evaluated only at small scale.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The paper explicitly discusses the confound that CC-derived semantic similarity may merely capture unigram frequency, and addresses whether n-gram correlations implicitly absorb the semantic similarity signal; Experiment 2 is specifically designed to disentangle these alternatives.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper measures LM log-probabilities directly and correlates them with n-gram log-probabilities and semantic similarity; the measured variable matches the claimed outcome with no proxy confusion.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "Section 6 is a dedicated Limitations section listing three specific limitations: restriction to three architectures, Mamba/RWKV tested only at small scale (~130–169M parameters on ~2B tokens), and restriction to n-grams up to n=5 with static word embeddings.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats are named with concrete numbers: Mamba and RWKV only at 130–169M parameters trained on ~2B tokens; n limited to {1,2,3,4,5}; static rather than contextual embeddings; regressions still leave substantial unexplained variance.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The paper explicitly states results may not generalize to other architectures, that higher-order n-grams and contextual embeddings were not tested, and that the unexplained variance remains significant even in the smallest models.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The Acknowledgments section discloses Andrew W. Mellon Foundation support (#2210-13947) for Michaelov; however, funding for co-authors Levy and Bergen is not mentioned, leaving disclosure incomplete.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations (MIT Department of Brain and Cognitive Sciences, MIT Libraries CREOS, UCSD Department of Cognitive Science) are listed on the title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The Andrew W. Mellon Foundation is an independent philanthropic organization with no commercial stake in findings about language model behavioral phases.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial interests declaration appears in the paper; the NeurIPS checklist references the Code of Ethics broadly but does not include an explicit financial interests statement.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are operationally defined: n-gram probability via Stupid Backoff with infini-gram (Appendix B), contextual semantic similarity as cosine similarity between static fastText embeddings (Appendix C), and 'behavioral phases' explicitly characterized as Phase 1/2/3 in Section 4.2.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section 1 lists four explicit numbered key contributions: (1) the Parc models with 1,314 checkpoints, (2) the NaWoCo evaluation dataset, (3) the heuristic explanation analysis, and (4) demonstration of consistent behavioral phases across architectures.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 (Related Work) explicitly situates the paper relative to prior work on n-gram-like and similarity-based prediction in LMs; the General Discussion (Section 5) thoroughly explains how findings extend or contrast with Chang et al. (2024) and Belrose et al. (2024).",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
    122           "justification": "The paper explicitly provides a GitHub repository (https://github.com/jmichaelov/lm-behavioral-phases) with 'all code, data, analyses, and models'; Parc model checkpoints are also released.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The NaWoCo dataset is released alongside code; the underlying training corpora (The Pile, OpenWebText, FineWeb) are publicly available; infini-gram indices for n-gram computation are described with build code provided.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": true,
    134           "justification": "The NeurIPS checklist (Q8) states 'Yes' that computational resource details including environment are provided in supplementary documentation; the public GitHub repository would contain dependency specifications.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": true,
    140           "justification": "The NeurIPS checklist (Q5) confirms 'Yes' with instructions for using the code in supplementary materials; the paper states 'We provide the full code for training and running the models' and links the GitHub repository.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Figure 1 explicitly shows 95% confidence intervals for all correlation coefficients across seeds; seed-level breakdowns with 95% CIs are provided in Sections F, I, and J of the appendix.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No formal statistical tests (p-values, t-tests, permutation tests) are reported for comparative claims across architectures or scales; the paper relies on visual inspection of overlapping/non-overlapping CIs and regression coefficient patterns.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "R² values (proportion of variance explained) are reported in Figure 2B as effect size measures for regression models; Pearson r correlations throughout the paper are directly interpretable as effect sizes.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The NaWoCo dataset size (77,999 training, 39,474 validation, 40,980 test items) is described as an outcome of filtering procedures but no power analysis or principled sample size justification is provided.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": true,
    172           "justification": "95% confidence intervals across seeds are shown in all main figures; Appendix G provides per-seed Pearson correlation matrices showing variance across seeds at each checkpoint.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Multiple heuristics (1-gram through 5-gram log-probabilities, multiple semantic similarity variants) serve as comparison conditions; matched vs. unmatched n-gram corpus conditions act as controls for corpus specificity.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Models include Pythia (2023), Mamba-1 (2024), RWKV-4 (2023), and PolyPythia (2024)—all recent; Open-GPT2 (2021) is included for historical continuity alongside newer models.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Experiment 2 ablates each heuristic's independent contribution via multiple linear regression; matched vs. unmatched n-gram conditions test whether results depend on corpus identity; SGPT-weighted vs. uniform similarity tests the weighting method's influence.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "The paper uses Pearson correlation, Spearman correlation, linear regression R² and coefficients, perplexity on 7 Paloma benchmark subsets, and accuracy on 5 downstream benchmarks (LAMBADA, SciQ, ARC-Easy, SWAG, BLiMP).",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "This study measures LM log-probabilities computationally on text corpora; no human evaluation of system outputs is required or performed.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "NaWoCo is split into training (77,999), validation (39,474), and test (40,980) sets; Figure 2B reports regression R² on the held-out validation set alongside training set R² to verify no overfitting.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Results are broken down by architecture, model size (14M–12B), training dataset, and random seed; both Pearson and Spearman correlations are provided separately; Appendix G provides cross-architecture correlation matrices at each training checkpoint.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "The paper identifies the outlying checkpoint (step 256,000 of the 'beren' Open-GPT2 345M seed) driving large CIs, and the larger early-training variability in PolyPythia smaller models, as edge cases requiring explanation.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper reports that SGPT-weighted vs. unweighted similarity produces virtually identical coefficient patterns (no meaningful advantage), and that matched vs. unmatched n-gram corpus choice does not dramatically change any findings.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Exact model versions are specified: Pythia 14M–12B with PolyPythia seeds (van der Wal et al., 2024), Open-GPT2 117M and 345M with specific seed IDs (49, 81, 343, 777), and Parc models with full training details in Appendix A.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": false,
    241           "answer": false,
    242           "justification": "This study evaluates autoregressive LM log-probabilities for words in natural context, not prompt-based outputs; no prompting is used.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Parc model training hyperparameters are reported (1024-token sequences, batch size 512, 4000 steps, 0.5M tokens/step); Stupid Backoff uses α=0.4 per Appendix B; fastText embedding variants and SGPT weighting are detailed in Appendix C.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is involved; the study evaluates base LM log-probabilities directly.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Appendix D provides detailed NaWoCo construction steps: FineWeb extraction, filtering criteria (>5 words, capitalized first word only, toxicity ≤0.1), overlap removal via infini-gram zero-count criterion, random critical word selection at position ≥5, and single-token-for-all-models vocabulary filtering.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": true,
    268           "justification": "The NaWoCo dataset is released alongside code and trained Parc model checkpoints via GitHub; underlying corpora (The Pile, OpenWebText, FineWeb) are publicly available.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Appendix D describes the complete NaWoCo collection pipeline: source corpus (FineWeb), all filtering criteria with specific thresholds, split sizes, critical word selection method, and vocabulary filtering procedure.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants were recruited; the study uses only existing, publicly available text corpora.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The full pipeline is documented: FineWeb extraction → multi-criterion filtering → infini-gram overlap check → split assignment → critical word selection → token vocabulary filtering; infini-gram index construction for n-gram probabilities is fully described in Appendix B with code provided.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": true,
    294           "justification": "Training data for all models is known and fully controlled: The Pile for Pythia (300B tokens), OpenWebText for Open-GPT2 and Parc models; the authors trained the Parc models themselves so training data boundaries are exact.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": true,
    300           "justification": "The paper explicitly addresses this concern: NaWoCo sentences are verified not to appear in either OpenWebText or The Pile using infini-gram zero-count as the exclusion criterion (Section 3.1.4).",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": true,
    306           "justification": "The evaluation dataset (NaWoCo) was constructed from FineWeb, filtered to exclude any sentences appearing in the training corpora of the analyzed models, directly and explicitly addressing contamination concerns.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study; NeurIPS checklist confirms NA.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "Inference cost and latency for running 1,418 model checkpoints across ~158,000 evaluation items are not reported in the main paper; the NeurIPS checklist says compute details are only in supplementary documentation.",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": true,
    364           "justification": "The NeurIPS checklist (Q8) answers 'Yes' that computational resource details (type of compute, memory, execution time) are provided in the documentation supplied in supplementary materials.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Three simple heuristics (unigram probability, n-gram probability, semantic similarity) explain up to 98% of the variance in language model log-probabilities at the word level",
    373       "evidence": "Regression R² values in Figure 2B reach 0.86–0.98 at the peak of the unigram effect and remain above 0.5 for most of training even in the largest models; validation set R² matches training set R² closely",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "All autoregressive LMs (Transformer, Mamba, RWKV) exhibit consistent three behavioral phases during pretraining regardless of architecture",
    378       "evidence": "Figures 1 and 2 show identical phase patterns for Parc-Pythia, Parc-Mamba, and Parc-RWKV trained in parallel on the same data; Appendix G shows cross-architecture Pearson r≥0.93 at each step ≥80",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Language models overfit to n-grams of increasing n over the course of pretraining, with larger models showing greater decrease in correlation with lower-order n-grams",
    383       "evidence": "Figure 1 shows successive correlation peaks with 1-gram through 5-gram log-probabilities as training progresses; regression coefficients in Figure 2A show the 1-gram coefficient drops more sharply for larger Pythia models",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Semantic similarity correlates with LM predictions independently of and above and beyond n-gram probability",
    388       "evidence": "Regression coefficients for semantic similarity remain positive after controlling for unigram and 5-gram log-probability (Experiment 2, Figure 2A), consistent across matched and unmatched n-gram corpus conditions",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Phase timing differs between training-token-matched and training-step-matched comparisons, suggesting steps rather than tokens drive phase onset",
    393       "evidence": "OpenWebText-trained models (0.5M tokens/step) enter phases in fewer steps than Pythia models (2M tokens/step) despite different token counts; discussed in Section 4.2 but not formally tested",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "The autoregressive language modeling task itself may be the decisive factor shaping behavioral phases irrespective of model architecture",
    398       "evidence": "Cross-architecture consistency is the primary evidence cited, but this is an interpretive inference from correlation patterns; no controlled intervention separates the task from architectural or initialization effects",
    399       "supported": "weak"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "observational",
    404     "benchmark-eval"
    405   ],
    406   "key_findings": "Across 1,418 model checkpoints spanning Transformer, Mamba, and RWKV architectures, two training corpora (OpenWebText, The Pile), and 14M–12B parameters, autoregressive LMs exhibit consistent three-phase behavioral patterns during pretraining: initial alignment with unigram frequency, progressive alignment with higher-order n-grams, then stabilization. Three simple heuristics explain up to 98% of the variance in word-level log-probabilities (regression R² peaks at 0.86–0.98), and the phase pattern is remarkably stable across random seeds and architectures (cross-architecture r≥0.93 from step 80 onward). Semantic similarity contributes to LM predictions independently of n-gram probability after accounting for collinearity. The consistency of phases across fundamentally different architectures suggests the autoregressive training objective itself, rather than architectural choices, drives these learning dynamics.",
    407   "red_flags": [
    408     {
    409       "flag": "Architecture scope overstated in title",
    410       "detail": "The title claims consistency 'Across Architecture' but only 3 architectures are tested; Mamba and RWKV are evaluated only at 130–169M parameters trained on ~2B tokens, far smaller than the 12B Transformer comparator, making scale-matched cross-architecture comparison impossible."
    411     },
    412     {
    413       "flag": "No formal significance tests for comparative claims",
    414       "detail": "All cross-architecture and cross-scale comparative claims rely on visual inspection of overlapping/non-overlapping confidence intervals in figures; no formal hypothesis tests with p-values are reported."
    415     },
    416     {
    417       "flag": "Causal inference from correlational design",
    418       "detail": "The conclusion that 'the autoregressive language modeling task itself may be the decisive factor' is a causal claim unsupported by the purely observational design; architecture and training data were not independently experimentally manipulated."
    419     },
    420     {
    421       "flag": "English-only scope not prominently bounded",
    422       "detail": "All models and the NaWoCo evaluation dataset are English-only; the paper does not discuss whether behavioral phases generalize to multilingual or non-English models, despite broadly framed conclusions about 'neural language models.'"
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling",
    428       "relevance": "Core model suite providing the majority of scaling analysis checkpoints (14M–12B parameters); PolyPythia seeds used for seed-level variance analysis"
    429     },
    430     {
    431       "title": "Characterizing Learning Curves During Language Model Pre-Training: Learning, Forgetting, and Stability",
    432       "relevance": "Direct predecessor work on n-gram correlation trajectories during LM pretraining that this paper explicitly extends to non-transformer architectures and additional scales"
    433     },
    434     {
    435       "title": "Neural Networks Learn Statistics of Increasing Complexity",
    436       "relevance": "Related work showing KL divergence between Pythia models and n-gram models follows a consistent training trajectory, directly corroborating the paper's interpretation"
    437     },
    438     {
    439       "title": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces",
    440       "relevance": "Architecture used to train the Parc-Mamba models; critical for the cross-architecture comparison"
    441     },
    442     {
    443       "title": "RWKV: Reinventing RNNs for the Transformer Era",
    444       "relevance": "Architecture used to train the Parc-RWKV models; critical for testing whether n-gram phase patterns extend beyond transformers"
    445     },
    446     {
    447       "title": "Infini-gram: Scaling Unbounded n-gram Language Models to a Trillion Tokens",
    448       "relevance": "Core tool used for efficient n-gram probability calculation and for detecting training data overlap in NaWoCo construction"
    449     },
    450     {
    451       "title": "The FineWeb Datasets: Decanting the Web for the Finest Text Data at Scale",
    452       "relevance": "Source corpus for constructing the NaWoCo evaluation dataset"
    453     },
    454     {
    455       "title": "PolyPythias: Stability and Outliers across Fifty Language Model Pre-Training Runs",
    456       "relevance": "Provides additional Pythia training seeds (PolyPythia) used in the seed-level variance and outlier analyses"
    457     },
    458     {
    459       "title": "Strong Prediction: Language Model Surprisal Explains Multiple N400 Effects",
    460       "relevance": "Prior work by first author establishing the semantic similarity–LM surprisal relationship that motivates Experiment 1's semantic similarity analysis"
    461     },
    462     {
    463       "title": "Language Model Behavior: A Comprehensive Survey",
    464       "relevance": "Provides the broader framework for understanding LM behavioral properties that motivates the study's three-heuristic characterization approach"
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 1,
    470       "justification": "Understanding training phases may inform early stopping or curriculum design, but no direct practitioner tools or immediate applications are provided."
    471     },
    472     "surprise_contrarian": {
    473       "score": 2,
    474       "justification": "The finding that 3 simple heuristics explain up to 98% of LM prediction variance, and that behavioral phases are near-identical across architectures as different as Transformers and Mamba, is genuinely surprising and challenges assumptions about architecture importance."
    475     },
    476     "fear_safety": {
    477       "score": 0,
    478       "justification": "No AI safety, risk, or alignment concerns are raised; this is basic science on LM learning dynamics with no dual-use implications."
    479     },
    480     "drama_conflict": {
    481       "score": 0,
    482       "justification": "No controversy; findings are incremental extensions of established prior work with no contested claims about existing methods."
    483     },
    484     "demo_ability": {
    485       "score": 1,
    486       "justification": "Code, models, and data are released at GitHub, but reproducing the analysis requires substantial compute to run 1,418 model checkpoints over 158,000 evaluation items."
    487     },
    488     "brand_recognition": {
    489       "score": 1,
    490       "justification": "MIT and UCSD are well-recognized research institutions; the Pythia model suite from EleutherAI is widely known in the LLM research community, lending some recognition to the work."
    491     }
    492   },
    493   "hn_data": {
    494     "threads": [],
    495     "top_points": 0,
    496     "total_points": 0,
    497     "total_comments": 0
    498   }
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs