scan.json (28417B)
1 { 2 "paper": { 3 "title": "Language Model Behavioral Phases are Consistent Across Architecture, Training Data, and Scale", 4 "authors": ["James A. Michaelov", "Roger P. Levy", "Benjamin K. Bergen"], 5 "year": 2025, 6 "venue": "NeurIPS 2025", 7 "arxiv_id": "2510.24963", 8 "doi": "10.48550/arXiv.2510.24963" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor"], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "Section 3.1 states 'All code, data, analyses, and models are provided in the following repository: https://github.com/jmichaelov/lm-behavioral-phases'. The NeurIPS checklist also confirms code and data are provided in supplementary materials." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The NaWoCo evaluation dataset is released as part of the repository. The paper also uses publicly available datasets (The Pile, OpenWebText, FineWeb). Trained Parc models with checkpoints are also released." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": true, 27 "justification": "The NeurIPS checklist (item 8) states that computational resource details are provided in the supplementary materials documentation. The paper mentions providing 'full code for training and running the models' which would include environment specifications." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": true, 32 "justification": "NeurIPS checklist item 5 confirms 'instructions contain the exact command and environment needed to run to reproduce the results.' The paper provides full code for building infini-gram indices, calculating probabilities, and training models." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": true, 39 "justification": "Figure 1 shows '95% confidence intervals' across seeds for all models. Figure 2 also shows confidence intervals. 
Seed-level analyses in Sections F, I, J provide further detail." 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper reports Pearson and Spearman correlations and R² values but does not perform formal significance tests (e.g., testing whether correlations or regression coefficients differ significantly from zero or from each other). Comparisons between conditions are made by visual inspection of plots." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "Pearson correlation coefficients (r) and R² values are reported throughout, providing effect size measures. E.g., 'R² = 0.86–0.98' and cross-architecture correlation 'r ≥ 0.93' (Section 3.2, Section G)." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "The evaluation dataset sizes (77,999 training, 39,474 validation, 40,980 test items) are stated but not justified. No power analysis or justification for why these specific sizes were chosen." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Results are reported across multiple seeds (4, 6, or 10 seeds depending on model family) with 95% confidence intervals. Seed-level results are provided in Sections F, I, J. The paper explicitly notes variance across seeds, e.g., 'the confidence intervals are virtually invisible for the most part' (Section 3.2)." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "The three heuristics (unigram, n-gram, semantic similarity) serve as baselines for explaining LM behavior. Multiple model families (Pythia, Open-GPT2, Parc) serve as comparisons to each other. The paper also compares against prior work by Chang et al. (2024)." 
67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "Uses contemporary model suites: Pythia (2023), PolyPythia (2024), Mamba-1 (2024), RWKV-4 (2023). Compares against recent work by Chang et al. (2024), Belrose et al. (2024), Nguyen (2024)." 72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": true, 76 "justification": "Experiment 2 serves as an ablation: the regression analysis dissects contributions of individual heuristics (unigram vs. 5-gram vs. semantic similarity), testing matched vs. unmatched n-gram corpora and different similarity metrics (Wikipedia vs. Common Crawl, weighted vs. unweighted)." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "Uses Pearson correlation, Spearman correlation (Section E), R² on training and validation sets, regression coefficients, and benchmark performance metrics (perplexity bits-per-byte, accuracy on 5 benchmarks in Section L)." 82 }, 83 "human_evaluation": { 84 "applies": false, 85 "answer": false, 86 "justification": "Human evaluation is not relevant to the claims. The paper studies statistical relationships between LM predictions and simple heuristics — this is entirely quantitative/automated." 87 }, 88 "held_out_test_set": { 89 "applies": true, 90 "answer": true, 91 "justification": "The NaWoCo dataset has explicit train/validation/test splits (77,999 / 39,474 / 40,980). Section 4.2 reports R² on the held-out validation set to verify robustness: 'we see almost no difference, suggesting that these results are robust.'" 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results are broken down per model size (14M to 12B), per architecture (Transformer, Mamba, RWKV), per training dataset (OpenWebText, The Pile), per seed, and per similarity metric variant. Each model/seed combination is shown separately." 
97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper discusses where its approach falls short: the R² decreases for larger models in later training phases (Section 4.2), indicating unexplained variance. The limitations section notes the heuristics 'still do not account for all the variance in language model behavior.' An outlying checkpoint (Open-GPT2 345M beren seed) is also discussed." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper reports that R² decreases substantially for larger models in later training (e.g., falls below 0.5 for largest Pythia models), that 5-gram coefficients can go slightly negative during Phase 1, and that the semantic similarity metric choice matters (Common Crawl vs. Wikipedia show different patterns due to confounding with unigram frequency)." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Abstract claims: (1) consistent behavioral phases across architectures — supported by Figures 1-2 showing identical patterns for Transformer, Mamba, RWKV; (2) up to 98% variance explained — supported by R² values in Figure 2B; (3) overfitting to n-grams of increasing n — supported by correlation peaks in Figure 1. All claims have corresponding evidence." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": false, 118 "justification": "The paper uses language suggesting causal mechanisms: 'learning in neural language models may follow a similar trajectory' and 'the autoregressive language modeling task itself may be the largest factor—and perhaps the decisive one—in shaping the behavioral phases.' These are causal claims based on correlational evidence (regression analysis). The study design (observational analysis of training dynamics) cannot establish causation. 
The paper does hedge with 'may' but the framing is causal." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": true, 123 "justification": "The limitations section (Section 6) explicitly bounds generalizations: 'our analysis is limited to three architectures,' 'only relatively small models (130–169M)' for non-Transformer architectures, 'we also limit our analysis to n-grams in n ∈ {1, 2, 3, 4, 5} and static word embeddings.' The title makes a broad claim but the paper content bounds it appropriately." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": true, 128 "justification": "Section 4 discusses alternative explanations for the correlation patterns: Common Crawl semantic similarity may be capturing unigram probability rather than true semantic similarity (Section 4, paragraph 1). Section 5 discusses whether CBOW vs. skip-gram training explains differences between embedding types, and whether the residual variance reflects 'more complex cues.'" 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper is precise about what it measures (log-probability correlations with heuristics) and what it claims (behavioral phases). It explicitly acknowledges that 'there is still a lot to be understood, even in the smallest models' (Section 6) and that correlation is not mechanism. Claims match measurement granularity." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "Specific model versions are provided: Pythia models (14M–12B) with seed numbers, PolyPythia seeds, Open-GPT2 models with specific seeds (49, 81, 343, 777), Parc models with exact parameter counts (160M, 130M, 169M). Table 1 provides comprehensive details. Known issues with specific models are cited (fp16 precision, Open-GPT2 seed 21)." 
141 }, 142 "prompts_provided": { 143 "applies": false, 144 "answer": false, 145 "justification": "No prompting is used. The paper evaluates language models' next-word prediction probabilities directly, without any prompt engineering." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Training hyperparameters for Parc models are stated: 1024-token sequences, batch size 512, 4000 steps, 0.5M tokens per step. Stupid Backoff parameter α = 0.4 is stated. Precision (fp32) is specified. Section A notes hyperparameters 'were chosen to match those of the original model training.'" 151 }, 152 "scaffolding_described": { 153 "applies": false, 154 "answer": false, 155 "justification": "No agentic scaffolding is used. The paper evaluates base language models' prediction probabilities." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section D (Appendix) provides detailed NaWoCo construction: extraction from FineWeb with specific filtering criteria (>5 words, capitalization rules, toxicity threshold ≤0.1, infini-gram decontamination, single-token constraint). Section B documents n-gram probability estimation. Section C documents similarity calculation." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 6 is a dedicated 'Limitations' section discussing three specific limitations: limited architectures, limited n-gram range, and unexplained variance." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "Section 6 provides specific threats: Mamba and RWKV models are 'only relatively small models (130–169M models trained on ~2B tokens),' analysis limited to 'n-grams in n ∈ {1,2,3,4,5} and static word embeddings,' and 'regressions still do not account for all the variance.' 
These are specific to this study, not generic." 173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 6 explicitly states what was NOT tested: other architectures, larger non-Transformer models, higher-order n-grams, contextual embeddings. The paper also hedges conclusions with 'may' throughout Section 5." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "The paper releases the NaWoCo dataset, all model checkpoints (Parc models), all code, and all analysis scripts. Raw log-probabilities can be regenerated from the released models and evaluation code." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section D describes NaWoCo construction in detail: source corpus (FineWeb), filtering criteria (sentence length, capitalization, toxicity, decontamination, single-token constraint), and split sizes. N-gram collection is described in Section B (infini-gram counts with Stupid Backoff)." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants. Data sources are standard public corpora (FineWeb, OpenWebText, The Pile) and publicly available pretrained models." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section D documents the filtering pipeline: 250,000 sentences extracted → filtered for toxicity, decontamination, capitalization → 100,000/50,000/50,000 split → word selection → re-filtering → single-token constraint → final counts (77,999/39,474/40,980). Each filtering step and criterion is explained." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": true, 206 "justification": "Acknowledgments section states: 'James Michaelov was supported by a grant from the Andrew W. 
Mellon foundation (#2210-13947) during the writing of this paper.'" 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations are clearly listed: MIT (Brain and Cognitive Sciences, Libraries CREOS) and UCSD (Cognitive Science). No commercial affiliations with evaluated products." 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": true, 216 "justification": "The Andrew W. Mellon Foundation is an independent philanthropic foundation with no financial interest in language model behavioral dynamics. The funder has no stake in the results." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests or financial interests statement is present in the paper. While the academic affiliations suggest no obvious conflicts, the absence of an explicit declaration means this criterion is not met." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": true, 228 "justification": "Training data is fully specified: Pythia models trained on The Pile, Open-GPT2 and Parc models trained on OpenWebText. Since all models are trained by the authors or on known fixed datasets, the exact training data is known (not a black-box API model with unknown cutoff)." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": true, 233 "justification": "Section 3.1.4 explicitly addresses this: NaWoCo sentences 'are not in the training data of any of the language models tested (based on infini-gram counts).' Section D confirms decontamination: sentences not in OpenWebText or Pile corpora." 
234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": true, 238 "justification": "The evaluation dataset was specifically constructed from FineWeb with explicit decontamination against both training corpora (OpenWebText and The Pile) using infini-gram counts. This directly addresses contamination by construction." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this study." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "No inference cost or latency reported for running 1,418 model checkpoints on 110,000+ tokens. The NeurIPS checklist mentions compute details in supplementary materials but the paper itself does not quantify costs." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": false, 287 "justification": "The paper does not state total GPU hours or computational budget for training 18 Parc models or evaluating 1,418 checkpoints. 
The NeurIPS checklist references supplementary materials for compute details but no numbers appear in the paper text." 288 } 289 }, 290 "experimental_rigor": { 291 "seed_sensitivity_reported": { 292 "applies": true, 293 "answer": true, 294 "justification": "Results are reported across multiple seeds throughout: 6 seeds for Parc models, 10 seeds for small Pythia/PolyPythia, 4 seeds for Open-GPT2. Seed-level analyses are provided in Sections F, I, J. The paper explicitly discusses seed sensitivity: 'the patterns across random seeds are remarkably similar' (Section 3.2)." 295 }, 296 "number_of_runs_stated": { 297 "applies": true, 298 "answer": true, 299 "justification": "Table 1 explicitly states the number of seeds for each model. Total of 1,418 model instances is stated in Section 3.1.1. Each analysis reports per-seed and aggregated results." 300 }, 301 "hyperparameter_search_budget": { 302 "applies": true, 303 "answer": false, 304 "justification": "For Parc model training, hyperparameters were 'chosen to match those of the original model training where possible' (Section A.2) but no search budget is reported. For regression analysis, no hyperparameter search was needed (closed-form linear regression)." 305 }, 306 "best_config_selection_justified": { 307 "applies": true, 308 "answer": true, 309 "justification": "The paper does not select a 'best' configuration — it reports all configurations exhaustively (all seeds, all architectures, all similarity metrics, matched and unmatched n-grams). This comprehensive reporting avoids cherry-picking." 310 }, 311 "multiple_comparison_correction": { 312 "applies": false, 313 "answer": false, 314 "justification": "The paper does not perform formal hypothesis tests, so multiple comparison correction is not applicable. The analysis is descriptive (correlations, regressions) rather than hypothesis-testing." 
315 }, 316 "self_comparison_bias_addressed": { 317 "applies": true, 318 "answer": false, 319 "justification": "The authors train Parc models and evaluate them, but do not explicitly discuss author-evaluation bias. However, the analysis is correlational rather than competitive (no claim that their models outperform others), which somewhat reduces this concern." 320 }, 321 "compute_budget_vs_performance": { 322 "applies": false, 323 "answer": false, 324 "justification": "The paper does not propose a method that competes on performance, so compute-vs-performance comparison is not relevant. The analysis is about characterizing existing model behavior." 325 }, 326 "benchmark_construct_validity": { 327 "applies": true, 328 "answer": true, 329 "justification": "The paper discusses construct validity of NaWoCo: it was designed to measure word-level prediction in natural contexts, with careful decontamination and filtering. Section 4 discusses whether heuristic correlations reflect genuine behavioral facets vs. confounds (e.g., Common Crawl similarity confounded with frequency)." 330 }, 331 "scaffold_confound_addressed": { 332 "applies": false, 333 "answer": false, 334 "justification": "No scaffolding is used. Models are evaluated directly on next-word prediction probabilities." 335 } 336 } 337 }, 338 "claims": [ 339 { 340 "claim": "Up to 98% of the variance in language model behavior at the word level can be explained by three simple heuristics: unigram probability, n-gram probability, and semantic similarity.", 341 "evidence": "Figure 2B shows R² values reaching 0.86–0.98 at peak, with validation set R² nearly identical. This holds across all model families tested.", 342 "supported": "strong" 343 }, 344 { 345 "claim": "All language models tested exhibit consistent behavioral phases, with predicted probabilities overfitting to n-gram probabilities of increasing n over the course of training.", 346 "evidence": "Figure 1 shows consistent peak ordering (unigram → bigram → ... 
→ 5-gram) across Pythia (14M–12B), Open-GPT2, Parc-Pythia, Parc-Mamba, and Parc-RWKV. Confirmed in both Pearson (Figure 1) and Spearman (Figure 3) correlations.", 347 "supported": "strong" 348 }, 349 { 350 "claim": "These behavioral phases are consistent across architecture (Transformer vs. Mamba vs. RWKV), training data (OpenWebText vs. The Pile), and scale (14M to 12B parameters).", 351 "evidence": "Cross-architecture correlations r ≥ 0.93 at step ≥80 (Section G, Figure 6). Same patterns visible for Pythia (The Pile) and Open-GPT2/Parc (OpenWebText). Patterns hold from 14M to 12B parameters, with timing differences but consistent phase structure.", 352 "supported": "strong" 353 }, 354 { 355 "claim": "Semantic similarity shows a dissociable effect on language model predictions above and beyond n-gram probability.", 356 "evidence": "Experiment 2 regression coefficients (Figure 2A) show semantic similarity maintains a positive coefficient after controlling for unigram and 5-gram log-probability throughout training. This holds for both Wikipedia and Common Crawl embeddings.", 357 "supported": "strong" 358 }, 359 { 360 "claim": "Larger models shift further from lower-order n-gram predictions than smaller models.", 361 "evidence": "Section 3.2 and 4.2: 'larger models also see a greater decrease in the correlation to smaller n-grams.' Regression coefficients show larger Pythia models have smaller unigram coefficients and larger 5-gram coefficients in later training phases.", 362 "supported": "strong" 363 } 364 ], 365 "methodology_tags": ["benchmark-eval"], 366 "key_findings": "Across 1,418 language model checkpoints spanning three architectures (Transformer, Mamba, RWKV), two training datasets, and three orders of magnitude in scale (14M–12B parameters), autoregressive language models exhibit remarkably consistent behavioral phases during pretraining. 
These phases involve progressive overfitting to n-grams of increasing order, with three simple heuristics (word frequency, n-gram probability, semantic similarity) explaining up to 98% of variance in model predictions. Larger models shift further from lower-order n-gram predictions, suggesting greater capacity for complex pattern learning. The consistency across architectures suggests these phases are driven by the autoregressive language modeling objective itself rather than architectural details.", 367 "red_flags": [], 368 "cited_papers": [ 369 { 370 "title": "Pythia: A Suite for Analyzing Large Language Models Across Training and Scaling", 371 "authors": ["Stella Biderman", "Hailey Schoelkopf", "Quentin G. Anthony"], 372 "year": 2023, 373 "relevance": "Major open model suite used extensively in this study; provides checkpointed models for training dynamics analysis." 374 }, 375 { 376 "title": "Emergent Abilities of Large Language Models", 377 "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"], 378 "year": 2022, 379 "relevance": "Foundational work on emergent behaviors in LLMs; this paper provides evidence that behavioral phases precede emergence." 380 }, 381 { 382 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 383 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 384 "year": 2023, 385 "relevance": "Challenges emergence claims; relevant to understanding whether LM behavioral changes are gradual or sudden." 386 }, 387 { 388 "title": "Scaling Laws for Neural Language Models", 389 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 390 "year": 2020, 391 "arxiv_id": "2001.08361", 392 "relevance": "Foundational scaling laws work; this paper adds behavioral phase dynamics to the scaling picture." 393 }, 394 { 395 "title": "Characterizing Learning Curves During Language Model Pre-Training: Learning, Forgetting, and Stability", 396 "authors": ["Tyler A. Chang", "Zhuowen Tu", "Benjamin K. 
Bergen"], 397 "year": 2024, 398 "relevance": "Direct predecessor showing n-gram phase patterns in GPT-2; this paper extends to multiple architectures and scales." 399 }, 400 { 401 "title": "Mamba: Linear-Time Sequence Modeling with Selective State Spaces", 402 "authors": ["Albert Gu", "Tri Dao"], 403 "year": 2024, 404 "relevance": "State-space model architecture evaluated in this study; first checkpointed Mamba models released." 405 }, 406 { 407 "title": "RWKV: Reinventing RNNs for the Transformer Era", 408 "authors": ["Bo Peng", "Eric Alcaide", "Quentin Anthony"], 409 "year": 2023, 410 "relevance": "Modern RNN architecture evaluated in this study; first checkpointed RWKV models released." 411 }, 412 { 413 "title": "In-context Learning and Induction Heads", 414 "authors": ["Catherine Olsson", "Nelson Elhage", "Neel Nanda"], 415 "year": 2022, 416 "arxiv_id": "2209.11895", 417 "relevance": "Mechanistic interpretability work on induction heads; this paper connects behavioral phases to in-context learning capabilities." 418 }, 419 { 420 "title": "Neural Networks Learn Statistics of Increasing Complexity", 421 "authors": ["Nora Belrose", "Quintin Pope", "Lucia Quirke"], 422 "year": 2024, 423 "relevance": "Shows KL divergence patterns between LM outputs and n-gram distributions; complementary to this paper's behavioral phase analysis." 424 }, 425 { 426 "title": "Understanding Transformers via N-Gram Statistics", 427 "authors": ["Timothy Nguyen"], 428 "year": 2024, 429 "relevance": "Shows n-gram rules can predict LM outputs with 68-79% accuracy; complements the variance-explained findings here." 430 }, 431 { 432 "title": "Embers of autoregression show how large language models are shaped by the problem they are trained to solve", 433 "authors": ["R. Thomas McCoy", "Shunyu Yao", "Dan Friedman"], 434 "year": 2024, 435 "relevance": "Demonstrates how autoregressive training objective shapes LM behavior; directly relevant to understanding behavioral biases." 
436 }, 437 { 438 "title": "Inverse Scaling: When Bigger Isn't Better", 439 "authors": ["Ian R. McKenzie"], 440 "year": 2023, 441 "relevance": "Documents cases where larger models perform worse; related to this paper's analysis of how reliance on simple heuristics changes with model scale." 442 } 443 ] 444 }