scan.json (28000B)
1 { 2 "paper": { 3 "title": "Are Emergent Abilities of Large Language Models a Mirage?", 4 "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"], 5 "year": 2023, 6 "venue": "NeurIPS 2023", 7 "arxiv_id": "2304.15004" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage", "survey_methodology"], 11 "methodology_tags": ["meta-analysis", "benchmark-eval", "theoretical"], 12 "key_findings": "The paper demonstrates that claimed emergent abilities of LLMs are likely artifacts of metric choice rather than fundamental phase transitions in model behavior. Over 92% of claimed emergent abilities on BIG-Bench appear under just two metrics (Multiple Choice Grade and Exact String Match), both of which nonlinearly or discontinuously scale per-token error rates. Switching to linear or continuous metrics (Token Edit Distance, Brier Score) reveals smooth, predictable performance improvement. The authors further show that seemingly emergent abilities can be artificially induced in vision tasks by choosing appropriately sharp metrics.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL or code archive is mentioned in the paper. The paper is listed as 'Preprint. Under review' with no link to reproduction code." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper uses publicly available data: BIG-Bench is public, GPT-3/InstructGPT outputs were collected via a public API, and the hand-annotated emergent abilities list from Wei (2022) is public. CIFAR100, MNIST, and Omniglot are standard public datasets." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment specifications, requirements files, or dependency details are provided." 
29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided. The experimental setup is described in prose but no scripts or detailed reproduction steps are given." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper reports point estimates for metrics across model scales. No confidence intervals or error bars are shown on the main figures (Figs. 2-8)." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes comparative claims (e.g., emergent abilities disappear with different metrics) but does not use any statistical significance tests. Comparisons are made visually via plots." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports concrete effect sizes: '>92% of emergent abilities on BIG-Bench appear under either of these two metrics' (Sec. 1), and provides quantitative breakdowns of which metrics produce emergence across all 39 BIG-Bench metrics (Fig. 5)." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification is given for the number of tasks analyzed, the number of test examples generated for GPT-3 experiments, or the model family sizes chosen for vision experiments." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance or standard deviation is reported across experimental runs for any of the three analysis tracks (GPT-3, BIG-Bench meta-analysis, vision experiments)." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper directly compares against the original emergent abilities claims (Wei et al. 2022, Ganguli et al. 2022, Srivastava et al. 
2022), using the same model families and tasks but with different metrics." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The baselines are the original emergent abilities papers (2022), which are the most recent and relevant prior work on this topic." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The core methodology is effectively an ablation: holding model outputs fixed while varying the metric (Accuracy vs Token Edit Distance, Multiple Choice Grade vs Brier Score) to isolate the metric's contribution to the emergence phenomenon." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper explicitly uses multiple metrics (Accuracy, Token Edit Distance, Multiple Choice Grade, Brier Score, cross-entropy, Reconstruction_c) and shows how results change across them." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "Human evaluation is not relevant to the claims, which are about mathematical properties of metrics applied to model outputs." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "For GPT-3 experiments, the authors generate new test data for arithmetic tasks. For BIG-Bench, standard test splits are used. For vision experiments, standard test sets (CIFAR100, MNIST, Omniglot) are used." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Fig. 5 provides per-metric breakdowns across all 39 BIG-Bench metrics. Figs. 3-4 break down by target string length (1-5 digits). Fig. 6 shows per-task LaMDA results." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "The paper does not discuss cases where changing the metric did NOT remove the emergent ability, or tasks where the alternative explanation might not fully apply." 
103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": false, 107 "justification": "Every experiment confirms the authors' hypothesis. No failed attempts, counterexamples, or cases where the alternative explanation was insufficient are reported." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims that emergent abilities appear due to metric choice, and the paper provides three lines of evidence: GPT-3 arithmetic (Sec. 3), BIG-Bench meta-analysis (Sec. 4), and induced emergence in vision (Sec. 5), all supporting this claim." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper claims metric choice 'creates' emergent abilities. This causal claim is justified by a controlled manipulation: holding model outputs fixed and varying only the metric, which is a valid single-variable manipulation. The mathematical model (Sec. 2) provides the theoretical grounding." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "The Discussion (Sec. 7) explicitly states: 'nothing in this paper should be interpreted as claiming that large language models cannot display emergent abilities; rather, our message is that previously claimed emergent abilities in [3, 8, 28, 33] might likely be a mirage.' The claims are bounded to the specific prior work analyzed." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 6 discusses alternative explanations: Caballero et al.'s piece-wise power law model (where emergence is real), Michaud et al.'s strong data assumptions, and Srivastava et al.'s original hypothesis. The paper positions itself against these alternatives." 
130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The entire paper is about the distinction between what metrics measure (per-token error rate) and what researchers claim they show (emergent abilities). The paper explicitly argues that the proxy (discontinuous/nonlinear metric) distorts the underlying phenomenon (smooth improvement)." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper states '4 models with 350M, 1.3B, 6.7B, 175B parameters are available via the OpenAI API' (footnote 3) but does not specify exact model version identifiers or API snapshot dates. The LaMDA analysis uses published BIG-Bench outputs." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper mentions '2-shot multiplication between two 2-digit integers and 2-shot addition between two 4-digit integers' but does not provide the actual prompt text used to query the GPT-3 models." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "Fig. 3 shows results at temperature 0.0 and 1.0 but no other API parameters (top-p, max tokens) are reported. Vision experiment hyperparameters (learning rates, optimizers, training epochs) are not reported in the main text." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. The paper queries models directly and analyzes outputs." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": false, 161 "justification": "For GPT-3 experiments, how arithmetic test examples were generated is not described. For the BIG-Bench meta-analysis, the filtering from all tasks to those with claimed emergent abilities is described but the exact criteria for the emergence score threshold used in Fig. 5A are not stated." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations section. The Discussion (Sec. 7) includes one caveat ('nothing in this paper should be interpreted as claiming that large language models cannot display emergent abilities') but no structured limitations discussion." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No specific threats to validity are discussed. For example, the paper does not address whether its mathematical model's independence assumption (footnote 1: 'the independence assumption is not true') might affect the conclusions, or whether the GPT-3 analysis with only 4 model sizes is sufficient." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "The Discussion states specific scope boundaries: 'nothing in this paper should be interpreted as claiming that large language models cannot display emergent abilities' and that the claims are specifically about 'previously claimed emergent abilities in [3, 8, 28, 33].'" 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "The GPT-3 model outputs collected by the authors are not released. BIG-Bench data is public but the authors' specific analysis scripts and intermediate data are not available." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3 describes collecting outputs from InstructGPT/GPT-3 via the OpenAI API on arithmetic tasks. Section 4 describes using BIG-Bench published outputs and hand-annotated emergence data from Wei (2022). Section 5 describes training vision models on standard datasets." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. 
Data sources are standard benchmarks and public APIs." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "The pipeline from raw BIG-Bench outputs to the emergence score analysis is not fully documented. How many task-metric-model family triplets were analyzed, how many were excluded, and the exact filtering steps are not stated." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding sources are mentioned in the paper. There is no acknowledgments section listing grants or sponsors." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly stated: all three authors are from Computer Science, Stanford University." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is provided." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper uses GPT-3/InstructGPT models but does not state their training data cutoff dates. For vision models trained by the authors, training data is standard benchmarks so cutoff is less relevant, but for the LLM evaluation it matters." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether GPT-3 may have seen arithmetic problems similar to the test set in its training data. The arithmetic tasks are generated, but the format may overlap with training data." 
235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "BIG-Bench tasks are publicly available and may appear in the training data of models trained after the benchmark's release; GPT-3 itself (2020) predates BIG-Bench (2022). No contamination analysis is performed. However, contamination would actually strengthen the paper's argument (if models saw answers, smooth improvement would be even more expected), so this is a lesser concern." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "The paper queries GPT-3/InstructGPT via API and trains vision models but does not report API costs or compute costs." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No computational budget is stated for any of the three experimental tracks." 
289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No seed sensitivity analysis for vision experiments (autoencoders, LeNet, transformers). GPT-3 API queries were run at temperature 0.0 and 1.0 (Fig. 3), but no multi-seed analysis is shown." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is not stated for any experiment. It is unclear whether vision models were trained once or multiple times." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search budget is reported for the vision experiments. The mathematical model parameters (c, α) are not described as searched." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "The paper does not describe how hyperparameters for vision experiments were selected or whether the shown configurations were cherry-picked from a larger set." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable. Interestingly, the paper notes the multiple comparisons problem in BIG-Bench (Sec. 7: '~10^6 task-metric-model family triplets') but as a criticism of prior work, not as something requiring correction in their own analysis." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors define their own metrics (Token Edit Distance, Reconstruction_c) and their own mathematical model to test their hypothesis. They do not acknowledge that their choices of alternative metrics and model parameters could be tuned to support their thesis." 
321 }, 322 "compute_budget_vs_performance": { 323 "applies": false, 324 "answer": false, 325 "justification": "The paper compares metrics at fixed model scales, not compute budgets. Compute differences are not the focus." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": true, 330 "justification": "The entire paper is a construct validity critique: it argues that benchmarks using discontinuous/nonlinear metrics do not validly measure emergence because the metric creates the appearance of a phase transition. This is the core contribution." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "No scaffolding is involved in any of the experiments." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of whether BIG-Bench tasks or arithmetic formats appeared in GPT-3's training data before the evaluation." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the 2-shot examples provide leakage or whether the evaluation format matches training distribution." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of independence between training and test data for any of the evaluated models." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention method is applied." 358 } 359 }, 360 "survey_methodology": { 361 "prisma_or_structured_protocol": { 362 "applies": true, 363 "answer": false, 364 "justification": "The meta-analysis of BIG-Bench uses a quantitative emergence score (Eq. 1, from Srivastava et al. 2022) but does not follow PRISMA or any structured review protocol for selecting which emergent abilities claims to analyze." 
365 }, 366 "quality_assessment_of_sources": { 367 "applies": true, 368 "answer": false, 369 "justification": "The paper does not assess the methodological quality of the source papers claiming emergent abilities. It takes the claims at face value and re-analyzes the metrics, but does not evaluate whether the original experiments were well-designed." 370 }, 371 "publication_bias_discussed": { 372 "applies": true, 373 "answer": true, 374 "justification": "Section 7 discusses a form of publication bias: 'emergent abilities claims are possibly infected by a failure to control for multiple comparisons. In BIG-Bench alone, there are ≥220 tasks, ~40 metrics per task, ~10 model families, for a total of ~10^6 task-metric-model family triplets.'" 375 } 376 } 377 }, 378 "claims": [ 379 { 380 "claim": "Emergent abilities of LLMs are created by the researcher's choice of nonlinear or discontinuous metrics, not fundamental changes in model behavior with scale.", 381 "evidence": "Mathematical model (Sec. 2) shows how power-law cross-entropy decline produces sharp transitions under Accuracy but smooth transitions under Token Edit Distance. Three empirical analyses confirm this: GPT-3 arithmetic (Sec. 3), BIG-Bench meta-analysis (Sec. 4), and induced emergence in vision (Sec. 5).", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Over 92% of claimed emergent abilities on BIG-Bench appear under just two metrics: Multiple Choice Grade and Exact String Match.", 386 "evidence": "Fig. 5C shows hand-annotated data from Wei (2022) revealing that 2 out of 39 BIG-Bench metrics account for >92% of emergent abilities. Multiple Choice Grade is discontinuous and Exact String Match is nonlinear.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Changing from nonlinear/discontinuous metrics to linear/continuous metrics removes emergent abilities in GPT-3 arithmetic tasks.", 391 "evidence": "Fig. 
3 shows that switching from Accuracy to Token Edit Distance on 2-digit multiplication and 4-digit addition tasks reveals smooth, predictable improvement. Fig. 4 shows that even under Accuracy, increasing test dataset size reveals above-chance performance in smaller models.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "LaMDA's emergent abilities on BIG-Bench disappear when measured with Brier Score instead of Multiple Choice Grade.", 396 "evidence": "Fig. 6 shows LaMDA results on multiple tasks where sharp transitions under Multiple Choice Grade become smooth curves under Brier Score.", 397 "supported": "strong" 398 }, 399 { 400 "claim": "Seemingly emergent abilities can be induced in vision tasks (autoencoders, CNNs, transformers) by choosing appropriately sharp metrics.", 401 "evidence": "Fig. 7 shows induced emergence in shallow autoencoders on CIFAR100 via Reconstruction_c metric. Fig. 8 shows induced emergence in autoregressive transformers on Omniglot. Fig. 10 shows induced emergence in LeNet on MNIST.", 402 "supported": "strong" 403 } 404 ], 405 "red_flags": [ 406 { 407 "flag": "All experiments confirm hypothesis", 408 "detail": "Every single experiment and analysis in the paper supports the authors' alternative explanation. No counterexamples, failed predictions, or boundary cases where the explanation is insufficient are discussed. This uniformly positive pattern is unusual for a paper making such a strong claim." 409 }, 410 { 411 "flag": "Limited model sampling", 412 "detail": "The GPT-3/InstructGPT analysis uses only 4 model sizes (350M, 1.3B, 6.7B, 175B). With so few data points, visual assessment of 'smooth vs sharp' transitions is subjective. No statistical test is applied to distinguish the two patterns." 
413 }, 414 { 415 "flag": "Metric choice for alternative may be optimized", 416 "detail": "The authors choose Token Edit Distance and Brier Score as their alternative metrics, and define the novel Reconstruction_c metric with specific threshold values. These choices could themselves be optimized to support the thesis, mirroring the very concern they raise about the original emergence claims." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "Emergent abilities of large language models", 422 "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani", "Colin Raffel", "Barret Zoph"], 423 "year": 2022, 424 "arxiv_id": "2206.07682", 425 "relevance": "The primary paper being critiqued — defines and catalogs emergent abilities of LLMs." 426 }, 427 { 428 "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models", 429 "authors": ["Aarohi Srivastava", "Abhinav Rastogi"], 430 "year": 2022, 431 "arxiv_id": "2206.04615", 432 "relevance": "BIG-Bench benchmark used as the primary data source for the meta-analysis of emergent abilities." 433 }, 434 { 435 "title": "Predictability and surprise in large generative models", 436 "authors": ["Deep Ganguli", "Danny Hernandez"], 437 "year": 2022, 438 "relevance": "Key prior work that emphasized emergent abilities as unpredictable and surprising, directly challenged by this paper." 439 }, 440 { 441 "title": "Language models are few-shot learners", 442 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 443 "year": 2020, 444 "relevance": "GPT-3 paper — the model family used in the primary experimental validation of the alternative explanation." 445 }, 446 { 447 "title": "Scaling laws for neural language models", 448 "authors": ["Jared Kaplan", "Sam McCandlish"], 449 "year": 2020, 450 "arxiv_id": "2001.08361", 451 "relevance": "Foundational neural scaling laws work that underpins the paper's mathematical model of smooth performance improvement." 
452 }, 453 { 454 "title": "Broken neural scaling laws", 455 "authors": ["Ethan Caballero", "Kshitij Gupta", "Irina Rish", "David Krueger"], 456 "year": 2022, 457 "arxiv_id": "2210.14891", 458 "relevance": "Alternative explanation for emergence via piece-wise power laws, contrasted with this paper's metric-based explanation." 459 }, 460 { 461 "title": "The quantization model of neural scaling", 462 "authors": ["Eric J. Michaud", "Ziming Liu", "Uzay Girit", "Max Tegmark"], 463 "year": 2023, 464 "relevance": "Posits that emergent abilities may be real under strong data assumptions, an alternative view to this paper's metric artifact explanation." 465 }, 466 { 467 "title": "Data distributional properties drive emergent in-context learning in transformers", 468 "authors": ["Stephanie CY Chan", "Adam Santoro"], 469 "year": 2022, 470 "relevance": "Studies emergence in in-context learning with controlled transformer experiments, methodology inspiration for this paper's vision experiments." 471 }, 472 { 473 "title": "Training compute-optimal large language models", 474 "authors": ["Jordan Hoffmann", "Sebastian Borgeaud"], 475 "year": 2022, 476 "arxiv_id": "2203.15556", 477 "relevance": "Chinchilla scaling laws — foundational work on compute-optimal training that informs the scaling assumptions." 478 }, 479 { 480 "title": "PaLM: Scaling language modeling with pathways", 481 "authors": ["Aakanksha Chowdhery", "Sharan Narang"], 482 "year": 2022, 483 "arxiv_id": "2204.02311", 484 "relevance": "One of the key model families claimed to exhibit emergent abilities, cited as evidence in the original emergence papers." 485 }, 486 { 487 "title": "Attention is all you need", 488 "authors": ["Ashish Vaswani", "Noam Shazeer"], 489 "year": 2017, 490 "relevance": "Transformer architecture used in the paper's induced emergence experiments with autoregressive models." 491 } 492 ] 493 }