scan.json (23072B)
1 { 2 "paper": { 3 "title": "Emergent Abilities of Large Language Models", 4 "authors": [ 5 "Jason Wei", 6 "Yi Tay", 7 "Rishi Bommasani", 8 "Colin Raffel", 9 "Barret Zoph", 10 "Sebastian Borgeaud", 11 "Dani Yogatama", 12 "Maarten Bosma", 13 "Denny Zhou", 14 "Donald Metzler", 15 "Ed H. Chi", 16 "Tatsunori Hashimoto", 17 "Oriol Vinyals", 18 "Percy Liang", 19 "Jeff Dean", 20 "William Fedus" 21 ], 22 "year": 2022, 23 "venue": "Transactions on Machine Learning Research", 24 "arxiv_id": "2206.07682" 25 }, 26 "scan_version": 2, 27 "active_modules": ["survey_methodology"], 28 "checklist": { 29 "artifacts": { 30 "code_released": { 31 "applies": true, 32 "answer": false, 33 "justification": "No code repository or analysis scripts are mentioned or released. The paper is a survey but could have released code for its BIG-Bench cross-entropy analysis or task classification annotations." 34 }, 35 "data_released": { 36 "applies": true, 37 "answer": true, 38 "justification": "The paper includes full task classification annotations in Appendix E (E.1–E.5), listing all 210+ BIG-Bench tasks categorized as emergent, smoothly increasing, flat, or other. The underlying BIG-Bench benchmark is publicly available." 39 }, 40 "environment_specified": { 41 "applies": false, 42 "answer": false, 43 "justification": "This is a survey paper that does not run new experiments requiring environment specifications." 44 }, 45 "reproduction_instructions": { 46 "applies": false, 47 "answer": false, 48 "justification": "This is a survey paper; no new experiments to reproduce." 49 } 50 }, 51 "statistical_methodology": { 52 "confidence_intervals_or_error_bars": { 53 "applies": false, 54 "answer": false, 55 "justification": "Survey paper that reports results from prior work; does not run its own experiments requiring confidence intervals." 56 }, 57 "significance_tests": { 58 "applies": false, 59 "answer": false, 60 "justification": "Survey paper; no new comparative claims requiring significance tests." 61 }, 62 "effect_sizes_reported": { 63 "applies": false, 64 "answer": false, 65 "justification": "Survey paper; does not run experiments requiring effect size reporting." 66 }, 67 "sample_size_justified": { 68 "applies": false, 69 "answer": false, 70 "justification": "Survey paper; no experimental samples." 71 }, 72 "variance_reported": { 73 "applies": false, 74 "answer": false, 75 "justification": "Survey paper; no own experimental runs to report variance across." 76 } 77 }, 78 "evaluation_design": { 79 "baselines_included": { 80 "applies": false, 81 "answer": false, 82 "justification": "Survey paper; does not propose or evaluate a system that would require baselines." 83 }, 84 "baselines_contemporary": { 85 "applies": false, 86 "answer": false, 87 "justification": "Survey paper; no system evaluation." 88 }, 89 "ablation_study": { 90 "applies": false, 91 "answer": false, 92 "justification": "Survey paper; no system with components to ablate." 93 }, 94 "multiple_metrics": { 95 "applies": true, 96 "answer": true, 97 "justification": "The paper compares multiple evaluation metrics for the same tasks (Appendix A): exact match, BLEU, ROUGE, BLEURT, cross-entropy loss, and accuracy. Figure 7 and Appendix A.2 explicitly compare how emergence appears under different metrics." 98 }, 99 "human_evaluation": { 100 "applies": false, 101 "answer": false, 102 "justification": "Survey paper; no system outputs to evaluate with human judges." 103 }, 104 "held_out_test_set": { 105 "applies": false, 106 "answer": false, 107 "justification": "Survey paper; no experiments requiring held-out test sets." 108 }, 109 "per_category_breakdown": { 110 "applies": true, 111 "answer": true, 112 "justification": "Extensive per-category breakdowns provided: Figure 8 breaks down BIG-Bench tasks by keyword tag, Appendix B breaks MMLU into four subject categories (Humanities, STEM, Social Science, Other), and Appendix E categorizes all 210+ tasks." 113 }, 114 "failure_cases_discussed": { 115 "applies": true, 116 "answer": true, 117 "justification": "Section 5.2 discusses limitations of scaling. Appendix E.4 lists dozens of 'flat' tasks where no model performs better than random, explicitly identifying where emergence fails. Section 5.1 acknowledges incomplete explanations." 118 }, 119 "negative_results_reported": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper reports tasks where emergence does not occur (Appendix E.4), notes that GPT-3 fails on WiC even at largest scale (Figure 2H), and discusses how instruction tuning hurts smaller models (Section 4, Figure 3B)." 123 } 124 }, 125 "claims_and_evidence": { 126 "abstract_claims_supported": { 127 "applies": true, 128 "answer": true, 129 "justification": "The abstract claims that scaling leads to unpredictable emergent abilities. The paper supports this with extensive examples across multiple model families (Figures 2-3, Table 1) showing near-random performance until a threshold scale." 130 }, 131 "causal_claims_justified": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper uses causal language ('scaling... lead to', 'scale to unpredictably enable') but the evidence is observational — models that are larger also differ in training data, architecture, and training procedure. Section 5.2 partially acknowledges this ('model scale is not the singular factor') but the overall framing attributes emergence to scale without adequate causal identification." 135 }, 136 "generalization_bounded": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section 2 explicitly states 'Our goal in this paper is not to characterize or claim that a specific scale is required to observe emergent abilities.' Section 5.2 notes emergence depends on data quality, architecture, and training, not just scale. Section 5 discusses limitations of scale-only framing." 140 }, 141 "alternative_explanations_discussed": { 142 "applies": true, 143 "answer": true, 144 "justification": "Section 5.1 discusses metric choice as an alternative explanation (exact match hiding gradual improvement). Appendix A provides cross-entropy loss analysis showing underlying gradual improvement. Section 5.2 discusses architecture, data quality, and training objective as alternative explanations for why emergence thresholds vary." 145 }, 146 "proxy_outcome_distinction": { 147 "applies": true, 148 "answer": true, 149 "justification": "Section 5.1 and Appendix A explicitly discuss how downstream metrics (exact match, accuracy) may be a poor proxy for underlying model improvement, showing that cross-entropy loss improves continuously even when downstream metrics appear flat. This is a substantive proxy-outcome distinction." 150 } 151 }, 152 "setup_transparency": { 153 "model_versions_specified": { 154 "applies": false, 155 "answer": false, 156 "justification": "Survey paper that does not run its own model experiments. Models discussed are from prior work with citations." 157 }, 158 "prompts_provided": { 159 "applies": false, 160 "answer": false, 161 "justification": "Survey paper; does not use prompting in its own experiments." 162 }, 163 "hyperparameters_reported": { 164 "applies": false, 165 "answer": false, 166 "justification": "Survey paper; no own experiments requiring hyperparameter reporting." 167 }, 168 "scaffolding_described": { 169 "applies": false, 170 "answer": false, 171 "justification": "No agentic scaffolding used." 172 }, 173 "data_preprocessing_documented": { 174 "applies": true, 175 "answer": false, 176 "justification": "The paper does not describe the process for selecting which emergent abilities or papers to include in the survey. The task classification in Appendix A.3 mentions 'two co-authors of the paper worked together and agreed with confidence on all the tasks labeled as emergent,' but the selection criteria for which prior work to survey are not documented." 177 } 178 }, 179 "limitations_and_scope": { 180 "limitations_section_present": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 5 contains substantial discussion of limitations: Section 5.1 on incomplete explanations, Section 5.2 on factors beyond scaling, Section 5.4 on emergent risks. The Broader Impact Statement also acknowledges unpredictability." 184 }, 185 "threats_to_validity_specific": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 5.1 raises specific threats: evaluation metrics may disguise gradual improvement as emergence. Appendix A provides concrete analysis of this threat. Section 5.2 raises that PaLM 62B shows emergence where larger GPT-3/LaMDA do not, challenging the scale-only narrative." 189 }, 190 "scope_boundaries_stated": { 191 "applies": true, 192 "answer": true, 193 "justification": "Section 2 states: 'Our goal in this paper is not to characterize or claim that a specific scale is required to observe emergent abilities.' The paper explicitly acknowledges limitations of scale as the sole variable (Section 5.2, 5.3) and that some abilities may never emerge (end of Section 5.2)." 194 } 195 }, 196 "data_integrity": { 197 "raw_data_available": { 198 "applies": true, 199 "answer": true, 200 "justification": "The underlying BIG-Bench benchmark data is publicly available and the paper references it with URLs. The task annotations are fully listed in Appendix E." 201 }, 202 "data_collection_described": { 203 "applies": true, 204 "answer": false, 205 "justification": "The paper does not describe its systematic process for collecting examples of emergent abilities from the literature. It surveys 'a range of prior work' without describing how those papers were identified or selected." 206 }, 207 "recruitment_methods_described": { 208 "applies": false, 209 "answer": false, 210 "justification": "No human participants; data comes from publicly available benchmarks." 211 }, 212 "data_pipeline_documented": { 213 "applies": true, 214 "answer": false, 215 "justification": "No description of how the survey scope was determined, which papers were included/excluded, or how the examples in Figures 2-3 were selected from the broader literature." 216 } 217 }, 218 "conflicts_of_interest": { 219 "funding_disclosed": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding acknowledgment section. Authors are from Google Research, DeepMind, Stanford, and UNC Chapel Hill, but no explicit funding statement." 223 }, 224 "affiliations_disclosed": { 225 "applies": true, 226 "answer": true, 227 "justification": "Author affiliations are clearly listed: Google Research, Stanford University, UNC Chapel Hill, and DeepMind. These affiliations are relevant since many of the models discussed (PaLM, LaMDA, Gopher, Chinchilla) are from Google/DeepMind." 228 }, 229 "funder_independent_of_outcome": { 230 "applies": true, 231 "answer": false, 232 "justification": "Authors are employed by Google Research and DeepMind, which have a direct commercial interest in demonstrating that scaling up language models yields valuable emergent capabilities. This conflict is not acknowledged." 233 }, 234 "financial_interests_declared": { 235 "applies": true, 236 "answer": false, 237 "justification": "No competing interests or financial disclosure statement is provided. Google and DeepMind employees writing about why scaling up language models produces valuable emergent abilities is a notable undisclosed conflict." 238 } 239 }, 240 "contamination": { 241 "training_cutoff_stated": { 242 "applies": false, 243 "answer": false, 244 "justification": "The paper does not evaluate a pre-trained model's capability on benchmarks itself — it surveys prior work's evaluations." 245 }, 246 "train_test_overlap_discussed": { 247 "applies": false, 248 "answer": false, 249 "justification": "Survey paper; does not evaluate models on benchmarks directly." 250 }, 251 "benchmark_contamination_addressed": { 252 "applies": false, 253 "answer": false, 254 "justification": "Survey paper; does not evaluate models on benchmarks directly." 255 } 256 }, 257 "human_studies": { 258 "pre_registered": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "irb_or_ethics_approval": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "demographics_reported": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "inclusion_exclusion_criteria": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 }, 278 "randomization_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants." 282 }, 283 "blinding_described": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants." 287 }, 288 "attrition_reported": { 289 "applies": false, 290 "answer": false, 291 "justification": "No human participants." 292 } 293 }, 294 "cost_and_practicality": { 295 "inference_cost_reported": { 296 "applies": false, 297 "answer": false, 298 "justification": "Survey paper; no own method whose cost needs reporting." 299 }, 300 "compute_budget_stated": { 301 "applies": false, 302 "answer": false, 303 "justification": "Survey paper; no own experiments requiring compute budget reporting." 304 } 305 }, 306 "survey_methodology": { 307 "prisma_or_structured_protocol": { 308 "applies": true, 309 "answer": false, 310 "justification": "No structured review protocol. The paper does not describe systematic search queries, databases searched, or inclusion/exclusion criteria. Examples appear to be selected ad-hoc from the authors' knowledge of the literature." 311 }, 312 "quality_assessment_of_sources": { 313 "applies": true, 314 "answer": false, 315 "justification": "The paper does not assess the methodological quality of the source papers it surveys. Results from all cited papers are treated as equally reliable regardless of experimental rigor." 316 }, 317 "publication_bias_discussed": { 318 "applies": true, 319 "answer": false, 320 "justification": "No discussion of publication bias. Papers showing emergent abilities are more likely to be published than papers showing gradual improvement, but this selection bias is not addressed." 321 } 322 } 323 }, 324 "claims": [ 325 { 326 "claim": "Certain abilities of large language models are emergent — not present in smaller models but appearing in larger models, unpredictable by extrapolating smaller model performance.", 327 "evidence": "Figures 2-3 and Table 1 show scaling curves across GPT-3, LaMDA, Gopher, Chinchilla, and PaLM where performance is near-random until a critical threshold, then jumps sharply above random.", 328 "supported": "moderate" 329 }, 330 { 331 "claim": "Cross-entropy loss improves continuously even when downstream metrics like exact match and accuracy appear flat, suggesting emergence may partly reflect metric choice.", 332 "evidence": "Appendix A (Figures 5-6) shows cross-entropy loss improving monotonically for six BIG-Bench tasks where downstream metrics are near-random at smaller scales.", 333 "supported": "strong" 334 }, 335 { 336 "claim": "Emergence is not solely a function of model size — PaLM 62B shows emergence on 14 BIG-Bench tasks where GPT-3 175B and LaMDA 137B do not.", 337 "evidence": "Section 5.2 and Appendix F list these 14 tasks. PaLM uses different training data (more multilingual and code data) and architecture (split digit-encodings).", 338 "supported": "strong" 339 }, 340 { 341 "claim": "Social Science and Humanities MMLU categories show stronger emergence than STEM.", 342 "evidence": "Appendix B, Figure 9-10 show the performance jump from 7B to 70B/280B models is largest for Social Science and Humanities, smallest for STEM.", 343 "supported": "moderate" 344 } 345 ], 346 "methodology_tags": ["meta-analysis"], 347 "key_findings": "The paper defines and catalogs emergent abilities of large language models — capabilities that appear only above certain scale thresholds and cannot be predicted by extrapolating smaller models. It documents emergence across few-shot prompting tasks (arithmetic, QA, language understanding) and augmented prompting strategies (chain-of-thought, instruction following, scratchpad). Importantly, the paper also shows that cross-entropy loss improves continuously even when downstream task metrics appear flat, suggesting emergence may partly be an artifact of metric choice. The paper notes that emergence depends on multiple factors beyond raw scale, including data quality and model architecture.", 348 "red_flags": [ 349 { 350 "flag": "Conflict of interest not acknowledged", 351 "detail": "Most authors are from Google Research and DeepMind, which build and commercialize the large language models discussed. The paper's central thesis — that scaling up produces valuable emergent abilities — directly supports their employers' business strategy. This conflict is never acknowledged." 352 }, 353 { 354 "flag": "Selection bias in examples", 355 "detail": "The paper selectively surveys examples of emergence from the literature without a systematic review protocol. There is no accounting for how many abilities scale smoothly versus emerge abruptly, creating a potentially misleading impression of how common emergence is." 356 }, 357 { 358 "flag": "No quality assessment of source papers", 359 "detail": "Results from all cited papers are treated as equally reliable. The quality of the underlying evaluations (e.g., BIG-Bench task design, evaluation methodology) is not assessed, potentially laundering weak results." 360 }, 361 { 362 "flag": "Metric choice confound undermines core thesis", 363 "detail": "The paper's own Appendix A shows that cross-entropy loss improves continuously, suggesting 'emergence' may be an artifact of using discrete metrics (exact match, accuracy). This finding substantially undermines the paper's framing of emergence as a qualitative phase transition, yet it is presented as a minor observation rather than a central caveat." 364 } 365 ], 366 "cited_papers": [ 367 { 368 "title": "Language models are few-shot learners", 369 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 370 "year": 2020, 371 "relevance": "GPT-3 paper establishing few-shot prompting paradigm; key source of emergence examples in this survey." 372 }, 373 { 374 "title": "Scaling laws for neural language models", 375 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 376 "year": 2020, 377 "arxiv_id": "2001.08361", 378 "relevance": "Foundational scaling laws paper showing predictable performance improvements, which emergence departs from." 379 }, 380 { 381 "title": "Training compute-optimal large language models", 382 "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"], 383 "year": 2022, 384 "arxiv_id": "2203.15556", 385 "relevance": "Chinchilla paper arguing prior work underestimated training data needs; relevant to understanding compute-optimal scaling." 386 }, 387 { 388 "title": "Chain of thought prompting elicits reasoning in large language models", 389 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 390 "year": 2022, 391 "arxiv_id": "2201.11903", 392 "relevance": "Chain-of-thought prompting as an emergent augmented prompting ability for multi-step reasoning." 393 }, 394 { 395 "title": "PaLM: Scaling language modeling with Pathways", 396 "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"], 397 "year": 2022, 398 "arxiv_id": "2204.02311", 399 "relevance": "540B parameter model showing emergence on tasks where smaller models fail, key evidence for emergence thesis." 400 }, 401 { 402 "title": "Beyond the imitation game: Measuring and extrapolating the capabilities of language models", 403 "authors": ["BIG-Bench"], 404 "year": 2022, 405 "arxiv_id": "2206.04615", 406 "relevance": "BIG-Bench benchmark suite providing 200+ tasks used as primary source of emergence examples." 407 }, 408 { 409 "title": "On the opportunities and risks of foundation models", 410 "authors": ["Rishi Bommasani", "Drew A. Hudson"], 411 "year": 2021, 412 "arxiv_id": "2108.07258", 413 "relevance": "Foundation models survey discussing risks and opportunities including emergent capabilities and risks." 414 }, 415 { 416 "title": "Language models (mostly) know what they know", 417 "authors": ["Saurav Kadavath", "Tom Conerly", "Amanda Askell"], 418 "year": 2022, 419 "arxiv_id": "2207.05221", 420 "relevance": "Model calibration study showing emergent self-knowledge via P(True) technique at sufficient scale." 421 }, 422 { 423 "title": "Training language models to follow instructions with human feedback", 424 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 425 "year": 2022, 426 "arxiv_id": "2203.02155", 427 "relevance": "InstructGPT/RLHF showing instruction following can be achieved at smaller scale via fine-tuning, relevant to 'beyond scaling' discussion." 428 }, 429 { 430 "title": "TruthfulQA: Measuring how models mimic human falsehoods", 431 "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"], 432 "year": 2021, 433 "arxiv_id": "2109.07958", 434 "relevance": "Benchmark showing models can become less truthful with scale, relevant to emergent risks discussion." 435 }, 436 { 437 "title": "Self-consistency improves chain of thought reasoning in language models", 438 "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"], 439 "year": 2022, 440 "arxiv_id": "2203.11171", 441 "relevance": "Emergent augmented prompting technique that only helps at sufficient scale." 442 }, 443 { 444 "title": "Data distributional properties drive emergent few-shot learning in transformers", 445 "authors": ["Stephanie C.Y. Chan", "Adam Santoro", "Andrew K. Lampinen"], 446 "year": 2022, 447 "arxiv_id": "2205.05055", 448 "relevance": "Analyzes what data properties enable emergent few-shot learning, relevant to understanding emergence mechanisms." 449 } 450 ] 451 }