scan-v5.json (23620B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "Emergent Abilities of Large Language Models", 6 "authors": [ 7 "Jason Wei", 8 "Yi Tay", 9 "Rishi Bommasani", 10 "Colin Raffel", 11 "Barret Zoph", 12 "Sebastian Borgeaud", 13 "Dani Yogatama", 14 "Maarten Bosma", 15 "Denny Zhou", 16 "Donald Metzler", 17 "Ed H. Chi", 18 "Tatsunori Hashimoto", 19 "Oriol Vinyals", 20 "Percy Liang", 21 "Jeff Dean", 22 "William Fedus" 23 ], 24 "year": 2022, 25 "venue": "Trans. Mach. Learn. Res.", 26 "arxiv_id": "2206.07682", 27 "doi": "10.48550/arXiv.2206.07682" 28 }, 29 "checklist": { 30 "claims_and_evidence": { 31 "abstract_claims_supported": { 32 "applies": true, 33 "answer": true, 34 "justification": "The abstract claims emergent abilities cannot be predicted by extrapolating smaller-model performance; this is supported by scaling curves across five model families (Figures 2–3) showing near-random performance until critical thresholds, with 20+ documented examples across §3–4 and appendices.", 35 "source": "haiku" 36 }, 37 "causal_claims_justified": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper uses causal language throughout ('further scaling could potentially further expand capabilities'), but the design is purely observational across heterogeneous model families with different architectures and training data, which cannot isolate scale as the causal factor from confounders like training data quality or architecture.", 41 "source": "haiku" 42 }, 43 "generalization_bounded": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper presents emergence as a general phenomenon of 'large language models' but evidence covers only five specific families (GPT-3, LaMDA, Gopher, Chinchilla, PaLM) on specific benchmarks; broad claims about all LLMs exceed the evidential scope, and the paper does not bound generalizations accordingly.", 47 "source": "haiku" 48 }, 49 "alternative_explanations_discussed": { 50 "applies": true, 51 
"answer": true, 52 "justification": "Section 5.1 explicitly discusses metric artifacts (exact string match masking gradual improvement) as an alternative explanation, presents cross-entropy loss as an alternative metric, and mentions architecture and training data quality as possible non-scale factors in §5.2.", 53 "source": "haiku" 54 }, 55 "proxy_outcome_distinction": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper uses accuracy/BLEU/exact match as proxies for 'abilities,' and Appendix A demonstrates these metrics mask continuous underlying improvement (CE loss improves for all small models), yet the paper still designates tasks as 'emergent' without resolving the proxy-outcome conflict.", 59 "source": "haiku" 60 } 61 }, 62 "limitations_and_scope": { 63 "limitations_section_present": { 64 "applies": true, 65 "answer": false, 66 "justification": "There is no dedicated limitations section; the Broader Impact Statement is a single paragraph noting the paper surveyed existing literature without proposing new methods, which does not constitute a limitations section.", 67 "source": "haiku" 68 }, 69 "threats_to_validity_specific": { 70 "applies": true, 71 "answer": false, 72 "justification": "The paper briefly discusses metric artifacts in §5.1 but does not systematically enumerate threats to validity such as selection bias in which tasks were included, heterogeneity of compared model families, or the non-comparability of FLOPs across architectures.", 73 "source": "haiku" 74 }, 75 "scope_boundaries_stated": { 76 "applies": true, 77 "answer": false, 78 "justification": "The focus on pre-trained Transformer language models is mentioned only in a footnote; the paper does not explicitly state what the results do NOT show, nor does it bound claims to specific model families, tasks, or training regimes.", 79 "source": "haiku" 80 } 81 }, 82 "conflicts_of_interest": { 83 "funding_disclosed": { 84 "applies": true, 85 "answer": false, 86 "justification": 
"No funding source is disclosed anywhere in the paper; the acknowledgments section thanks colleagues for feedback but contains no funding statement.", 87 "source": "haiku" 88 }, 89 "affiliations_disclosed": { 90 "applies": true, 91 "answer": true, 92 "justification": "Author affiliations are clearly listed on the first page: Google Research (9 authors), Stanford University (3), UNC Chapel Hill (1), and DeepMind (3).", 93 "source": "haiku" 94 }, 95 "funder_independent_of_outcome": { 96 "applies": true, 97 "answer": false, 98 "justification": "Twelve of sixteen authors are from Google Research and DeepMind, organizations that develop and commercially benefit from scaling the very models whose emergent capabilities are being highlighted; the institutional interest is not independent of the outcome.", 99 "source": "haiku" 100 }, 101 "financial_interests_declared": { 102 "applies": true, 103 "answer": false, 104 "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear anywhere in the paper.", 105 "source": "haiku" 106 } 107 }, 108 "scope_and_framing": { 109 "key_terms_defined": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper explicitly defines 'emergent abilities' in §2 ('An ability is emergent if it is not present in smaller models but is present in larger models') and 'emergence' as 'when quantitative changes in a system result in qualitative changes in behavior.'", 113 "source": "haiku" 114 }, 115 "intended_contribution_clear": { 116 "applies": true, 117 "answer": true, 118 "justification": "The introduction clearly states the paper will survey emergent abilities observed in prior work, categorize them by setting (few-shot prompting and augmented prompting), and raise open questions about why emergence occurs and whether further scaling will yield more.", 119 "source": "haiku" 120 }, 121 "engagement_with_prior_work": { 122 "applies": true, 123 "answer": true, 124 "justification": "The 
paper engages substantively with 70+ prior works, situating emergence against predictable scaling laws (Kaplan et al.), foundation model risks (Bommasani et al.), and specific model families, showing how this survey synthesizes and extends existing findings.", 125 "source": "haiku" 126 } 127 } 128 }, 129 "type_checklist": { 130 "survey": { 131 "search_and_selection": { 132 "search_strategy_reproducible": { 133 "applies": true, 134 "answer": false, 135 "justification": "No search strategy is described; the paper is a selective synthesis based on the authors' existing knowledge of the field rather than a documented, reproducible search process.", 136 "source": "haiku" 137 }, 138 "inclusion_exclusion_explicit": { 139 "applies": true, 140 "answer": false, 141 "justification": "No explicit inclusion or exclusion criteria are stated; examples are selected if they visually match the emergence definition (near-random then sharp jump), but this selection process is not formally described or consistently applied.", 142 "source": "haiku" 143 }, 144 "prisma_or_structured_protocol": { 145 "applies": true, 146 "answer": false, 147 "justification": "No PRISMA or other structured review protocol is used or mentioned anywhere in the paper.", 148 "source": "haiku" 149 }, 150 "search_terms_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "No search terms or queries are provided; papers are cited from the authors' familiarity with the literature without any documented search.", 154 "source": "haiku" 155 }, 156 "databases_listed": { 157 "applies": true, 158 "answer": false, 159 "justification": "No databases or literature sources are listed; the paper draws entirely on the authors' existing knowledge without documenting where papers were identified.", 160 "source": "haiku" 161 }, 162 "screening_process_documented": { 163 "applies": true, 164 "answer": false, 165 "justification": "No screening process with counts at each stage is documented; Appendix E 
classifies BIG-Bench tasks into emergence categories, but this is analysis of a single pre-existing benchmark, not a general literature screening workflow.", 166 "source": "haiku" 167 }, 168 "review_scope_justified": { 169 "applies": true, 170 "answer": false, 171 "justification": "The focus on pre-trained Transformer language models is noted only in a footnote without formal justification; there is no explanation of why specific model families, years, or benchmarks were included over others.", 172 "source": "haiku" 173 } 174 }, 175 "synthesis_quality": { 176 "conflicting_findings_acknowledged": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5.2 and Appendix F explicitly document 14 BIG-Bench tasks where PaLM 62B achieves emergence but GPT-3 175B and LaMDA 137B do not despite more FLOPs, acknowledging that scale alone does not consistently predict emergence across model families.", 180 "source": "haiku" 181 }, 182 "quality_assessment_of_sources": { 183 "applies": true, 184 "answer": false, 185 "justification": "Source papers are taken at face value without any quality assessment, risk-of-bias evaluation, or methodological rubric; the paper treats all cited results as equally reliable regardless of study design.", 186 "source": "haiku" 187 }, 188 "publication_bias_discussed": { 189 "applies": true, 190 "answer": false, 191 "justification": "Publication bias is never mentioned; the survey only includes positive demonstrations of emergence from the published literature without acknowledging that negative results (abilities that failed to emerge despite scaling) are far less likely to have been published.", 192 "source": "haiku" 193 }, 194 "quantitative_synthesis_present": { 195 "applies": true, 196 "answer": false, 197 "justification": "The paper presents qualitative categorization and illustrative scaling curves; there is no meta-analysis, effect size aggregation, or systematic vote counting—the synthesis is narrative with curated 
illustrative examples.", 198 "source": "haiku" 199 }, 200 "recommendations_supported_by_evidence": { 201 "applies": true, 202 "answer": true, 203 "justification": "The future directions in §5.6 (improved architectures, data scaling, better prompting) are grounded in documented cases where PaLM achieved emergence at smaller scale via different training, and where instruction-following was enabled in smaller encoder-decoder models (Sanh et al.).", 204 "source": "haiku" 205 } 206 } 207 } 208 }, 209 "claims": [ 210 { 211 "claim": "Emergent abilities of LLMs cannot be predicted by extrapolating the performance of smaller models.", 212 "evidence": "Scaling curves across five model families (Figures 2–3) show near-random performance across multiple orders of magnitude before a sharp jump, inconsistent with smooth extrapolation from smaller models.", 213 "supported": "moderate" 214 }, 215 { 216 "claim": "Chain-of-thought prompting only surpasses standard prompting at approximately 10^23 FLOPs (~100B parameters).", 217 "evidence": "Figure 3A shows GSM8K accuracy for LaMDA models; chain-of-thought underperforms or matches the no-chain-of-thought baseline below ~10^23 FLOPs and surpasses it above this threshold.", 218 "supported": "strong" 219 }, 220 { 221 "claim": "Instruction finetuning hurts performance for models ≤8B parameters and only improves performance at ≥68B parameters.", 222 "evidence": "Figure 3B from Wei et al. 
(2022a) shows 10-NLU task average dropping with instruction tuning for small LaMDA models and rising sharply for models above the threshold.", 223 "supported": "strong" 224 }, 225 { 226 "claim": "Model scale is not the only factor enabling emergent abilities; architecture and training data also matter.", 227 "evidence": "Section 5.2 and Appendix F document 14 BIG-Bench tasks where PaLM 62B achieves above-random performance while GPT-3 175B and LaMDA 137B with more FLOPs do not.", 228 "supported": "strong" 229 }, 230 { 231 "claim": "Cross-entropy loss improves continuously for small models even when downstream accuracy/BLEU metrics appear near random.", 232 "evidence": "Appendix A analysis of six BIG-Bench tasks (Figures 5–6) shows cross-entropy loss decreasing across all model scales including small models where the exact-match error rate is near 100%.", 233 "supported": "strong" 234 }, 235 { 236 "claim": "Emergent risks such as toxicity, bias, and data memorization also scale with model size.", 237 "evidence": "Section 5.4 cites Carlini et al. (memorization increases with scale), Weidinger et al. (ethical risks), and BIG-Bench BBQ results showing bias can increase for ambiguous contexts.", 238 "supported": "moderate" 239 } 240 ], 241 "methodology_tags": [ 242 "observational", 243 "benchmark-eval", 244 "survey" 245 ], 246 "key_findings": "The paper surveys and categorizes emergent abilities of large language models—capabilities absent in smaller models that appear sharply above certain compute thresholds—spanning few-shot prompting tasks and augmented strategies like chain-of-thought and instruction following. A self-undermining secondary finding is that cross-entropy loss improves continuously for small models even when discrete downstream metrics appear stuck near random, which is consistent with emergence being partly a metric artifact rather than a true capability discontinuity. 
Scale alone is insufficient: PaLM 62B achieves emergence on tasks where larger GPT-3 and LaMDA models fail, implicating training data quality and architecture as co-factors. The paper calls for understanding the mechanistic basis of emergence, lowering scale thresholds via improved training, and monitoring emergent safety risks.", 247 "red_flags": [ 248 { 249 "flag": "No systematic search methodology", 250 "detail": "The paper presents as a survey but uses no documented search strategy, databases, inclusion/exclusion criteria, or PRISMA protocol; examples were selected based on authors' familiarity, biasing toward dramatic phase-transition demonstrations." 251 }, 252 { 253 "flag": "Metric artifact insufficiently resolved", 254 "detail": "Appendix A shows cross-entropy loss improves continuously for small models even when accuracy/BLEU are near random, directly supporting the interpretation that 'emergence' is a metric artifact; the paper notes this but continues classifying tasks as emergent without resolving the contradiction." 255 }, 256 { 257 "flag": "Undisclosed institutional conflict of interest", 258 "detail": "Twelve of sixteen authors are from Google Research or DeepMind, organizations that develop and commercially benefit from large-scale LLMs whose capabilities are being surveyed and promoted; no conflict of interest is disclosed." 259 }, 260 { 261 "flag": "Causal language without causal design", 262 "detail": "The paper frames scale as causing emergence but cannot isolate scale from confounders (architecture, training data, training procedure) across the heterogeneous model families compared." 263 }, 264 { 265 "flag": "Publication bias unaddressed", 266 "detail": "The survey systematically excludes negative results—abilities that failed to emerge despite scaling—without acknowledging that published literature skews toward positive demonstrations, inflating the apparent prevalence and reliability of emergence." 
267 } 268 ], 269 "cited_papers": [ 270 { 271 "title": "Language Models are Few-Shot Learners (GPT-3)", 272 "relevance": "Foundational source establishing the few-shot prompting paradigm and early emergence observations; the primary baseline model family throughout the survey." 273 }, 274 { 275 "title": "Beyond the Imitation Game: Measuring and Extrapolating the Capabilities of Language Models (BIG-Bench)", 276 "relevance": "Primary source of emergent task examples; provides the majority of §3 examples, the task classification in Appendix E, and the flat-task candidates for future emergence." 277 }, 278 { 279 "title": "Scaling Laws for Neural Language Models", 280 "relevance": "Establishes the baseline expectation of predictable scaling that emergence is contrasted against; the paper's central claim is that emergence violates these smooth extrapolations." 281 }, 282 { 283 "title": "Training Compute-Optimal Large Language Models (Chinchilla)", 284 "relevance": "Source of Chinchilla model results and the argument that prior work underestimated training data requirements; used as a key model family showing emergence and revising scale assumptions." 285 }, 286 { 287 "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models", 288 "relevance": "Demonstrates chain-of-thought as a key emergent augmented prompting ability (Figure 3A); one of the clearest examples of a technique that hurts small models and helps large ones." 289 }, 290 { 291 "title": "On the Opportunities and Risks of Foundation Models", 292 "relevance": "Situates the emergence survey within the broader foundation models research agenda; provides context for emergence risks and the sociological shifts described in §5.5." 
293 }, 294 { 295 "title": "PaLM: Scaling Language Modeling with Pathways", 296 "relevance": "Source of PaLM model results; PaLM's ability to achieve emergence at smaller parameter counts than GPT-3/LaMDA is the central evidence for the 'beyond scaling' argument in §5.2." 297 }, 298 { 299 "title": "Scaling Language Models: Methods, Analysis and Insights from Training Gopher", 300 "relevance": "Source of Gopher model results used across multiple emergence examples including TruthfulQA and MMLU." 301 }, 302 { 303 "title": "Finetuned Language Models Are Zero-Shot Learners (FLAN)", 304 "relevance": "Demonstrates instruction-following as an emergent ability and documents that instruction tuning hurts performance for models of ~8B parameters or fewer and only helps at ~68B parameters and above; foundational example in §4." 305 }, 306 { 307 "title": "Predictability and Surprise in Large Generative Models", 308 "relevance": "Directly studies which LLM capabilities are unpredictable across scale, closely related to the emergence framing; cited for the observation that certain tasks cannot be predicted ahead of time." 309 } 310 ], 311 "engagement_factors": { 312 "practical_relevance": { 313 "score": 2, 314 "justification": "Practitioners gain awareness of which capability classes to expect at different compute scales, but the paper provides no concrete guidance beyond 'use a larger model' and explicitly notes emergence thresholds are uncertain and scale-dependent." 315 }, 316 "surprise_contrarian": { 317 "score": 3, 318 "justification": "Directly challenges the dominant scaling-laws view that LLM capabilities improve predictably; the claim that performance can be near-random then jump sharply was genuinely novel, widely cited, and subsequently contested." 
319 }, 320 "fear_safety": { 321 "score": 2, 322 "justification": "Section 5.4 explicitly discusses emergent risks including bias, toxicity, data memorization, and potential future harms (backdoors, inadvertent deception) that may only manifest in future, larger models." 323 }, 324 "drama_conflict": { 325 "score": 2, 326 "justification": "Creates conflict with the smooth-scaling community; this was later intensified when Schaeffer et al. (2023) argued emergence is entirely a metric artifact, turning this paper into a flashpoint in the scaling debate." 327 }, 328 "demo_ability": { 329 "score": 1, 330 "justification": "The surveyed abilities (chain-of-thought, instruction following) are demonstrable via existing APIs, but the paper itself provides no demo and most documented emergence thresholds require access to proprietary 100B+ models." 331 }, 332 "brand_recognition": { 333 "score": 3, 334 "justification": "Sixteen co-authors from Google Research, DeepMind, and Stanford; associated with GPT-3, PaLM, Gopher, and Chinchilla—among the highest-profile models and labs in the field." 
335 } 336 }, 337 "hn_data": { 338 "threads": [ 339 { 340 "hn_id": "40689833", 341 "title": "Survey of Rickrolling in Academic Literature [pdf]", 342 "points": 69, 343 "comments": 14, 344 "url": "https://news.ycombinator.com/item?id=40689833", 345 "created_at": "2024-06-15T13:54:57Z" 346 }, 347 { 348 "hn_id": "37543595", 349 "title": "Ask HN: Transformer alternatives that could have emergent properties when scaled", 350 "points": 6, 351 "comments": 3, 352 "url": "https://news.ycombinator.com/item?id=37543595", 353 "created_at": "2023-09-17T10:45:52Z" 354 }, 355 { 356 "hn_id": "36349856", 357 "title": "SqueezeLLM: Dense-and-Sparse Quantization", 358 "points": 5, 359 "comments": 1, 360 "url": "https://news.ycombinator.com/item?id=36349856", 361 "created_at": "2023-06-16T01:43:39Z" 362 }, 363 { 364 "hn_id": "35621735", 365 "title": "Emergent Abilities of Large Language Models", 366 "points": 4, 367 "comments": 1, 368 "url": "https://news.ycombinator.com/item?id=35621735", 369 "created_at": "2023-04-18T23:06:51Z" 370 }, 371 { 372 "hn_id": "36342137", 373 "title": "SqueezeLLM: Lossless 3-bit quantization with improved performance", 374 "points": 4, 375 "comments": 0, 376 "url": "https://news.ycombinator.com/item?id=36342137", 377 "created_at": "2023-06-15T15:43:48Z" 378 }, 379 { 380 "hn_id": "35410181", 381 "title": "Emergent Abilities of Large Language Models", 382 "points": 3, 383 "comments": 0, 384 "url": "https://news.ycombinator.com/item?id=35410181", 385 "created_at": "2023-04-02T13:16:17Z" 386 }, 387 { 388 "hn_id": "34785902", 389 "title": "Emergent Abilities of Large Language Models", 390 "points": 2, 391 "comments": 1, 392 "url": "https://news.ycombinator.com/item?id=34785902", 393 "created_at": "2023-02-14T05:48:21Z" 394 }, 395 { 396 "hn_id": "40419434", 397 "title": "Emergent Abilities of Large Language Models", 398 "points": 2, 399 "comments": 0, 400 "url": "https://news.ycombinator.com/item?id=40419434", 401 "created_at": "2024-05-20T19:46:53Z" 402 }, 403 { 
404 "hn_id": "47174820", 405 "title": "Emergent Abilities of Large Language Models (2022)", 406 "points": 1, 407 "comments": 0, 408 "url": "https://news.ycombinator.com/item?id=47174820", 409 "created_at": "2026-02-27T00:58:33Z" 410 }, 411 { 412 "hn_id": "41730269", 413 "title": "Emergent Abilities of Large Language Models (2022)", 414 "points": 1, 415 "comments": 0, 416 "url": "https://news.ycombinator.com/item?id=41730269", 417 "created_at": "2024-10-03T12:47:11Z" 418 } 419 ], 420 "top_points": 69, 421 "total_points": 97, 422 "total_comments": 20 423 } 424 }