scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (22884B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "Emergent Abilities of Large Language Models",
      6     "authors": [
      7       "Jason Wei",
      8       "Yi Tay",
      9       "Rishi Bommasani",
     10       "Colin Raffel",
     11       "Barret Zoph",
     12       "Sebastian Borgeaud",
     13       "Dani Yogatama",
     14       "Maarten Bosma",
     15       "Denny Zhou",
     16       "Donald Metzler",
     17       "Ed H. Chi",
     18       "Tatsunori Hashimoto",
     19       "Oriol Vinyals",
     20       "Percy Liang",
     21       "Jeff Dean",
     22       "William Fedus"
     23     ],
     24     "year": 2022,
     25     "venue": "Trans. Mach. Learn. Res.",
     26     "arxiv_id": "2206.07682",
     27     "doi": "10.48550/arXiv.2206.07682"
     28   },
     29   "checklist": {
     30     "claims_and_evidence": {
     31       "abstract_claims_supported": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The abstract claims that emergent abilities exist and cannot be predicted by extrapolating smaller models. The paper provides extensive evidence through scaling curves (Figures 2-4) showing near-random performance followed by sharp jumps.",
     35         "source": "opus"
     36       },
     37       "causal_claims_justified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper uses causal language like 'scaling up language models... can lead to better performance' and 'scale to unpredictably enable new techniques' (§5). However, it acknowledges in §5.1 that 'there are currently few compelling explanations for why such abilities emerge' and that scale covaries with data quality, architecture, etc. The causal mechanism is not established.",
     41         "source": "opus"
     42       },
     43       "generalization_bounded": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Section 2 explicitly notes 'the scale at which an ability is first observed to emerge depends on a number of factors and is not an immutable property of the ability.' Section 5.2 discusses how emergence can occur at smaller scales with better data/architecture. The paper is careful not to claim specific scale thresholds are universal.",
     47         "source": "opus"
     48       },
     49       "alternative_explanations_discussed": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 5.1 discusses alternative explanations: evaluation metrics masking gradual improvement, model depth requirements, and memorization. Appendix A provides cross-entropy loss analysis showing improvements are real but masked by downstream metrics. Section 5.2 discusses architecture and data quality as alternative factors.",
     53         "source": "opus"
     54       },
     55       "proxy_outcome_distinction": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5.1 explicitly discusses how downstream metrics (exact match, accuracy) may be proxies that mask gradual improvement visible in cross-entropy loss. Appendix A provides detailed analysis of this proxy gap across six tasks.",
     59         "source": "opus"
     60       }
     61     },
     62     "limitations_and_scope": {
     63       "limitations_section_present": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 5 contains extensive discussion of limitations including §5.1 (potential explanations/metric artifacts), §5.2 (beyond scaling — emergence not solely about scale), §5.4 (emergent risks), and the Broader Impact Statement.",
     67         "source": "opus"
     68       },
     69       "threats_to_validity_specific": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 5.1 specifically discusses that evaluation metrics may create an illusion of emergence by not giving partial credit: 'using exact string match as the evaluation metric for long-sequence targets may disguise compounding incremental improvements as emergence.' Appendix A.3 notes the subjectivity of the emergence classification.",
     73         "source": "opus"
     74       },
     75       "scope_boundaries_stated": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 2 states: 'Our goal in this paper is not to characterize or claim that a specific scale is required to observe emergent abilities, but rather, we aim to discuss examples of emergent behavior in prior work.' Footnote 1 limits scope to pre-trained Transformer language models.",
     79         "source": "opus"
     80       }
     81     },
     82     "conflicts_of_interest": {
     83       "funding_disclosed": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding statement or acknowledgment of funding sources. The Acknowledgments section thanks individuals but does not mention grants or funding.",
     87         "source": "opus"
     88       },
     89       "affiliations_disclosed": {
     90         "applies": true,
     91         "answer": true,
      92         "justification": "Author affiliations are clearly listed: Google Research, Stanford University, UNC Chapel Hill, DeepMind. Two of these (Google Research and DeepMind) are companies that built several of the models being surveyed.",
     93         "source": "opus"
     94       },
     95       "funder_independent_of_outcome": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Authors are primarily from Google Research and DeepMind — companies that build the largest language models discussed. A finding that 'emergence exists and more scaling may unlock more abilities' directly supports their business interests in continued scaling. This conflict is not acknowledged.",
     99         "source": "opus"
    100       },
    101       "financial_interests_declared": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No competing interests statement is present. Authors from Google and DeepMind have direct financial interest in demonstrating value of scaling language models.",
    105         "source": "opus"
    106       }
    107     },
    108     "scope_and_framing": {
    109       "key_terms_defined": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper precisely defines 'emergent ability' in §2: 'An ability is emergent if it is not present in smaller models but is present in larger models,' and further specifies what 'not present' means (near-random performance) and how scale is measured (training FLOPs).",
    113         "source": "haiku"
    114       },
    115       "intended_contribution_clear": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper clearly states it surveys emergent abilities as observed in prior work, categorizes them across prompting settings, and raises questions about why emergence occurs and whether further scaling will produce more — the survey and framing contribution is explicitly articulated.",
    119         "source": "haiku"
    120       },
    121       "engagement_with_prior_work": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper engages substantively with prior work, situating emergence relative to scaling laws (Kaplan et al. 2020), discussing how results from BIG-Bench, GPT-3, Gopher, Chinchilla, and PaLM relate to each other, and contrasting findings (e.g., Sanh et al. finding instruction following in smaller encoder-decoder models).",
    125         "source": "haiku"
    126       }
    127     }
    128   },
    129   "type_checklist": {
    130     "survey": {
    131       "search_and_selection": {
    132         "search_strategy_reproducible": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No search strategy is described; the paper selects examples from prior work the authors were presumably already familiar with, with no documented process for identifying which papers to include.",
    136           "source": "haiku"
    137         },
    138         "inclusion_exclusion_explicit": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "No explicit inclusion/exclusion criteria are stated for the survey's scope; papers appear to have been selected because they demonstrate what the authors considered emergent behavior matching their definition, but this selection logic is not articulated.",
    142           "source": "haiku"
    143         },
    144         "prisma_or_structured_protocol": {
    145           "applies": true,
    146           "answer": false,
    147           "justification": "No PRISMA or other structured review protocol is followed or mentioned; this is a narrative synthesis, not a systematic review.",
    148           "source": "haiku"
    149         },
    150         "search_terms_provided": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No search terms or queries are provided; the paper makes no pretense of having conducted a database search.",
    154           "source": "haiku"
    155         },
    156         "databases_listed": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "No databases or sources searched are listed; the paper relies entirely on the authors' prior knowledge of the literature.",
    160           "source": "haiku"
    161         },
    162         "screening_process_documented": {
    163           "applies": true,
    164           "answer": false,
    165           "justification": "No screening process is documented; there are no counts of papers identified, screened, or included at any stage.",
    166           "source": "haiku"
    167         },
    168         "review_scope_justified": {
    169           "applies": true,
    170           "answer": false,
     171           "justification": "The scope (pre-trained Transformer language models, scaling-related emergence) is implicit but never formally justified — the paper does not explain why these years, why only dense Transformers, or why these benchmarks were chosen.",
    172           "source": "haiku"
    173         }
    174       },
    175       "synthesis_quality": {
    176         "conflicting_findings_acknowledged": {
    177           "applies": true,
    178           "answer": true,
     179           "justification": "The paper acknowledges conflicting evidence in §5.2, e.g., that instruction following emerged in smaller encoder-decoder models (Sanh et al.), contrary to the initial finding of a 68B+ parameter threshold, and that PaLM 62B achieves emergence on tasks where the larger GPT-3 and LaMDA models did not.",
    180           "source": "haiku"
    181         },
    182         "quality_assessment_of_sources": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "No quality assessment of source papers is performed; results from all reviewed papers are treated equally with no rubric, risk-of-bias assessment, or evaluation of methodological rigor of the cited work.",
    186           "source": "haiku"
    187         },
    188         "publication_bias_discussed": {
    189           "applies": true,
    190           "answer": false,
    191           "justification": "Publication bias is not mentioned; the survey does not acknowledge that scaling success stories are more likely to be published than null results or cases where scaling failed to produce emergence.",
    192           "source": "haiku"
    193         },
    194         "quantitative_synthesis_present": {
    195           "applies": true,
    196           "answer": false,
    197           "justification": "There is no meta-analysis, vote counting, or effect size aggregation; the paper organizes and lists results from prior work narratively without statistically synthesizing them.",
    198           "source": "haiku"
    199         },
    200         "recommendations_supported_by_evidence": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Future work directions in §5.6 (model scaling, architecture improvements, data scaling, better prompting, understanding emergence) are grounded in the observed patterns from reviewed literature rather than unsupported speculation.",
    204           "source": "haiku"
    205         }
    206       }
    207     }
    208   },
    209   "claims": [
    210     {
    211       "claim": "Emergent abilities are not present in smaller models but appear above a certain scale threshold, showing a sharp phase transition in performance.",
    212       "evidence": "Scaling curves across 23+ abilities (BIG-Bench tasks, TruthfulQA, MMLU, WiC, chain-of-thought, etc.) showing near-random performance that suddenly jumps to substantially above random at specific FLOPs thresholds.",
    213       "supported": "moderate"
    214     },
    215     {
    216       "claim": "Emergent abilities cannot be predicted by extrapolating from smaller model performance.",
    217       "evidence": "Multiple examples where scaling laws for cross-entropy loss do not predict downstream task emergence; the WiC case where GPT-3 architects incorrectly attributed failure to architecture rather than insufficient scale.",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "Evaluation metrics based on discrete correctness (exact match, accuracy) can create the appearance of sharp emergence when underlying cross-entropy loss improves continuously.",
    222       "evidence": "Appendix A cross-entropy analysis of six BIG-Bench tasks shows loss improves even at small scales where downstream metrics show near-random performance.",
    223       "supported": "strong"
    224     },
    225     {
    226       "claim": "Scale is not the only factor enabling emergent abilities; data quality, architecture, and training procedures also matter.",
    227       "evidence": "PaLM 62B achieves emergence on 14 BIG-Bench tasks where larger GPT-3 175B and LaMDA 137B do not; Sanh et al. induced instruction following in 11B encoder-decoder models.",
    228       "supported": "moderate"
    229     },
    230     {
    231       "claim": "Emergent risks (bias, toxicity, memorization, harmful content) also arise with scale.",
    232       "evidence": "Review of prior work (Carlini et al., Askell et al., Weidinger et al.) showing increased memorization, potential toxicity, and adversarial vulnerabilities at scale; framed as risk inventory rather than new empirical finding.",
    233       "supported": "moderate"
    234     },
    235     {
    236       "claim": "Further scaling may unlock additional emergent abilities on tasks where current models perform at random.",
    237       "evidence": "Appendix E.4 lists ~45 BIG-Bench tasks where no current model exceeds random; framed as candidates for future emergence, but this is speculative extrapolation.",
    238       "supported": "weak"
    239     }
    240   ],
    241   "methodology_tags": [
    242     "benchmark-eval",
    243     "theoretical"
    244   ],
    245   "key_findings": "This paper surveys emergent abilities in large language models — capabilities absent in smaller models that appear suddenly at certain scale thresholds — cataloging 23+ such abilities across few-shot prompting and augmented prompting settings from five model families. The authors acknowledge but do not resolve a critical confound: discrete evaluation metrics (exact match, accuracy) can disguise continuously improving cross-entropy loss as a sharp emergence event, meaning 'emergence' may partly be a measurement artifact. Despite being framed as a survey, the paper uses no systematic search methodology and relies primarily on results from the authors' own organizations (Google Research, DeepMind), with no quality assessment of reviewed work.",
    246   "red_flags": [
    247     {
    248       "flag": "Metric artifact unresolved",
    249       "detail": "The paper demonstrates in Appendix A that cross-entropy loss improves continuously at small scales where downstream metrics appear random, yet continues to treat emergence as a real phenomenon rather than a measurement artifact. The authors call this an 'incomplete explanation' without resolving the tension."
    250     },
    251     {
    252       "flag": "No systematic search methodology",
     253       "detail": "Classified as a survey but uses no reproducible search strategy, no inclusion/exclusion criteria, no PRISMA protocol, and no screening documentation. Papers appear to have been selected based on the authors' prior knowledge, which risks biasing the selection toward their own work."
    254     },
    255     {
    256       "flag": "Self-serving model selection",
     257       "detail": "12 of 16 authors are from Google Research or DeepMind; the most prominently featured 'emergent' models are their own (PaLM, LaMDA, Gopher, Chinchilla). No conflict of interest or funding disclosure is provided."
    258     },
    259     {
    260       "flag": "No funding disclosure",
    261       "detail": "Despite being authored primarily by employees of two of the world's largest AI labs conducting commercially relevant research, no funding source or competing interests statement appears anywhere in the paper."
    262     },
    263     {
    264       "flag": "Definition enables circular selection",
    265       "detail": "Emergence is defined as performance near-random until a threshold, then above random. This definition is applied retrospectively to select examples — tasks are labeled emergent because they show this pattern — potentially over-representing model-family-specific artifacts as general phenomena."
    266     }
    267   ],
    268   "cited_papers": [
    269     {
    270       "title": "Language Models are Few-Shot Learners (GPT-3)",
    271       "relevance": "Foundational model demonstrating few-shot prompting at scale; most cited source of emergent ability examples"
    272     },
    273     {
    274       "title": "Scaling Laws for Neural Language Models",
    275       "relevance": "Establishes predictable scaling laws that emergence claims are positioned against — emergence is what scaling laws do NOT predict"
    276     },
    277     {
    278       "title": "Beyond the Imitation Game: Measuring and Extrapolating the Capabilities of Language Models (BIG-Bench)",
    279       "relevance": "Primary benchmark suite used to document emergent abilities across 200+ tasks"
    280     },
    281     {
    282       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    283       "relevance": "Key example of an augmented prompting strategy that is itself emergent, only helping at large scales"
    284     },
    285     {
    286       "title": "PaLM: Scaling Language Modeling with Pathways",
    287       "relevance": "Largest model studied; exhibits emergence on tasks where smaller models fail, and demonstrates scale-vs-architecture confound"
    288     },
    289     {
    290       "title": "Training Compute-Optimal Large Language Models (Chinchilla)",
    291       "relevance": "Key model family in scaling curves; also introduces compute-optimal training argument affecting emergence thresholds"
    292     },
    293     {
    294       "title": "On the Opportunities and Risks of Foundation Models",
    295       "relevance": "Broader context for emergence and emergent risks; co-authored by several of the same authors"
    296     },
    297     {
    298       "title": "Predictability and Surprise in Large Generative Models",
    299       "relevance": "Prior work on unpredictability of LLM behavior that the emergence survey builds on"
    300     },
    301     {
    302       "title": "Finetuned Language Models are Zero-Shot Learners (FLAN)",
     303       "relevance": "Instruction-following emergence example (initially reported at a 68B+ parameter threshold); the story is complicated by Sanh et al., who show that smaller encoder-decoder models can exhibit similar behavior"
    304     },
    305     {
    306       "title": "Data Distributional Properties Drive Emergent Few-Shot Learning in Transformers",
    307       "relevance": "Provides mechanistic alternative explanation for emergence based on training data properties rather than scale per se"
    308     }
    309   ],
    310   "engagement_factors": {
    311     "practical_relevance": {
    312       "score": 2,
    313       "justification": "Helps practitioners anticipate capability thresholds and understand why smaller models fail on tasks that larger ones succeed at, though the compute requirements to observe emergence are beyond most practitioners."
    314     },
    315     "surprise_contrarian": {
    316       "score": 3,
    317       "justification": "Directly challenges the prevailing scaling laws narrative by showing that some capabilities are genuinely unpredictable and cannot be extrapolated from smaller model performance."
    318     },
    319     "fear_safety": {
    320       "score": 2,
    321       "justification": "Section 5.4 explicitly discusses emergent risks including toxicity, memorization, bias, and future adversarial vulnerabilities that may only manifest at larger scales, raising genuine safety concerns."
    322     },
    323     "drama_conflict": {
    324       "score": 2,
    325       "justification": "The WiC benchmark narrative — where GPT-3's architects incorrectly diagnosed an architecture problem that was actually solved by more scale — provides a concrete dramatic example of the unpredictability thesis."
    326     },
    327     "demo_ability": {
    328       "score": 0,
    329       "justification": "Pure survey paper with no demos, datasets, or tools released; readers cannot try anything themselves."
    330     },
    331     "brand_recognition": {
    332       "score": 3,
    333       "justification": "Authored by researchers from Google Research, DeepMind, and Stanford; covers GPT-3, PaLM, Gopher, Chinchilla, and LaMDA — maximum brand density for 2022 AI research."
    334     }
    335   },
    336   "hn_data": {
    337     "threads": [
    338       {
    339         "hn_id": "40689833",
    340         "title": "Survey of Rickrolling in Academic Literature [pdf]",
    341         "points": 69,
    342         "comments": 14,
    343         "url": "https://news.ycombinator.com/item?id=40689833",
    344         "created_at": "2024-06-15T13:54:57Z"
    345       },
    346       {
    347         "hn_id": "37543595",
    348         "title": "Ask HN: Transformer alternatives that could have emergent properties when scaled",
    349         "points": 6,
    350         "comments": 3,
    351         "url": "https://news.ycombinator.com/item?id=37543595",
    352         "created_at": "2023-09-17T10:45:52Z"
    353       },
    354       {
    355         "hn_id": "36349856",
    356         "title": "SqueezeLLM: Dense-and-Sparse Quantization",
    357         "points": 5,
    358         "comments": 1,
    359         "url": "https://news.ycombinator.com/item?id=36349856",
    360         "created_at": "2023-06-16T01:43:39Z"
    361       },
    362       {
    363         "hn_id": "35621735",
    364         "title": "Emergent Abilities of Large Language Models",
    365         "points": 4,
    366         "comments": 1,
    367         "url": "https://news.ycombinator.com/item?id=35621735",
    368         "created_at": "2023-04-18T23:06:51Z"
    369       },
    370       {
    371         "hn_id": "36342137",
    372         "title": "SqueezeLLM: Lossless 3-bit quantization with improved performance",
    373         "points": 4,
    374         "comments": 0,
    375         "url": "https://news.ycombinator.com/item?id=36342137",
    376         "created_at": "2023-06-15T15:43:48Z"
    377       },
    378       {
    379         "hn_id": "35410181",
    380         "title": "Emergent Abilities of Large Language Models",
    381         "points": 3,
    382         "comments": 0,
    383         "url": "https://news.ycombinator.com/item?id=35410181",
    384         "created_at": "2023-04-02T13:16:17Z"
    385       },
    386       {
    387         "hn_id": "34785902",
    388         "title": "Emergent Abilities of Large Language Models",
    389         "points": 2,
    390         "comments": 1,
    391         "url": "https://news.ycombinator.com/item?id=34785902",
    392         "created_at": "2023-02-14T05:48:21Z"
    393       },
    394       {
    395         "hn_id": "40419434",
    396         "title": "Emergent Abilities of Large Language Models",
    397         "points": 2,
    398         "comments": 0,
    399         "url": "https://news.ycombinator.com/item?id=40419434",
    400         "created_at": "2024-05-20T19:46:53Z"
    401       },
    402       {
    403         "hn_id": "47174820",
    404         "title": "Emergent Abilities of Large Language Models (2022)",
    405         "points": 1,
    406         "comments": 0,
    407         "url": "https://news.ycombinator.com/item?id=47174820",
    408         "created_at": "2026-02-27T00:58:33Z"
    409       },
    410       {
    411         "hn_id": "41730269",
    412         "title": "Emergent Abilities of Large Language Models (2022)",
    413         "points": 1,
    414         "comments": 0,
    415         "url": "https://news.ycombinator.com/item?id=41730269",
    416         "created_at": "2024-10-03T12:47:11Z"
    417       }
    418     ],
    419     "top_points": 69,
    420     "total_points": 97,
    421     "total_comments": 20
    422   }
    423 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs