scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23774B)
      1 {
      2   "paper": {
      3     "title": "Emergent Abilities of Large Language Models",
      4     "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani", "Colin Raffel", "Barret Zoph", "Sebastian Borgeaud", "Dani Yogatama", "Maarten Bosma", "Denny Zhou", "Donald Metzler", "Ed H. Chi", "Tatsunori Hashimoto", "Oriol Vinyals", "Percy Liang", "Jeff Dean", "William Fedus"],
      5     "year": 2022,
      6     "venue": "Transactions on Machine Learning Research",
      7     "arxiv_id": "2206.07682",
      8     "doi": "10.48550/arXiv.2206.07682"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No code repository or analysis scripts are released. The paper surveys prior work without providing code."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": false,
     22         "justification": "No dataset of surveyed abilities, classification annotations, or extracted data is released as a downloadable artifact. The BIG-Bench task classifications (Appendix E) are listed in the paper text but not as structured data."
     23       },
     24       "environment_specified": {
     25         "applies": false,
     26         "answer": false,
     27         "justification": "This is a survey paper that does not run experiments requiring an environment specification."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No reproduction instructions are provided. While Appendix E lists task classifications, there are no scripts or procedures to reproduce the analysis or figures."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": false,
     38         "answer": false,
     39         "justification": "This is a survey that aggregates results from prior work. It does not run its own experiments requiring confidence intervals."
     40       },
     41       "significance_tests": {
     42         "applies": false,
     43         "answer": false,
     44         "justification": "The paper does not perform statistical comparisons — it surveys and visualizes results from prior work."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "No original statistical analysis is conducted; the paper plots results from prior work."
     50       },
     51       "sample_size_justified": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "No original experiments with sample sizes. This is a survey."
     55       },
     56       "variance_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "No original experiments. The paper re-plots results from prior work without variance analysis."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper includes random performance baselines on all scaling curve plots (Figures 2-4), allowing readers to see when performance exceeds chance."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The surveyed models (GPT-3, LaMDA, Gopher, Chinchilla, PaLM) were contemporary at time of writing (2022). PaLM was the most recent large model available."
     72       },
     73       "ablation_study": {
     74         "applies": false,
     75         "answer": false,
     76         "justification": "This is a survey paper with no system to ablate."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper examines multiple evaluation metrics: accuracy, exact match, BLEU, cross-entropy loss, ROUGE, BLEURT (Appendix A.2, Figure 7). Section 5.1 and Appendix A explicitly compare downstream metrics vs. cross-entropy loss."
     82       },
     83       "human_evaluation": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "Human evaluation is irrelevant for a survey cataloging emergent abilities from prior work."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "No original experiments requiring train/test splits."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Appendix A.3 and Figure 8 break down BIG-Bench tasks by keyword tags. Appendix B and Figure 10 break down MMLU by four categories (Humanities, STEM, Social Science, Other)."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Section 5.2 discusses limitations of scaling. Appendix E.4 lists 44 'flat' tasks where no model achieves above-random performance. Figure 2H shows WiC as a task where GPT-3 and Chinchilla fail."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The paper reports that instruction-finetuning hurts performance at small scale (Figure 3B), that GPT-3 fails on WiC even at 175B (Figure 2H), and lists many tasks where models fail (Appendix E.4)."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The abstract claims that emergent abilities exist and cannot be predicted by extrapolating smaller models. The paper provides extensive evidence through scaling curves (Figures 2-4) showing near-random performance followed by sharp jumps."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper uses causal language like 'scaling up language models... can lead to better performance' and 'scale to unpredictably enable new techniques' (§5). However, it acknowledges in §5.1 that 'there are currently few compelling explanations for why such abilities emerge' and that scale covaries with data quality, architecture, etc. The causal mechanism is not established."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Section 2 explicitly notes 'the scale at which an ability is first observed to emerge depends on a number of factors and is not an immutable property of the ability.' Section 5.2 discusses how emergence can occur at smaller scales with better data/architecture. The paper is careful not to claim specific scale thresholds are universal."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Section 5.1 discusses alternative explanations: evaluation metrics masking gradual improvement, model depth requirements, and memorization. Appendix A provides cross-entropy loss analysis showing improvements are real but masked by downstream metrics. Section 5.2 discusses architecture and data quality as alternative factors."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 5.1 explicitly discusses how downstream metrics (exact match, accuracy) may be proxies that mask gradual improvement visible in cross-entropy loss. Appendix A provides detailed analysis of this proxy gap across six tasks."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Table 2 (Appendix C) provides detailed specifications for all models: exact parameter counts, training tokens, and training FLOPs for GPT-3, LaMDA, Gopher, Chinchilla, PaLM, and Anthropic LM families."
    141       },
    142       "prompts_provided": {
    143         "applies": false,
    144         "answer": false,
    145         "justification": "The paper does not run its own prompting experiments — it surveys results from prior work. It shows a generic example in Figure 1 but this is illustrative."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "No original experiments are conducted. The paper surveys results from prior work."
    151       },
    152       "scaffolding_described": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "No agentic scaffolding is used."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The paper does not describe how the set of emergent abilities was collected or what criteria determined inclusion. Appendix A.3 mentions 'two co-authors worked together and agreed with confidence on all the tasks labeled as emergent' but provides no formal criteria for the classification."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 5 contains extensive discussion of limitations including §5.1 (potential explanations/metric artifacts), §5.2 (beyond scaling — emergence not solely about scale), §5.4 (emergent risks), and the Broader Impact Statement."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 5.1 specifically discusses that evaluation metrics may create an illusion of emergence by not giving partial credit: 'using exact string match as the evaluation metric for long-sequence targets may disguise compounding incremental improvements as emergence.' Appendix A.3 notes the subjectivity of the emergence classification."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Section 2 states: 'Our goal in this paper is not to characterize or claim that a specific scale is required to observe emergent abilities, but rather, we aim to discuss examples of emergent behavior in prior work.' Footnote 1 limits scope to pre-trained Transformer language models."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw data is released. The scaling curve data points, BIG-Bench task classifications, and cross-entropy loss analyses are not available for independent verification."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper does not describe how the set of emergent abilities was systematically identified. It surveys 'a range of prior work' without specifying search criteria or completeness."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data sources are standard benchmarks and published model evaluations."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The pipeline from identifying candidate emergent abilities to the final set presented is not documented. Appendix A.3 mentions co-author agreement on BIG-Bench task classification but provides no formal protocol."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding statement or acknowledgment of funding sources. The Acknowledgments section thanks individuals but does not mention grants or funding."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Author affiliations are clearly listed: Google Research, Stanford University, UNC Chapel Hill, DeepMind. These are the companies/institutions that built the models being surveyed."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Authors are primarily from Google Research and DeepMind — companies that build the largest language models discussed. A finding that 'emergence exists and more scaling may unlock more abilities' directly supports their business interests in continued scaling. This conflict is not acknowledged."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present. Authors from Google and DeepMind have direct financial interest in demonstrating value of scaling language models."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "The paper does not evaluate a model's capability on benchmarks — it surveys prior evaluations. Contamination analysis is the responsibility of the original papers."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "The paper does not evaluate a model's capability on benchmarks — it surveys prior evaluations. Train/test overlap analysis is the responsibility of the original papers."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
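    238         "justification": "The paper does not evaluate a model's capability on benchmarks — it surveys prior evaluations. Addressing benchmark contamination is the responsibility of the original papers."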
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "Survey paper — no method with associated costs."
    283       },
    284       "compute_budget_stated": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "Survey paper — no original computation performed beyond plotting."
    288       }
    289     },
    290     "survey_methodology": {
    291       "prisma_or_structured_protocol": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No structured review protocol. The paper collects examples of emergence from prior work without a systematic search strategy, reproducible queries, or PRISMA-style flow diagram. The selection appears ad hoc."
    295       },
    296       "quality_assessment_of_sources": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "The paper does not assess the quality of the source evaluations it surveys. It treats all cited benchmark results equally regardless of methodological rigor. Section 5.1 briefly questions whether evaluation metrics create an illusion of emergence, but does not assess whether the underlying evaluations were well-conducted."
    300       },
    301       "publication_bias_discussed": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No discussion of publication bias. The surveyed abilities come overwhelmingly from papers by a handful of industry labs — chiefly the authors' own organizations (Google, DeepMind), plus Anthropic. There is no consideration of whether negative results (abilities that did NOT emerge) are underreported."
    305       }
    306     }
    307   },
    308   "claims": [
    309     {
    310       "claim": "Certain abilities of large language models are emergent — not present in smaller models but appearing in larger models, with performance near-random until a critical threshold of scale.",
    311       "evidence": "Figures 2-4 show scaling curves for 8+ tasks across 5 model families (GPT-3, LaMDA, Gopher, Chinchilla, PaLM) where performance jumps from near-random to well above random at specific scale thresholds. Table 1 catalogs ~25 emergent abilities with specific emergence scales.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "Augmented prompting strategies (chain-of-thought, instruction following, scratchpad) are also emergent in that they only help at sufficient scale.",
    316       "evidence": "Figure 3 shows four examples: chain-of-thought on GSM8K only helps at ~10^23 FLOPs (§4), instruction tuning hurts below ~7×10^21 FLOPs, scratchpad for 8-digit addition only helps above ~9×10^19 FLOPs, and P(True) calibration only emerges at ~3×10^23 FLOPs.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "Cross-entropy loss improves continuously even when downstream metrics show no improvement, suggesting emergence may be partly a measurement artifact.",
    321       "evidence": "Appendix A.1, Figures 5-6 show that for six BIG-Bench tasks, cross-entropy loss decreases monotonically while downstream metrics (EM, BLEU, accuracy) remain at random for small models.",
    322       "supported": "strong"
    323     },
    324     {
    325       "claim": "Scale is not the only factor for emergence — better data, architectures, and training can unlock abilities at smaller scale.",
    326       "evidence": "Section 5.2 notes PaLM 62B achieves above-random on 14 BIG-Bench tasks where LaMDA 137B and GPT-3 175B do not. Sanh et al. induced instruction-following in an 11B model. InstructGPT 1.3B outperformed much larger models.",
    327       "supported": "strong"
    328     }
    329   ],
    330   "methodology_tags": ["meta-analysis"],
    331   "key_findings": "The paper catalogs dozens of emergent abilities across language model families where performance is near-random until a specific scale threshold, then jumps sharply. These include both few-shot prompting tasks (arithmetic, TruthfulQA, MMLU) and augmented prompting strategies (chain-of-thought, instruction following). Cross-entropy loss analysis reveals that models improve continuously even when downstream metrics appear flat, suggesting emergence may partly reflect metric choice rather than a true phase transition. The paper notes that scale is not the sole factor — better architectures and data can lower the emergence threshold.",
    332   "red_flags": [
    333     {
    334       "flag": "Conflict of interest",
    335       "detail": "Authors are primarily from Google Research and DeepMind — the organizations that build and sell the largest language models. The paper's central thesis (scaling unlocks unpredictable new abilities) directly supports continued investment in scaling, which benefits these organizations. This conflict is not disclosed or discussed."
    336     },
    337     {
    338       "flag": "No systematic review methodology",
    339       "detail": "The set of emergent abilities surveyed appears to be an ad hoc collection from papers the authors were familiar with, mostly from their own organizations. There is no systematic search, no inclusion/exclusion criteria, and no PRISMA-style protocol. This makes the survey susceptible to selection bias."
    340     },
    341     {
    342       "flag": "Subjective classification of emergence",
    343       "detail": "Appendix A.3 acknowledges the task classification is 'potentially subjective' and relies on two co-authors agreeing. No formal criteria for what constitutes 'near-random' or 'substantially above random' are provided. Different thresholds could substantially change which abilities are classified as emergent."
    344     },
    345     {
    346       "flag": "Subsequent work challenges core claims",
    347       "detail": "Schaeffer et al. (2023, 'Are Emergent Abilities of Large Language Models a Mirage?') showed that emergence disappears when using continuous metrics instead of discontinuous ones, suggesting the phenomenon is largely a measurement artifact — consistent with this paper's own Appendix A findings about cross-entropy loss."
    348     }
    349   ],
    350   "cited_papers": [
    351     {
    352       "title": "Language models are few-shot learners",
    353       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    354       "year": 2020,
    355       "relevance": "GPT-3 paper that popularized few-shot prompting and documented first emergence examples."
    356     },
    357     {
    358       "title": "Scaling laws for neural language models",
    359       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    360       "year": 2020,
    361       "arxiv_id": "2001.08361",
    362       "relevance": "Foundational scaling laws work showing predictable performance improvement, against which emergence is contrasted."
    363     },
    364     {
    365       "title": "Training compute-optimal large language models",
    366       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    367       "year": 2022,
    368       "arxiv_id": "2203.15556",
    369       "relevance": "Chinchilla paper on compute-optimal training, relevant to understanding scaling and efficiency."
    370     },
    371     {
    372       "title": "PaLM: Scaling language modeling with Pathways",
    373       "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
    374       "year": 2022,
    375       "arxiv_id": "2204.02311",
    376       "relevance": "PaLM paper demonstrating emergence at 540B scale on tasks where smaller models fail."
    377     },
    378     {
    379       "title": "Chain of thought prompting elicits reasoning in large language models",
    380       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    381       "year": 2022,
    382       "arxiv_id": "2201.11903",
    383       "relevance": "Introduced chain-of-thought prompting as an emergent augmented prompting strategy."
    384     },
    385     {
    386       "title": "Beyond the imitation game: Measuring and extrapolating the capabilities of language models",
    387       "authors": ["BIG-Bench"],
    388       "year": 2022,
    389       "arxiv_id": "2206.04615",
    390       "relevance": "BIG-Bench benchmark suite providing 200+ tasks used to identify emergent abilities."
    391     },
    392     {
    393       "title": "On the opportunities and risks of foundation models",
    394       "authors": ["Rishi Bommasani", "Drew A. Hudson"],
    395       "year": 2021,
    396       "arxiv_id": "2108.07258",
    397       "relevance": "Foundation models survey discussing emergence as a key phenomenon and associated risks."
    398     },
    399     {
    400       "title": "Training language models to follow instructions with human feedback",
    401       "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"],
    402       "year": 2022,
    403       "arxiv_id": "2203.02155",
    404       "relevance": "InstructGPT showing RLHF can unlock instruction-following in smaller models, relevant to emergence beyond scale."
    405     },
    406     {
    407       "title": "Measuring massive multitask language understanding",
    408       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    409       "year": 2021,
    410       "relevance": "MMLU benchmark used as key example of emergent multi-task language understanding."
    411     },
    412     {
    413       "title": "TruthfulQA: Measuring how models mimic human falsehoods",
    414       "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"],
    415       "year": 2021,
    416       "arxiv_id": "2109.07958",
    417       "relevance": "Benchmark for truthfulness showing emergent performance and also emergent risks (mimicking falsehoods)."
    418     },
    419     {
    420       "title": "Predictability and surprise in large generative models",
    421       "authors": ["Deep Ganguli", "Danny Hernandez", "Liane Lovitt"],
    422       "year": 2022,
    423       "arxiv_id": "2202.07785",
    424       "relevance": "Discusses unpredictability of downstream task performance despite predictable scaling of loss, directly relevant to emergence."
    425     },
    426     {
    427       "title": "Extracting training data from large language models",
    428       "authors": ["Nicholas Carlini", "Florian Tramer", "Eric Wallace"],
    429       "year": 2021,
    430       "relevance": "Shows larger models memorize more training data — an emergent risk that scales with model size."
    431     }
    432   ]
    433 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs