scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31148B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Emergent Abilities of Large Language Models",
      6     "authors": [
      7       "Jason Wei",
      8       "Yi Tay",
      9       "Rishi Bommasani",
     10       "Colin Raffel",
     11       "Barret Zoph",
     12       "Sebastian Borgeaud",
     13       "Dani Yogatama",
     14       "Maarten Bosma",
     15       "Denny Zhou",
     16       "Donald Metzler",
     17       "Ed H. Chi",
     18       "Tatsunori Hashimoto",
     19       "Oriol Vinyals",
     20       "Percy Liang",
     21       "Jeff Dean",
     22       "William Fedus"
     23     ],
     24     "year": 2022,
     25     "venue": "Transactions on Machine Learning Research",
     26     "arxiv_id": "2206.07682",
     27     "doi": null
     28   },
     29   "checklist": {
     30     "claims_and_evidence": {
     31       "abstract_claims_supported": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The abstract claims that scaling leads to unpredictable emergent abilities. The paper supports this with extensive examples across multiple model families (Figures 2-3, Table 1) showing near-random performance until a threshold scale.",
     35         "source": "opus"
     36       },
     37       "causal_claims_justified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper uses causal language ('scaling... lead to', 'scale to unpredictably enable') but the evidence is observational — models that are larger also differ in training data, architecture, and training procedure. Section 5.2 partially acknowledges this ('model scale is not the singular factor') but the overall framing attributes emergence to scale without adequate causal identification.",
     41         "source": "opus"
     42       },
     43       "generalization_bounded": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Section 2 explicitly states 'Our goal in this paper is not to characterize or claim that a specific scale is required to observe emergent abilities.' Section 5.2 notes emergence depends on data quality, architecture, and training, not just scale. Section 5 discusses limitations of scale-only framing.",
     47         "source": "opus"
     48       },
     49       "alternative_explanations_discussed": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 5.1 discusses metric choice as an alternative explanation (exact match hiding gradual improvement). Appendix A provides cross-entropy loss analysis showing underlying gradual improvement. Section 5.2 discusses architecture, data quality, and training objective as alternative explanations for why emergence thresholds vary.",
     53         "source": "opus"
     54       },
     55       "proxy_outcome_distinction": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 5.1 and Appendix A explicitly discuss how downstream metrics (exact match, accuracy) may be a poor proxy for underlying model improvement, showing that cross-entropy loss improves continuously even when downstream metrics appear flat. This is a substantive proxy-outcome distinction.",
     59         "source": "opus"
     60       }
     61     },
     62     "limitations_and_scope": {
     63       "limitations_section_present": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 5 contains substantial discussion of limitations: Section 5.1 on incomplete explanations, Section 5.2 on factors beyond scaling, Section 5.4 on emergent risks. The Broader Impact Statement also acknowledges unpredictability.",
     67         "source": "opus"
     68       },
     69       "threats_to_validity_specific": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 5.1 raises specific threats: evaluation metrics may disguise gradual improvement as emergence. Appendix A provides concrete analysis of this threat. Section 5.2 raises that PaLM 62B shows emergence where larger GPT-3/LaMDA do not, challenging the scale-only narrative.",
     73         "source": "opus"
     74       },
     75       "scope_boundaries_stated": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 2 states: 'Our goal in this paper is not to characterize or claim that a specific scale is required to observe emergent abilities.' The paper explicitly acknowledges limitations of scale as the sole variable (Section 5.2, 5.3) and that some abilities may never emerge (end of Section 5.2).",
     79         "source": "opus"
     80       }
     81     },
     82     "conflicts_of_interest": {
     83       "funding_disclosed": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding acknowledgment section. Authors are from Google Research, DeepMind, Stanford, and UNC Chapel Hill, but no explicit funding statement.",
     87         "source": "opus"
     88       },
     89       "affiliations_disclosed": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Author affiliations are clearly listed: Google Research, Stanford University, UNC Chapel Hill, and DeepMind. These affiliations are relevant since many of the models discussed (PaLM, LaMDA, Gopher, Chinchilla) are from Google/DeepMind.",
     93         "source": "opus"
     94       },
     95       "funder_independent_of_outcome": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "Authors are employed by Google Research and DeepMind, which have a direct commercial interest in demonstrating that scaling up language models yields valuable emergent capabilities. This conflict is not acknowledged.",
     99         "source": "opus"
    100       },
    101       "financial_interests_declared": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "No competing interests or financial disclosure statement is provided. Google and DeepMind employees writing about why scaling up language models produces valuable emergent abilities is a notable undisclosed conflict.",
    105         "source": "opus"
    106       }
    107     },
    108     "scope_and_framing": {
    109       "key_terms_defined": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper provides precise operational definitions in §2: 'An ability is emergent if it is not present in smaller models but is present in larger models,' and defines emergence as 'when quantitative changes in a system result in qualitative changes in behavior.'",
    113         "source": "haiku"
    114       },
    115       "intended_contribution_clear": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The introduction explicitly states the paper surveys emergent abilities as observed in prior work, categorizes them across settings, and motivates future research questions about why they arise and whether more scaling will produce further emergence.",
    119         "source": "haiku"
    120       },
    121       "engagement_with_prior_work": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper extensively engages with prior work — scaling laws (Kaplan 2020, Hoffmann 2022), BIG-Bench, GPT-3, PaLM — showing how each result relates to the emergence framework and identifying what prior work failed to predict.",
    125         "source": "haiku"
    126       }
    127     }
    128   },
    129   "type_checklist": {
    130     "empirical": {
    131       "artifacts": {
    132         "code_released": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No code repository or analysis scripts are mentioned or released. The paper is a survey but could have released code for its BIG-Bench cross-entropy analysis or task classification annotations.",
    136           "source": "opus"
    137         },
    138         "data_released": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "The paper includes full task classification annotations in Appendix E (E.1–E.5), listing all 210+ BIG-Bench tasks categorized as emergent, smoothly increasing, flat, or other. The underlying BIG-Bench benchmark is publicly available.",
    142           "source": "opus"
    143         },
    144         "environment_specified": {
    145           "applies": false,
    146           "answer": false,
    147           "justification": "This is a survey paper that does not run new experiments requiring environment specifications.",
    148           "source": "opus"
    149         },
    150         "reproduction_instructions": {
    151           "applies": false,
    152           "answer": false,
    153           "justification": "This is a survey paper; no new experiments to reproduce.",
    154           "source": "opus"
    155         }
    156       },
    157       "statistical_methodology": {
    158         "confidence_intervals_or_error_bars": {
    159           "applies": false,
    160           "answer": false,
    161           "justification": "Survey paper that reports results from prior work; does not run its own experiments requiring confidence intervals.",
    162           "source": "opus"
    163         },
    164         "significance_tests": {
    165           "applies": false,
    166           "answer": false,
    167           "justification": "Survey paper; no new comparative claims requiring significance tests.",
    168           "source": "opus"
    169         },
    170         "effect_sizes_reported": {
    171           "applies": false,
    172           "answer": false,
    173           "justification": "Survey paper; does not run experiments requiring effect size reporting.",
    174           "source": "opus"
    175         },
    176         "sample_size_justified": {
    177           "applies": false,
    178           "answer": false,
    179           "justification": "Survey paper; no experimental samples.",
    180           "source": "opus"
    181         },
    182         "variance_reported": {
    183           "applies": false,
    184           "answer": false,
    185           "justification": "Survey paper; no own experimental runs to report variance across.",
    186           "source": "opus"
    187         }
    188       },
    189       "evaluation_design": {
    190         "baselines_included": {
    191           "applies": false,
    192           "answer": false,
    193           "justification": "Survey paper; does not propose or evaluate a system that would require baselines.",
    194           "source": "opus"
    195         },
    196         "baselines_contemporary": {
    197           "applies": false,
    198           "answer": false,
    199           "justification": "Survey paper; no system evaluation.",
    200           "source": "opus"
    201         },
    202         "ablation_study": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "Survey paper; no system with components to ablate.",
    206           "source": "opus"
    207         },
    208         "multiple_metrics": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "The paper compares multiple evaluation metrics for the same tasks (Appendix A): exact match, BLEU, ROUGE, BLEURT, cross-entropy loss, and accuracy. Figure 7 and Appendix A.2 explicitly compare how emergence appears under different metrics.",
    212           "source": "opus"
    213         },
    214         "human_evaluation": {
    215           "applies": false,
    216           "answer": false,
    217           "justification": "Survey paper; no system outputs to evaluate with human judges.",
    218           "source": "opus"
    219         },
    220         "held_out_test_set": {
    221           "applies": false,
    222           "answer": false,
    223           "justification": "Survey paper; no experiments requiring held-out test sets.",
    224           "source": "opus"
    225         },
    226         "per_category_breakdown": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Extensive per-category breakdowns provided: Figure 8 breaks down BIG-Bench tasks by keyword tag, Appendix B breaks MMLU into four subject categories (Humanities, STEM, Social Science, Other), and Appendix E categorizes all 210+ tasks.",
    230           "source": "opus"
    231         },
    232         "failure_cases_discussed": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Section 5.2 discusses limitations of scaling. Appendix E.4 lists dozens of 'flat' tasks where no model performs better than random, explicitly identifying where emergence fails. Section 5.1 acknowledges incomplete explanations.",
    236           "source": "opus"
    237         },
    238         "negative_results_reported": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "The paper reports tasks where emergence does not occur (Appendix E.4), notes that GPT-3 fails on WiC even at largest scale (Figure 2H), and discusses how instruction tuning hurts smaller models (Section 4, Figure 3B).",
    242           "source": "opus"
    243         }
    244       },
    245       "setup_transparency": {
    246         "model_versions_specified": {
    247           "applies": false,
    248           "answer": false,
    249           "justification": "Survey paper that does not run its own model experiments. Models discussed are from prior work with citations.",
    250           "source": "opus"
    251         },
    252         "prompts_provided": {
    253           "applies": false,
    254           "answer": false,
    255           "justification": "Survey paper; does not use prompting in its own experiments.",
    256           "source": "opus"
    257         },
    258         "hyperparameters_reported": {
    259           "applies": false,
    260           "answer": false,
    261           "justification": "Survey paper; no own experiments requiring hyperparameter reporting.",
    262           "source": "opus"
    263         },
    264         "scaffolding_described": {
    265           "applies": false,
    266           "answer": false,
    267           "justification": "No agentic scaffolding used.",
    268           "source": "opus"
    269         },
    270         "data_preprocessing_documented": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "The paper does not describe the process for selecting which emergent abilities or papers to include in the survey. The task classification in Appendix A.3 mentions 'two co-authors of the paper worked together and agreed with confidence on all the tasks labeled as emergent,' but the selection criteria for which prior work to survey are not documented.",
    274           "source": "opus"
    275         }
    276       },
    277       "data_integrity": {
    278         "raw_data_available": {
    279           "applies": true,
    280           "answer": true,
    281           "justification": "The underlying BIG-Bench benchmark data is publicly available and the paper references it with URLs. The task annotations are fully listed in Appendix E.",
    282           "source": "opus"
    283         },
    284         "data_collection_described": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "The paper does not describe any systematic process for collecting examples of emergent abilities from the literature. It surveys 'a range of prior work' without describing how those papers were identified or selected.",
    288           "source": "opus"
    289         },
    290         "recruitment_methods_described": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "No human participants; data comes from publicly available benchmarks.",
    294           "source": "opus"
    295         },
    296         "data_pipeline_documented": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "No description of how the survey scope was determined, which papers were included/excluded, or how the examples in Figures 2-3 were selected from the broader literature.",
    300           "source": "opus"
    301         }
    302       },
    303       "contamination": {
    304         "training_cutoff_stated": {
    305           "applies": false,
    306           "answer": false,
    307           "justification": "The paper does not evaluate a pre-trained model's capability on benchmarks itself — it surveys prior work's evaluations.",
    308           "source": "opus"
    309         },
    310         "train_test_overlap_discussed": {
    311           "applies": false,
    312           "answer": false,
    313           "justification": "Survey paper; does not evaluate models on benchmarks directly.",
    314           "source": "opus"
    315         },
    316         "benchmark_contamination_addressed": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "Survey paper; does not evaluate models on benchmarks directly.",
    320           "source": "opus"
    321         }
    322       },
    323       "human_studies": {
    324         "pre_registered": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants.",
    328           "source": "opus"
    329         },
    330         "irb_or_ethics_approval": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants.",
    334           "source": "opus"
    335         },
    336         "demographics_reported": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants.",
    340           "source": "opus"
    341         },
    342         "inclusion_exclusion_criteria": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants.",
    346           "source": "opus"
    347         },
    348         "randomization_described": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants.",
    352           "source": "opus"
    353         },
    354         "blinding_described": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "No human participants.",
    358           "source": "opus"
    359         },
    360         "attrition_reported": {
    361           "applies": false,
    362           "answer": false,
    363           "justification": "No human participants.",
    364           "source": "opus"
    365         }
    366       },
    367       "cost_and_practicality": {
    368         "inference_cost_reported": {
    369           "applies": false,
    370           "answer": false,
    371           "justification": "Survey paper; no own method whose cost needs reporting.",
    372           "source": "opus"
    373         },
    374         "compute_budget_stated": {
    375           "applies": false,
    376           "answer": false,
    377           "justification": "Survey paper; no own experiments requiring compute budget reporting.",
    378           "source": "opus"
    379         }
    380       },
    381       "survey_methodology": {
    382         "prisma_or_structured_protocol": {
    383           "applies": true,
    384           "answer": false,
    385           "justification": "No structured review protocol. The paper does not describe systematic search queries, databases searched, or inclusion/exclusion criteria. Examples appear to be selected ad-hoc from the authors' knowledge of the literature.",
    386           "source": "opus"
    387         },
    388         "quality_assessment_of_sources": {
    389           "applies": true,
    390           "answer": false,
    391           "justification": "The paper does not assess the methodological quality of the source papers it surveys. Results from all cited papers are treated as equally reliable regardless of experimental rigor.",
    392           "source": "opus"
    393         },
    394         "publication_bias_discussed": {
    395           "applies": true,
    396           "answer": false,
    397           "justification": "No discussion of publication bias. Papers showing emergent abilities are more likely to be published than papers showing gradual improvement, but this selection bias is not addressed.",
    398           "source": "opus"
    399         }
    400       }
    401     }
    402   },
    403   "claims": [
    404     {
    405       "claim": "Certain language model abilities are absent in smaller models and appear suddenly at larger scales, constituting 'emergent abilities' that cannot be predicted by extrapolating scaling laws from smaller models.",
    406       "evidence": "Scaling curves across BIG-Bench tasks, MMLU, TruthfulQA, and WiC show near-random performance for all small models followed by sharp jumps at specific compute thresholds across multiple model families.",
    407       "supported": "moderate"
    408     },
    409     {
    410       "claim": "Augmented prompting strategies (chain-of-thought, instruction tuning, scratchpad) are themselves emergent — harmful or neutral at small scales and beneficial only above certain compute thresholds.",
    411       "evidence": "Figure 3 shows chain-of-thought only surpasses standard prompting at 10²³ training FLOPs (~100B parameters) for LaMDA, and instruction tuning hurts performance below 7×10²¹ FLOPs.",
    412       "supported": "moderate"
    413     },
    414     {
    415       "claim": "Emergence is not solely a function of scale — architecture, data quality, and training procedure also determine whether and when abilities emerge.",
    416       "evidence": "PaLM 62B achieves above-random performance on 14 BIG-Bench tasks where the larger LaMDA 137B and GPT-3 175B do not, despite PaLM 62B having fewer parameters and training FLOPs.",
    417       "supported": "weak"
    418     },
    419     {
    420       "claim": "Cross-entropy loss improves continuously with scale even when discrete downstream metrics (exact match, accuracy) show random performance, suggesting gradual improvement masked by metric choice.",
    421       "evidence": "Appendix A shows all six analyzed BIG-Bench tasks fall under 'Outcome 2' — cross-entropy improves for small models even when error rate is near 100%, with an 'elbow' coinciding with the emergence threshold.",
    422       "supported": "strong"
    423     },
    424     {
    425       "claim": "Social risks (bias, toxicity, memorization) may also emerge unpredictably with scale, paralleling capability emergence.",
    426       "evidence": "Section 5.4 surveys prior work showing that memorization increases with scale (Carlini et al.), bias can increase in ambiguous contexts (BBQ), and TruthfulQA performance initially worsens before improving at very large scale.",
    427       "supported": "weak"
    428     },
    429     {
    430       "claim": "There are no clear trends identifying which types of BIG-Bench tasks are most likely to be emergent.",
    431       "evidence": "Appendix A.3 classifies all 210 tasks; the highest-emergence keyword categories (analogical reasoning, word sense disambiguation, truthfulness) do not form a coherent pattern, and arithmetic — with early famous emergence examples — has relatively low overall emergence fraction.",
    432       "supported": "moderate"
    433     }
    434   ],
    435   "methodology_tags": [
    436     "benchmark-eval",
    437     "meta-analysis"
    438   ],
    439   "key_findings": "The paper documents 'emergent abilities' of large language models — capabilities absent in smaller models that appear sharply at specific compute thresholds across diverse benchmarks (BIG-Bench, MMLU, TruthfulQA) and model families (GPT-3, LaMDA, Gopher, Chinchilla, PaLM). Augmented prompting strategies including chain-of-thought, instruction tuning, and scratchpad are also emergent, benefiting only models above strategy-specific compute thresholds (e.g., ~10²³ training FLOPs for chain-of-thought). Cross-entropy analysis reveals that models improve gradually even when discrete metrics show random performance, suggesting that sharp emergence in downstream metrics partly reflects metric artifacts rather than purely discontinuous capability acquisition. The paper raises the open question of whether further scaling will produce additional emergent abilities, and warns that safety-relevant risks may emerge similarly unpredictably.",
    440   "red_flags": [
    441     {
    442       "flag": "Undisclosed COI: in-house model evaluation",
    443       "detail": "The majority of authors are Google Research employees, yet the paper prominently showcases Google's LaMDA and PaLM models; three DeepMind authors evaluate Gopher and Chinchilla. No competing interests statement exists."
    444     },
    445     {
    446       "flag": "Benchmark contamination ignored",
    447       "detail": "BIG-Bench and MMLU were publicly available before or during training of GPT-3, PaLM, and other evaluated models. The paper does not discuss whether training data overlap could explain apparent emergent performance jumps."
    448     },
    449     {
    450       "flag": "Non-causal design for causal claim",
    451       "detail": "The core framing is that scale causes emergence, but evidence is purely correlational across model families that differ in architecture, data, and training recipes. No controlled ablation exists."
    452     },
    453     {
    454       "flag": "Registry misattribution",
    455       "detail": "The registry entry ID 'future-ml-systems-2022' corresponds to a Steinhardt blog post cited in this paper, not to the paper itself (Wei et al. 2022, arXiv:2206.07682). The paper type label 'empirical' is also misleading — the paper's own Broader Impact Statement describes it as a survey of existing literature."
    456     },
    457     {
    458       "flag": "No statistical uncertainty on phase-transition claims",
    459       "detail": "Emergence thresholds are identified visually from scaling curves with no confidence intervals, significance tests, or sensitivity analyses. The number of benchmark examples per task varies widely (as low as 32 for 'Logical arguments') without correction."
    460     },
    461     {
    462       "flag": "Metric artifact explanation unresolved",
    463       "detail": "The paper raises and partially investigates the hypothesis that emergence is an artifact of discrete metrics and shows that cross-entropy improves gradually, but it concludes this is 'at best an incomplete explanation' without resolving the question — leaving the central claim in an ambiguous epistemic state."
    464     }
    465   ],
    466   "cited_papers": [
    467     {
    468       "title": "Beyond the Imitation Game: Quantifying and Extrapolating the Capabilities of Language Models (BIG-Bench)",
    469       "relevance": "Primary benchmark source for emergent ability documentation; the paper surveys and re-analyzes BIG-Bench results across 200+ tasks"
    470     },
    471     {
    472       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    473       "relevance": "Key example of an emergent augmented prompting strategy; shown to only improve over standard prompting above ~10²³ FLOPs"
    474     },
    475     {
    476       "title": "Scaling Laws for Neural Language Models (Kaplan et al. 2020)",
    477       "relevance": "Establishes the scaling law baseline that emergent abilities are defined against — emergence contradicts smooth power-law predictions"
    478     },
    479     {
    480       "title": "Training Compute-Optimal Large Language Models (Chinchilla / Hoffmann et al. 2022)",
    481       "relevance": "Source of Chinchilla scaling curves and the argument that prior work underestimated training data needs; central model family in emergence analysis"
    482     },
    483     {
    484       "title": "Language Models are Few-Shot Learners (GPT-3 / Brown et al. 2020)",
    485       "relevance": "Foundational few-shot prompting paper; GPT-3 is a primary model family in the emergence documentation"
    486     },
    487     {
    488       "title": "Measuring Massive Multitask Language Understanding (MMLU / Hendrycks et al. 2021)",
    489       "relevance": "Key benchmark showing emergent multi-topic knowledge acquisition; used for per-category breakdown analysis in Appendix B"
    490     },
    491     {
    492       "title": "PaLM: Scaling Language Modeling with Pathways (Chowdhery et al. 2022)",
    493       "relevance": "Largest model evaluated; provides key examples of emergence occurring at 540B parameter scale not seen in smaller models"
    494     },
    495     {
    496       "title": "Predictability and Surprise in Large Generative Models (Ganguli et al. 2022)",
    497       "relevance": "Related work on the unpredictability of large model behaviors; frames the problem of predicting when new abilities emerge"
    498     },
    499     {
    500       "title": "On the Opportunities and Risks of Foundation Models (Bommasani et al. 2021)",
    501       "relevance": "Provides the broader framing of emergent risks and sociological shifts that the paper draws on for its risk and sociological sections"
    502     },
    503     {
    504       "title": "Finetuned Language Models are Zero-Shot Learners (FLAN / Wei et al. 2022)",
    505       "relevance": "Source of instruction-following emergence data; shows instruction tuning only benefits models above a scale threshold"
    506     }
    507   ],
    508   "engagement_factors": {
    509     "practical_relevance": {
    510       "score": 2,
    511       "justification": "Shows that qualitative capability jumps can occur as models scale, which is useful for planning compute investments, but cannot predict which specific abilities will emerge or when."
    512     },
    513     "surprise_contrarian": {
    514       "score": 3,
    515       "justification": "Directly challenges the prevailing scaling-law paradigm by showing that performance improvements are not always smooth and predictable — a fundamental surprise for the field."
    516     },
    517     "fear_safety": {
    518       "score": 2,
    519       "justification": "Section 5.4 explicitly argues that safety-relevant risks (toxicity, deception, backdoor vulnerabilities) may also emerge unpredictably with scale, raising concern about unforeseeable harms."
    520     },
    521     "drama_conflict": {
    522       "score": 2,
    523       "justification": "The paper spawned significant subsequent debate (Schaeffer et al. 2023 argued emergence is a metric artifact), making it the center of an ongoing methodological controversy in the field."
    524     },
    525     "demo_ability": {
    526       "score": 1,
    527       "justification": "Describes existing model behaviors on public benchmarks but requires access to very large proprietary models; no live demo or accessible reproduction is possible."
    528     },
    529     "brand_recognition": {
    530       "score": 3,
    531       "justification": "Published by Google Research and DeepMind researchers, showcases PaLM and LaMDA, and has become one of the most-cited papers in the LLM scaling literature."
    532     }
    533   },
    534   "hn_data": {
    535     "threads": [
    536       {
    537         "hn_id": "40689833",
    538         "title": "Survey of Rickrolling in Academic Literature [pdf]",
    539         "points": 69,
    540         "comments": 14,
    541         "url": "https://news.ycombinator.com/item?id=40689833",
    542         "created_at": "2024-06-15T13:54:57Z"
    543       },
    544       {
    545         "hn_id": "37543595",
    546         "title": "Ask HN: Transformer alternatives that could have emergent properties when scaled",
    547         "points": 6,
    548         "comments": 3,
    549         "url": "https://news.ycombinator.com/item?id=37543595",
    550         "created_at": "2023-09-17T10:45:52Z"
    551       },
    552       {
    553         "hn_id": "36349856",
    554         "title": "SqueezeLLM: Dense-and-Sparse Quantization",
    555         "points": 5,
    556         "comments": 1,
    557         "url": "https://news.ycombinator.com/item?id=36349856",
    558         "created_at": "2023-06-16T01:43:39Z"
    559       },
    560       {
    561         "hn_id": "35621735",
    562         "title": "Emergent Abilities of Large Language Models",
    563         "points": 4,
    564         "comments": 1,
    565         "url": "https://news.ycombinator.com/item?id=35621735",
    566         "created_at": "2023-04-18T23:06:51Z"
    567       },
    568       {
    569         "hn_id": "36342137",
    570         "title": "SqueezeLLM: Lossless 3-bit quantization with improved performance",
    571         "points": 4,
    572         "comments": 0,
    573         "url": "https://news.ycombinator.com/item?id=36342137",
    574         "created_at": "2023-06-15T15:43:48Z"
    575       },
    576       {
    577         "hn_id": "35410181",
    578         "title": "Emergent Abilities of Large Language Models",
    579         "points": 3,
    580         "comments": 0,
    581         "url": "https://news.ycombinator.com/item?id=35410181",
    582         "created_at": "2023-04-02T13:16:17Z"
    583       },
    584       {
    585         "hn_id": "34785902",
    586         "title": "Emergent Abilities of Large Language Models",
    587         "points": 2,
    588         "comments": 1,
    589         "url": "https://news.ycombinator.com/item?id=34785902",
    590         "created_at": "2023-02-14T05:48:21Z"
    591       },
    592       {
    593         "hn_id": "40419434",
    594         "title": "Emergent Abilities of Large Language Models",
    595         "points": 2,
    596         "comments": 0,
    597         "url": "https://news.ycombinator.com/item?id=40419434",
    598         "created_at": "2024-05-20T19:46:53Z"
    599       },
    600       {
    601         "hn_id": "47174820",
    602         "title": "Emergent Abilities of Large Language Models (2022)",
    603         "points": 1,
    604         "comments": 0,
    605         "url": "https://news.ycombinator.com/item?id=47174820",
    606         "created_at": "2026-02-27T00:58:33Z"
    607       },
    608       {
    609         "hn_id": "41730269",
    610         "title": "Emergent Abilities of Large Language Models (2022)",
    611         "points": 1,
    612         "comments": 0,
    613         "url": "https://news.ycombinator.com/item?id=41730269",
    614         "created_at": "2024-10-03T12:47:11Z"
    615       }
    616     ],
    617     "top_points": 69,
    618     "total_points": 97,
    619     "total_comments": 20
    620   }
    621 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs