scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30582B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Emergent Abilities of Large Language Models",
      6     "authors": [
      7       "Jason Wei",
      8       "Yi Tay",
      9       "Rishi Bommasani",
     10       "Colin Raffel",
     11       "Barret Zoph",
     12       "Sebastian Borgeaud",
     13       "Dani Yogatama",
     14       "Maarten Bosma",
     15       "Denny Zhou",
     16       "Donald Metzler",
     17       "Ed H. Chi",
     18       "Tatsunori Hashimoto",
     19       "Oriol Vinyals",
     20       "Percy Liang",
     21       "Jeff Dean",
     22       "William Fedus"
     23     ],
     24     "year": 2022,
     25     "venue": "Transactions on Machine Learning Research",
     26     "arxiv_id": "2206.07682",
     27     "doi": null
     28   },
     29   "checklist": {
     30     "claims_and_evidence": {
     31       "abstract_claims_supported": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The abstract claims emergent abilities cannot be predicted by extrapolating smaller-model performance; the paper supports this with 20+ documented examples across five model families showing near-random to above-random performance phase transitions.",
     35         "source": "haiku"
     36       },
     37       "causal_claims_justified": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper makes causal-sounding claims that scale 'enables' or 'unlocks' emergent abilities, but the study design is entirely observational (aggregating published results); no controlled experiments isolate scale from confounders such as training data quality or architecture.",
     41         "source": "haiku"
     42       },
     43       "generalization_bounded": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper explicitly scopes to 'pre-trained Transformer language models' (footnote 1), acknowledges emergence thresholds vary by factors beyond compute, and Section 5.2 discusses how scale is not the only factor.",
     47         "source": "haiku"
     48       },
     49       "alternative_explanations_discussed": {
     50         "applies": true,
     51         "answer": true,
     52         "justification": "Section 5.1 discusses metric artifact explanations (exact string match masking incremental gains), depth requirements for multi-step reasoning, and Appendix A provides cross-entropy analysis showing continuous sub-threshold improvement.",
     53         "source": "haiku"
     54       },
     55       "proxy_outcome_distinction": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper explicitly addresses that discrete downstream metrics like exact match may disguise incremental improvements, and Appendix A demonstrates that cross-entropy loss improves continuously even when accuracy metrics appear at-random.",
     59         "source": "haiku"
     60       }
     61     },
     62     "limitations_and_scope": {
     63       "limitations_section_present": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "There is no dedicated limitations or threats-to-validity section; Section 5 is a discussion of explanations and future directions, and the Broader Impact Statement is only two sentences.",
     67         "source": "haiku"
     68       },
     69       "threats_to_validity_specific": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section 5.1 specifically discusses how metric choice (exact match vs. cross-entropy) can produce apparent emergence from smooth underlying improvements, and notes that the BIG-Bench task classification involved subjective judgment by two co-authors.",
     73         "source": "haiku"
     74       },
     75       "scope_boundaries_stated": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "While the paper focuses on pre-trained Transformers (footnote 1), it does not explicitly bound what the results do NOT show — for instance, it does not clarify that emergence may be a metric artifact for most tasks, or that results apply only to dense models trained at frontier compute.",
     79         "source": "haiku"
     80       }
     81     },
     82     "conflicts_of_interest": {
     83       "funding_disclosed": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No funding source is disclosed anywhere; the Acknowledgments section thanks individuals for feedback but contains no funding statement.",
     87         "source": "haiku"
     88       },
     89       "affiliations_disclosed": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Author affiliations are clearly listed on the title page: Google Research (9 authors), Stanford University (3 authors), UNC Chapel Hill (1 author), and DeepMind (3 authors).",
     93         "source": "haiku"
     94       },
     95       "funder_independent_of_outcome": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "12 of 16 authors are from Google Research or DeepMind, the same organizations that trained LaMDA, PaLM, Gopher, and Chinchilla — the models whose impressive emergent abilities are being documented and celebrated in the paper.",
     99         "source": "haiku"
    100       },
    101       "financial_interests_declared": {
    102         "applies": true,
    103         "answer": false,
    104         "justification": "There is no competing interests or financial disclosure statement anywhere in the paper.",
    105         "source": "haiku"
    106       }
    107     },
    108     "scope_and_framing": {
    109       "key_terms_defined": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 provides a precise operational definition: 'An ability is emergent if it is not present in smaller models but is present in larger models,' with explicit operationalization via scaling curves showing near-random to above-random performance jumps.",
    113         "source": "haiku"
    114       },
    115       "intended_contribution_clear": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The introduction clearly states the paper will 'discuss emergent abilities as observed in a range of prior work' and categorize them in few-shot prompting and augmented prompting settings.",
    119         "source": "haiku"
    120       },
    121       "engagement_with_prior_work": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper extensively synthesizes results from GPT-3, FLAN, PaLM, Gopher, Chinchilla, and BIG-Bench, explicitly situating emergent abilities against established scaling laws from Kaplan et al. 2020 and Hoffmann et al. 2022.",
    125         "source": "haiku"
    126       }
    127     }
    128   },
    129   "type_checklist": {
    130     "empirical": {
    131       "artifacts": {
    132         "code_released": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "No code is released; the paper surveys published results from prior work and provides no analysis scripts for the BIG-Bench task classifications or cross-entropy re-analyses in the appendices.",
    136           "source": "haiku"
    137         },
    138         "data_released": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "The paper uses BIG-Bench, MMLU, TruthfulQA, and other publicly available benchmarks unmodified; these are standard public benchmarks accessible to the research community.",
    142           "source": "haiku"
    143         },
    144         "environment_specified": {
    145           "applies": false,
    146           "answer": false,
    147           "justification": "This is a survey paper re-analyzing published results from prior work; no new experimental environment or software dependencies are required.",
    148           "source": "haiku"
    149         },
    150         "reproduction_instructions": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "No reproduction instructions are provided; the paper does not describe how to replicate the scaling curve analyses, the BIG-Bench task classifications, or the cross-entropy re-evaluations.",
    154           "source": "haiku"
    155         }
    156       },
    157       "statistical_methodology": {
    158         "confidence_intervals_or_error_bars": {
    159           "applies": true,
    160           "answer": false,
    161           "justification": "The scaling curves in Figures 2-4 show individual model points without any confidence intervals or error bars; variance across evaluation seeds is not reported.",
    162           "source": "haiku"
    163         },
    164         "significance_tests": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "No statistical significance tests are applied; emergence is identified by visual inspection of scaling curves rather than formal statistical criteria for detecting phase transitions.",
    168           "source": "haiku"
    169         },
    170         "effect_sizes_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "The paper reports absolute performance metrics (accuracy %, BLEU %) at different model scales with random baseline comparisons throughout, allowing readers to assess the magnitude of emergent gains.",
    174           "source": "haiku"
    175         },
    176         "sample_size_justified": {
    177           "applies": true,
    178           "answer": false,
    179           "justification": "Benchmark sample sizes come from prior work and are not justified by the authors; no power analysis is conducted, and the paper notes that Logical Arguments has only 32 samples, which 'may contribute to noise.'",
    180           "source": "haiku"
    181         },
    182         "variance_reported": {
    183           "applies": true,
    184           "answer": false,
    185           "justification": "Scaling curves show single evaluation runs per model without standard deviation or variance across evaluation seeds, prompting variations, or runs.",
    186           "source": "haiku"
    187         }
    188       },
    189       "evaluation_design": {
    190         "baselines_included": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Random performance baselines are consistently included as dashed lines in all scaling curve figures (Figures 2-4, 11-12), and prior state-of-the-art from fine-tuned task-specific models is shown in Figure 13.",
    194           "source": "haiku"
    195         },
    196         "baselines_contemporary": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "The paper compares across contemporary frontier models (GPT-3, LaMDA, Gopher, Chinchilla, PaLM) covering the full range of compute scales available at the time of publication.",
    200           "source": "haiku"
    201         },
    202         "ablation_study": {
    203           "applies": false,
    204           "answer": false,
    205           "justification": "This is a survey paper with no novel system components to ablate; N/A.",
    206           "source": "haiku"
    207         },
    208         "multiple_metrics": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Appendix A.2 explicitly tests emergence across multiple metrics (Exact Match, BLEU, ROUGE-1/2/L, BLEURT, Sequence F1) for generative tasks, showing the emergence pattern appears across all of them.",
    212           "source": "haiku"
    213         },
    214         "human_evaluation": {
    215           "applies": false,
    216           "answer": false,
    217           "justification": "No human evaluation of system outputs is conducted; all evaluation is on automated benchmarks.",
    218           "source": "haiku"
    219         },
    220         "held_out_test_set": {
    221           "applies": false,
    222           "answer": false,
    223           "justification": "This is not a prediction task in the ML training sense; the paper analyzes model performance on existing benchmark test sets from prior work.",
    224           "source": "haiku"
    225         },
    226         "per_category_breakdown": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Figure 8 provides per-keyword breakdown of emergent vs. non-emergent BIG-Bench tasks across 33 keyword categories, and Figure 10 breaks MMLU performance into four academic supercategories.",
    230           "source": "haiku"
    231         },
    232         "failure_cases_discussed": {
    233           "applies": true,
    234           "answer": true,
    235           "justification": "Appendix E.4 catalogs 45 BIG-Bench tasks where no model achieves above-random performance, and the WiC benchmark case (Figure 2H) is explicitly discussed as a failure for GPT-3 and Chinchilla even at their largest sizes.",
    236           "source": "haiku"
    237         },
    238         "negative_results_reported": {
    239           "applies": true,
    240           "answer": true,
    241           "justification": "The paper explicitly reports cases where scaling fails to unlock emergence (flat scaling curves in Appendix E.4), and discusses prompting techniques that hurt performance when applied to models below the scale threshold (Figure 3B).",
    242           "source": "haiku"
    243         }
    244       },
    245       "setup_transparency": {
    246         "model_versions_specified": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Table 2 provides comprehensive parameter counts, training tokens, and training FLOPs for all model families including GPT-3, LaMDA, Gopher, Chinchilla, PaLM, and Anthropic LM.",
    250           "source": "haiku"
    251         },
    252         "prompts_provided": {
    253           "applies": true,
    254           "answer": false,
    255           "justification": "The paper describes prompting setups (2-shot, few-shot) but does not provide the actual prompts used in evaluations; Figure 1 shows only a generic illustrative sentiment example, not the actual benchmark prompts.",
    256           "source": "haiku"
    257         },
    258         "hyperparameters_reported": {
    259           "applies": true,
    260           "answer": false,
    261           "justification": "Inference hyperparameters (temperature, top-p, number of samples) are not reported for most results; Figure 5 mentions T=0 and T=1 for one appendix analysis but systematic reporting of decoding parameters is absent.",
    262           "source": "haiku"
    263         },
    264         "scaffolding_described": {
    265           "applies": false,
    266           "answer": false,
    267           "justification": "This is not an agentic paper; no scaffolding is involved.",
    268           "source": "haiku"
    269         },
    270         "data_preprocessing_documented": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "The cross-entropy re-analyses in Appendix A use BIG-Bench data but only state they 'follow the same experimental setup from BIG-Bench (2022)' without documenting specific preprocessing or extraction steps.",
    274           "source": "haiku"
    275         }
    276       },
    277       "data_integrity": {
    278         "raw_data_available": {
    279           "applies": true,
    280           "answer": false,
    281           "justification": "Scaling curve data points are drawn from prior publications; no raw evaluation logs, digitized data files, or original scoring outputs are released with this paper.",
    282           "source": "haiku"
    283         },
    284         "data_collection_described": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "The paper states results come from prior publications but does not describe how scaling curve data points were extracted or verified; the BIG-Bench task annotation methodology (Appendix E) is described only as two co-authors 'working together and agreeing with confidence.'",
    288           "source": "haiku"
    289         },
    290         "recruitment_methods_described": {
    291           "applies": false,
    292           "answer": false,
    293           "justification": "No human participants; N/A.",
    294           "source": "haiku"
    295         },
    296         "data_pipeline_documented": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "No data pipeline is documented; performance numbers are stated as coming from cited papers without a systematic protocol for extraction, normalization, or verification across heterogeneous sources.",
    300           "source": "haiku"
    301         }
    302       },
    303       "contamination": {
    304         "training_cutoff_stated": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "The paper evaluates model capabilities on many benchmarks but never states training data cutoffs for any of the evaluated models, despite this being highly relevant to whether benchmark examples could have appeared in training corpora.",
    308           "source": "haiku"
    309         },
    310         "train_test_overlap_discussed": {
    311           "applies": true,
    312           "answer": false,
    313           "justification": "Potential training data contamination of evaluated benchmarks is never mentioned anywhere in the paper; there is no discussion of whether BIG-Bench, MMLU, or TruthfulQA examples appeared in model training data.",
    314           "source": "haiku"
    315         },
    316         "benchmark_contamination_addressed": {
    317           "applies": true,
    318           "answer": false,
    319           "justification": "MMLU and TruthfulQA were publicly available before training of Gopher, Chinchilla, and PaLM; benchmark contamination that could partially explain emergent performance jumps is never addressed.",
    320           "source": "haiku"
    321         }
    322       },
    323       "human_studies": {
    324         "pre_registered": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants; N/A.",
    328           "source": "haiku"
    329         },
    330         "irb_or_ethics_approval": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants; N/A.",
    334           "source": "haiku"
    335         },
    336         "demographics_reported": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants; N/A.",
    340           "source": "haiku"
    341         },
    342         "inclusion_exclusion_criteria": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants; N/A.",
    346           "source": "haiku"
    347         },
    348         "randomization_described": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants; N/A.",
    352           "source": "haiku"
    353         },
    354         "blinding_described": {
    355           "applies": false,
    356           "answer": false,
    357           "justification": "No human participants; N/A.",
    358           "source": "haiku"
    359         },
    360         "attrition_reported": {
    361           "applies": false,
    362           "answer": false,
    363           "justification": "No human participants; N/A.",
    364           "source": "haiku"
    365         }
    366       },
    367       "cost_and_practicality": {
    368         "inference_cost_reported": {
    369           "applies": false,
    370           "answer": false,
    371           "justification": "This is a survey paper analyzing published results; inference cost is not a contribution being evaluated.",
    372           "source": "haiku"
    373         },
    374         "compute_budget_stated": {
    375           "applies": true,
    376           "answer": true,
    377           "justification": "Table 2 comprehensively lists training FLOPs, parameter counts, and training tokens for all evaluated model families (GPT-3, LaMDA, Gopher, Chinchilla, PaLM, Anthropic LM) spanning 2.1M to 540B parameters.",
    378           "source": "haiku"
    379         }
    380       }
    381     }
    382   },
    383   "claims": [
    384     {
    385       "claim": "Certain abilities emerge sharply above a critical scale threshold — going from near-random to well-above-random — and cannot be predicted by extrapolating performance from smaller models",
    386       "evidence": "Figures 2-3 show 12 examples across 5 model families where performance is at chance for many orders of magnitude of compute before jumping sharply; Table 1 catalogs 23 such abilities",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Chain-of-thought prompting only surpasses standard prompting at scales ≥~100B parameters (~10^23 training FLOPs)",
    391       "evidence": "Figure 3A shows CoT prompting hurts or is neutral for LaMDA models below this threshold and only helps above it on GSM8K math word problems",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "Apparent emergence is not merely an artifact of discrete metrics: cross-entropy loss improves continuously even at small scales where accuracy/BLEU appear near-random",
    396       "evidence": "Appendix A (Figures 5-6) shows monotonically improving cross-entropy for all 6 tested BIG-Bench tasks at small scales where downstream discrete metrics remain at chance",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Scale is not the only factor for emergence: PaLM 62B shows emergent performance on 14 BIG-Bench tasks where GPT-3 175B and LaMDA 137B do not, despite having fewer parameters",
    401       "evidence": "Appendix F lists the 14 tasks; Section 5.2 attributes this to training data quality and architecture, but no controlled ablation isolates these factors",
    402       "supported": "weak"
    403     },
    404     {
    405       "claim": "Social Science and Humanities tasks show larger performance jumps from second-largest to largest model than STEM tasks on the MMLU benchmark",
    406       "evidence": "Figure 10 shows per-category MMLU scaling curves for Gopher and Chinchilla; Figure 9 plots the performance gap explicitly, with STEM showing the smallest jump",
    407       "supported": "moderate"
    408     },
    409     {
    410       "claim": "Emergent risks such as toxicity amplification and training data memorization may also increase with model scale",
    411       "evidence": "Section 5.4 cites Gehman et al. and Carlini et al. showing larger models produce more toxic outputs and memorize more training data, with mitigation strategies noted",
    412       "supported": "moderate"
    413     }
    414   ],
    415   "methodology_tags": [
    416     "benchmark-eval",
    417     "meta-analysis",
    418     "observational"
    419   ],
    420   "key_findings": "The paper surveys and documents 'emergent abilities' — capabilities appearing suddenly above critical scale thresholds — across 20+ tasks in five language model families, contrasting with the smooth predictable scaling laws established for pre-training loss. Both few-shot prompting abilities (arithmetic, MMLU, TruthfulQA) and augmented prompting techniques (chain-of-thought, instruction following, calibration) exhibit this phase-transition pattern with performance near chance until a critical scale. A cross-entropy re-analysis in the appendix reveals that small models do improve continuously on these tasks but that discrete downstream metrics mask this progress, partially explaining the apparent discontinuity without fully resolving it. The paper highlights understanding and predicting emergence as a critical open problem, while also noting that emergent risks (toxicity, bias, memorization) may accompany emergent capabilities.",
    421   "red_flags": [
    422     {
    423       "flag": "Unaddressed benchmark contamination",
    424       "detail": "MMLU, TruthfulQA, and many BIG-Bench tasks were publicly available before the training of Gopher, Chinchilla, and PaLM; the paper never discusses whether benchmark examples in model training corpora could partially explain emergent performance jumps."
    425     },
    426     {
    427       "flag": "Organizational self-evaluation without disclosure",
    428       "detail": "12 of 16 authors are from Google Research or DeepMind, the same organizations that trained LaMDA, PaLM, Gopher, and Chinchilla; no conflict of interest statement is provided despite documenting these models' impressive capabilities."
    429     },
    430     {
    431       "flag": "No statistical criteria for emergence identification",
    432       "detail": "Emergence is identified by visual inspection of scaling curves with no formal statistical criteria, significance tests, or confidence intervals; the paper's own cross-entropy analysis shows performance improvements are actually continuous, not truly discontinuous."
    433     },
    434     {
    435       "flag": "Metric artifact partially acknowledged but unresolved",
    436       "detail": "Section 5.1 acknowledges that exact-match scoring on multi-step problems may create apparent phase transitions from smooth underlying improvements, but concludes this is 'at best an incomplete explanation' without providing a quantitative resolution or alternative analysis for most tasks."
    437     },
    438     {
    439       "flag": "Non-reproducible task classification",
    440       "detail": "The BIG-Bench task classifications in Appendix E (labeling all 210 tasks as emergent/smooth/flat/other) were performed by two co-authors by subjective consensus with no formalized criteria, no inter-rater agreement score, and no released annotation protocol."
    441     }
    442   ],
    443   "cited_papers": [
    444     {
    445       "title": "Beyond the Imitation Game: Measuring and Extrapolating the Capabilities of Language Models (BIG-Bench)",
    446       "relevance": "Primary benchmark used throughout for documenting emergent tasks; provides the 200+ task suite and scaling data for LaMDA, GPT-3, and PaLM"
    447     },
    448     {
    449       "title": "Language Models are Few-Shot Learners (GPT-3, Brown et al. 2020)",
    450       "relevance": "Establishes the few-shot prompting paradigm and provides foundational scaling data; starting point for emergence observations"
    451     },
    452     {
    453       "title": "Scaling Laws for Neural Language Models (Kaplan et al. 2020)",
    454       "relevance": "Establishes smooth scaling laws that emergent abilities directly contrast with"
    455     },
    456     {
    457       "title": "Training Compute-Optimal Large Language Models (Chinchilla, Hoffmann et al. 2022)",
    458       "relevance": "Provides key Chinchilla scaling data and challenges assumptions about optimal compute-parameter tradeoffs"
    459     },
    460     {
    461       "title": "PaLM: Scaling Language Modeling with Pathways (Chowdhery et al. 2022)",
    462       "relevance": "Key model showing emergent abilities at extreme scale (540B); PaLM unlocks Word-in-Context and other tasks where GPT-3 fails"
    463     },
    464     {
    465       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models (Wei et al. 2022)",
    466       "relevance": "Documents emergent benefit of chain-of-thought prompting above ~100B parameters"
    467     },
    468     {
    469       "title": "Finetuned Language Models Are Zero-Shot Learners / FLAN (Wei et al. 2022)",
    470       "relevance": "Documents emergent instruction-following behavior only effective at large fine-tuning scales"
    471     },
    472     {
    473       "title": "Measuring Massive Multitask Language Understanding / MMLU (Hendrycks et al. 2021)",
    474       "relevance": "Benchmark showing emergent multi-topic knowledge above ~70B parameters; used for per-category breakdown analysis"
    475     },
    476     {
    477       "title": "On the Opportunities and Risks of Foundation Models (Bommasani et al. 2021)",
    478       "relevance": "Broader context for emergent abilities and risks; provides framing for sociological shifts discussion"
    479     },
    480     {
    481       "title": "Scaling Language Models: Methods, Analysis and Insights from Training Gopher (Rae et al. 2021)",
    482       "relevance": "Provides extensive Gopher scaling data including TruthfulQA emergence results at 280B parameters"
    483     }
    484   ],
    485   "engagement_factors": {
    486     "practical_relevance": {
    487       "score": 2,
    488       "justification": "Practitioners and product teams need to understand capability thresholds for planning deployments, but the paper cannot predict specific emergence points for new abilities."
    489     },
    490     "surprise_contrarian": {
    491       "score": 3,
    492       "justification": "Directly challenges the prevailing smooth-scaling narrative — a central claim of the scaling-laws community that performance can be predicted by extrapolation — by documenting qualitative capability phase transitions that extrapolation from smaller models does not predict."
    493     },
    494     "fear_safety": {
    495       "score": 2,
    496       "justification": "Section 5.4 explicitly frames unpredictable emergent risks (bias, toxicity, backdoor vulnerabilities, deception) as a safety concern accompanying emergent capabilities."
    497     },
    498     "drama_conflict": {
    499       "score": 2,
    500       "justification": "Sparked substantial follow-on debate including Schaeffer et al. 2023 arguing emergence is a metric artifact, making this a contested foundational claim in the scaling literature."
    501     },
    502     "demo_ability": {
    503       "score": 1,
    504       "justification": "Emergent abilities require 10B–540B parameter models not accessible to most practitioners, limiting direct demonstration or reproduction."
    505     },
    506     "brand_recognition": {
    507       "score": 3,
    508       "justification": "Published by Google Research, DeepMind, and Stanford researchers — three of the most prominent AI institutions — in TMLR; became one of the most cited papers on LLM scaling."
    509     }
    510   },
    511   "hn_data": {
    512     "threads": [
    513       {
    514         "hn_id": "40689833",
    515         "title": "Survey of Rickrolling in Academic Literature [pdf]",
    516         "points": 69,
    517         "comments": 14,
    518         "url": "https://news.ycombinator.com/item?id=40689833",
    519         "created_at": "2024-06-15T13:54:57Z"
    520       },
    521       {
    522         "hn_id": "37543595",
    523         "title": "Ask HN: Transformer alternatives that could have emergent properties when scaled",
    524         "points": 6,
    525         "comments": 3,
    526         "url": "https://news.ycombinator.com/item?id=37543595",
    527         "created_at": "2023-09-17T10:45:52Z"
    528       },
    529       {
    530         "hn_id": "36349856",
    531         "title": "SqueezeLLM: Dense-and-Sparse Quantization",
    532         "points": 5,
    533         "comments": 1,
    534         "url": "https://news.ycombinator.com/item?id=36349856",
    535         "created_at": "2023-06-16T01:43:39Z"
    536       },
    537       {
    538         "hn_id": "35621735",
    539         "title": "Emergent Abilities of Large Language Models",
    540         "points": 4,
    541         "comments": 1,
    542         "url": "https://news.ycombinator.com/item?id=35621735",
    543         "created_at": "2023-04-18T23:06:51Z"
    544       },
    545       {
    546         "hn_id": "36342137",
    547         "title": "SqueezeLLM: Lossless 3-bit quantization with improved performance",
    548         "points": 4,
    549         "comments": 0,
    550         "url": "https://news.ycombinator.com/item?id=36342137",
    551         "created_at": "2023-06-15T15:43:48Z"
    552       },
    553       {
    554         "hn_id": "35410181",
    555         "title": "Emergent Abilities of Large Language Models",
    556         "points": 3,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=35410181",
    559         "created_at": "2023-04-02T13:16:17Z"
    560       },
    561       {
    562         "hn_id": "34785902",
    563         "title": "Emergent Abilities of Large Language Models",
    564         "points": 2,
    565         "comments": 1,
    566         "url": "https://news.ycombinator.com/item?id=34785902",
    567         "created_at": "2023-02-14T05:48:21Z"
    568       },
    569       {
    570         "hn_id": "40419434",
    571         "title": "Emergent Abilities of Large Language Models",
    572         "points": 2,
    573         "comments": 0,
    574         "url": "https://news.ycombinator.com/item?id=40419434",
    575         "created_at": "2024-05-20T19:46:53Z"
    576       },
    577       {
    578         "hn_id": "47174820",
    579         "title": "Emergent Abilities of Large Language Models (2022)",
    580         "points": 1,
    581         "comments": 0,
    582         "url": "https://news.ycombinator.com/item?id=47174820",
    583         "created_at": "2026-02-27T00:58:33Z"
    584       },
    585       {
    586         "hn_id": "41730269",
    587         "title": "Emergent Abilities of Large Language Models (2022)",
    588         "points": 1,
    589         "comments": 0,
    590         "url": "https://news.ycombinator.com/item?id=41730269",
    591         "created_at": "2024-10-03T12:47:11Z"
    592       }
    593     ],
    594     "top_points": 69,
    595     "total_points": 97,
    596     "total_comments": 20
    597   }
    598 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs