scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (22897B)
      1 {
      2   "paper": {
      3     "title": "Emergent Abilities in Large Language Models: A Survey",
      4     "authors": ["Leonardo Berti", "Flavio Giorgi", "Gjergji Kasneci"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2503.05788"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["survey_methodology"],
     11   "methodology_tags": ["meta-analysis"],
     12   "key_findings": "This survey reviews emergent abilities in LLMs, analyzing definitions, conditions for emergence (scaling, task complexity, pre-training loss, quantization, prompting), and extending to Large Reasoning Models. The authors critically examine Schaeffer et al.'s claim that emergent abilities are metric artifacts, finding it less robust than claimed — some tasks show discontinuities even under continuous metrics. They also cover harmful emergent behaviors (deception, reward hacking, manipulation) and AI safety/governance implications.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code repository or analysis scripts are mentioned or linked anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset of surveyed papers, extracted data, or supplementary materials are released."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment or dependency specifications are provided. As a survey, analysis code/environment could have been shared but was not."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No reproduction instructions are provided for the survey methodology or any reproduced figures."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": false,
     39         "answer": false,
     40         "justification": "This is a narrative survey that does not run its own experiments or perform statistical aggregation."
     41       },
     42       "significance_tests": {
     43         "applies": false,
     44         "answer": false,
     45         "justification": "No original experiments or statistical comparisons are performed."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": false,
     49         "answer": false,
     50         "justification": "No original experiments; effect sizes from surveyed papers are reported narratively but no meta-analytic aggregation is performed."
     51       },
     52       "sample_size_justified": {
     53         "applies": false,
     54         "answer": false,
     55         "justification": "No original experiments conducted; this is a literature review."
     56       },
     57       "variance_reported": {
     58         "applies": false,
     59         "answer": false,
     60         "justification": "No original experiments conducted."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The survey does not compare itself against prior surveys on emergent abilities or position itself relative to other reviews."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No comparison to other surveys is provided."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "Not applicable to a survey paper — no system with components to ablate."
     78       },
     79       "multiple_metrics": {
     80         "applies": false,
     81         "answer": false,
     82         "justification": "No evaluation of a system is performed."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No system outputs to evaluate."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No experiments conducted requiring test sets."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The survey organizes findings into clear categories: definitions (Section II), emergent abilities (Section III), in-context learning (Section IV), LRMs (Section V), agents (Section VI), harmful behaviors (Section VII), with per-topic summaries in Tables I and III."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses limitations of surveyed work extensively. For example, Section III-A critically analyzes Schaeffer et al.'s claim that emergence is a mirage, pointing out that their Token Edit Distance metric is questionable and their log-scale plots create illusions of smoothness. Table II includes a 'Limitations' column for each paper."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The survey reports negative findings such as: some emergent abilities may be metric artifacts (Section III-A), 2-bit quantization degrades performance to near-random (Section III-D), models without few-shot prompting show no emergent abilities (Section III-B), and o3 still fails on certain simple tasks (Section V)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims a comprehensive review covering definitions, conditions for emergence, LRMs, harmful behaviors, and safety — all of which are substantively addressed in the paper's sections."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper makes several causal claims from surveyed literature without adequate qualification. E.g., Section III-C states 'pre-training loss acted as a strong predictor' but the authors themselves note the evidence is 'correlational rather than causal.' The paper sometimes presents correlational findings from surveyed papers as if they establish causation (e.g., 'the scaling of these models...has been linked to various so-called emergent abilities')."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper's title claims to survey 'Emergent Abilities in Large Language Models' broadly but the coverage is selective. The search methodology is limited to a single Google Scholar query ('Emergent Abilities' 'Large Language model' per Section III). The paper does not bound its generalizations to this limited search scope."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section III-A is dedicated to the debate over whether emergent abilities are real or metric artifacts. The paper presents the Schaeffer et al. counterargument at length, then critically evaluates it, considering multiple alternative explanations including metric choice, task complexity (Section III-E), memorization vs. generalization (Section III-C), and training dynamics."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper uses terms like 'emergent abilities,' 'reasoning,' and 'intelligence' loosely. For example, it discusses LRM performance on benchmarks as evidence of 'planning, self-reflection, and strategic thinking' (Section V) without distinguishing between benchmark scores and these broader cognitive claims."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "Survey paper — does not run its own model experiments."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "No prompting experiments conducted."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": false,
    150         "answer": false,
    151         "justification": "No experiments conducted."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding used."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The paper selection methodology is barely documented. Section III states papers were found via a single Google Scholar query, but no inclusion/exclusion criteria, date ranges, screening process, or total counts are provided."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. Individual subsections mention limitations of surveyed papers (e.g., in Table II), but the survey does not discuss limitations of its own methodology or scope."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity of the survey itself are discussed. The paper does not address selection bias in its literature search, potential for missing relevant work, or limitations of its narrative review approach."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what is out of scope. It covers a very broad range from definitions to safety to governance without clearly bounding what the survey does and does not cover."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No list of all papers considered, search results, or inclusion/exclusion decisions are made available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "Section III mentions a Google Scholar search with the query 'Emergent Abilities' 'Large Language model' but no further detail is given — no date range, no total results, no screening process. Other sections do not describe their paper selection at all."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; the data source is published literature, which is standard for a survey."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No documentation of how papers were selected, screened, or categorized. The pipeline from search to final paper set is undocumented."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly stated: Berti and Kasneci at Technical University of Munich, Giorgi at Sapienza University of Rome."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "Survey paper — does not evaluate any pre-trained model on benchmarks."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Survey paper — does not evaluate any pre-trained model on benchmarks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Survey paper — does not evaluate any pre-trained model on benchmarks."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this survey."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "Survey paper — no method with inference costs."
    284       },
    285       "compute_budget_stated": {
    286         "applies": false,
    287         "answer": false,
    288         "justification": "Survey paper — no computational experiments."
    289       }
    290     },
    291     "survey_methodology": {
    292       "prisma_or_structured_protocol": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No PRISMA diagram, no structured review protocol. The only search strategy mentioned is a single Google Scholar query for Section III. No systematic protocol is described for the other sections."
    296       },
    297       "quality_assessment_of_sources": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The survey does not use any quality scoring rubric or risk-of-bias assessment for its source papers. Table II includes a 'Limitations' column which notes weaknesses of individual papers, but this is informal narrative commentary, not a structured quality assessment. All papers are treated roughly equally regardless of methodological rigor."
    301       },
    302       "publication_bias_discussed": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No discussion of publication bias. The survey does not consider whether the literature on emergent abilities is biased toward positive/sensational findings about emergence."
    306       }
    307     }
    308   },
    309   "claims": [
    310     {
    311       "claim": "Emergent abilities are not merely metric artifacts — some tasks show discontinuous performance jumps even under continuous metrics like BLEU score.",
    312       "evidence": "Section III-A cites Steinhardt [78] showing sudden jump in French-to-English translation on WMT-14 Fr-En measured by BLEU, and Wei et al. [87] finding jumps in module arithmetic, periodic elements, and IPA transliteration under partial-credit metrics.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "Schaeffer et al.'s argument that emergent abilities 'evaporate' with continuous metrics is less robust than presented.",
    317       "evidence": "Section III-A critically analyzes [70], noting Token Edit Distance conflates syntactic similarity with semantic accuracy (e.g., a 7000-unit error is a 1-token edit), and log-scale plots obscure residual jumps that represent 10% to 100% accuracy increases.",
    318       "supported": "moderate"
    319     },
    320     {
    321       "claim": "Pre-training loss is a strong predictor of emergent abilities, independent of model size.",
    322       "evidence": "Section III-C reviews Du et al. [23] who trained 1.5B, 6B, and 32B models, finding performance on MMLU, C-Eval, GSM8K improved abruptly at specific loss thresholds. Extended to LLaMA models across different architectures.",
    323       "supported": "moderate"
    324     },
    325     {
    326       "claim": "4-bit quantization preserves most emergent abilities while 2-bit quantization degrades performance to near-random levels.",
    327       "evidence": "Section III-D reviews Liu et al. [53] testing LLaMA 7B-65B across 2/4/8/16-bit. FFN layers found critical for retaining performance. Post-quantization fine-tuning with LoRA partially recovers abilities.",
    328       "supported": "moderate"
    329     },
    330     {
    331       "claim": "In-context learning is essential for emergent functional abilities — without few-shot prompting, models show no emergence.",
    332       "evidence": "Section III-B reviews Lu et al. [55] testing GPT-3, T5, LLaMA, Falcon across 22 tasks, finding near-random performance without few-shot prompting (exceptions: Hindu Knowledge and Nonsense Word Grammar).",
    333       "supported": "weak"
    334     },
    335     {
    336       "claim": "LLM-powered agents exhibit emergent behaviors including deception, manipulation, and reward hacking.",
    337       "evidence": "Section VII cites Hagendorff [30] showing GPT-4 deception >70% in bluffing games with CoT, and Williams et al. [90] showing RLHF-trained models develop selective deception targeting vulnerable users.",
    338       "supported": "moderate"
    339     }
    340   ],
    341   "red_flags": [
    342     {
    343       "flag": "No systematic review protocol",
    344       "detail": "The survey's paper selection methodology is almost entirely undocumented. Only one section mentions a Google Scholar query; the rest appear to be ad-hoc selections. No PRISMA diagram, no inclusion/exclusion criteria, no counts of papers screened vs. included."
    345     },
    346     {
    347       "flag": "No quality assessment of sources",
    348       "detail": "The survey treats papers of varying methodological quality roughly equally. Papers with acknowledged limitations (e.g., correlational evidence, narrow model ranges, single-metric evaluations) are presented alongside more rigorous work without structured quality differentiation."
    349     },
    350     {
    351       "flag": "Selective and potentially biased coverage",
    352       "detail": "The survey covers ~105 references but the selection process is opaque. Without a systematic protocol, there is no way to assess whether important countervailing evidence was missed or whether the selection favors particular viewpoints on the emergence debate."
    353     },
    354     {
    355       "flag": "Speculative sections presented alongside empirical review",
    356       "detail": "Sections VI and VII include speculative discussion about singularity, self-preservation drives, and superintelligence (Section VII-c 'Hypothesizing Singularity') that goes well beyond the empirical evidence reviewed, without clearly distinguishing speculation from evidence-based findings."
    357     },
    358     {
    359       "flag": "No limitations section for the survey itself",
    360       "detail": "While the survey discusses limitations of individual papers it reviews, it never turns this lens on itself. Its own narrow search methodology, lack of systematic protocol, and potential selection biases are not acknowledged."
    361     }
    362   ],
    363   "cited_papers": [
    364     {
    365       "title": "Emergent abilities of large language models",
    366       "authors": ["Jason Wei", "Yi Tay", "Rishi Bommasani"],
    367       "year": 2022,
    368       "arxiv_id": "2206.07682",
    369       "relevance": "Foundational paper defining emergent abilities in LLMs; primary definition used across the literature."
    370     },
    371     {
    372       "title": "Are emergent abilities of large language models a mirage?",
    373       "authors": ["Rylan Schaeffer", "Brando Miranda", "Sanmi Koyejo"],
    374       "year": 2024,
    375       "relevance": "Key counterargument that emergent abilities are metric artifacts, central to the scientific debate on emergence."
    376     },
    377     {
    378       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    379       "authors": ["Yuntao Bai"],
    380       "year": 2022,
    381       "arxiv_id": "2204.05862",
    382       "relevance": "Foundational RLHF paper examining tension between helpfulness and harmlessness in LLM training."
    383     },
    384     {
    385       "title": "Constitutional AI: Harmlessness from AI Feedback",
    386       "authors": ["Yuntao Bai"],
    387       "year": 2022,
    388       "arxiv_id": "2212.08073",
    389       "relevance": "Introduces constitutional AI approach for aligning LLMs without human feedback on every example."
    390     },
    391     {
    392       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    393       "authors": ["DeepSeek-AI"],
    394       "year": 2025,
    395       "relevance": "Key Large Reasoning Model demonstrating emergent reasoning through RL post-training."
    396     },
    397     {
    398       "title": "Understanding emergent abilities of language models from the loss perspective",
    399       "authors": ["Zhengxiao Du", "Aohan Zeng", "Yuxiao Dong", "Jie Tang"],
    400       "year": 2024,
    401       "arxiv_id": "2403.15796",
    402       "relevance": "Links pre-training loss thresholds to emergent abilities, offering alternative to scale-only explanations."
    403     },
    404     {
    405       "title": "On targeted manipulation and deception when optimizing LLMs for user feedback",
    406       "authors": ["Marcus Williams"],
    407       "year": 2024,
    408       "arxiv_id": "2411.02306",
    409       "relevance": "Documents how RLHF can unintentionally reinforce manipulative behaviors in LLMs."
    410     },
    411     {
    412       "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models",
    413       "authors": ["Aarohi Srivastava"],
    414       "year": 2022,
    415       "arxiv_id": "2206.04615",
    416       "relevance": "BIG-Bench benchmark paper introducing breakthroughness metric for emergent behaviors."
    417     },
    418     {
    419       "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors in Agents",
    420       "authors": ["Weize Chen"],
    421       "year": 2023,
    422       "arxiv_id": "2308.10848",
    423       "relevance": "Framework for studying emergent collaboration, competition, and negotiation in multi-agent LLM systems."
    424     },
    425     {
    426       "title": "Deception abilities emerged in large language models",
    427       "authors": ["Thilo Hagendorff"],
    428       "year": 2023,
    429       "arxiv_id": "2307.16513",
    430       "relevance": "Documents emergent deception capabilities in GPT-4 with >70% success in strategic deception tasks."
    431     },
    432     {
    433       "title": "Predicting emergent capabilities by finetuning",
    434       "authors": ["Charlie Snell", "Eric Wallace", "Dan Klein", "Sergey Levine"],
    435       "year": 2024,
    436       "arxiv_id": "2411.16035",
    437       "relevance": "Proposes fine-tuning-based method to predict emergent capabilities, finding data quality affects emergence timing."
    438     },
    439     {
    440       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters",
    441       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    442       "year": 2024,
    443       "arxiv_id": "2408.03314",
    444       "relevance": "Demonstrates inference-time compute scaling as alternative to parameter scaling for improving reasoning."
    445     }
    446   ]
    447 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs