scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24687B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Are Emergent Abilities of Large Language Models a Mirage?",
      6     "authors": [
      7       "Schaeffer, R.",
      8       "Miranda, B.",
      9       "Koyejo, S."
     10     ],
     11     "year": 2023,
     12     "venue": "NeurIPS 2023",
     13     "arxiv_id": "2304.15004",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims (metric choice causes emergent abilities, nonlinear metrics create illusion, linear metrics show smooth improvement) are supported by evidence. Section 3 tests predictions on GPT-3, Section 4 meta-analyzes BIG-Bench, Section 5 demonstrates on vision tasks.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claim 'researcher's metric choice causes emergent abilities' is justified by testing on fixed GPT-3 outputs with different metrics (Figures 3-4), meta-analysis isolating metric as the variable, and inducing emergence in vision tasks by metric choice alone.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33       "justification": "Claims are bounded: they focus on previously claimed emergent abilities in LLMs/BIG-Bench, test on vision tasks to show the metric effect generalizes, and explicitly state that the paper does not claim that LLMs cannot display emergent abilities (Discussion, p.8).",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Paper mentions Caballero et al.'s piecewise power-law and Michaud et al.'s data assumptions as alternatives (Section 6) but does not engage with them substantively. No discussion of why those explanations are insufficient.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Paper distinguishes measured quantities (per-token cross-entropy, accuracy, edit distance) from claimed abilities (emergent properties). Explicitly separates what metrics measure from what emergence means.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No dedicated limitations or threats-to-validity section. Discussion (Section 7) lacks a limitations subsection. Constraints are scattered (e.g., footnote 1 on independence assumption, p.4).",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Threats mentioned in scattered form (token independence assumption in footnote 1, limited model access, only analyzing published results) but not systematically organized. No dedicated section listing specific threats.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Scope (three experimental settings: GPT-3, BIG-Bench, vision) is implicit but not explicitly bounded. No clear statement of what the paper does NOT claim or test.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding statement or acknowledgments section in paper indicating funding source.",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Authors listed with 'Computer Science, Stanford University' affiliation on title page.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "No funding disclosed, so cannot evaluate independence.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement or declaration of financial interests (patents, consulting, equity).",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Emergent abilities defined precisely as 'abilities not present in smaller models but present in larger ones; cannot be predicted by extrapolating' (citing Wei et al. 2022). Metrics (nonlinear, discontinuous, linear, continuous) used consistently with clear meaning.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Contribution explicitly stated: alternative explanation that emergent abilities are artifacts of metric choice, not fundamental model properties. Abstract, introduction, and discussion all restate this.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 6 engages with Wei et al. (emergent abilities definition), Srivastava et al. (BIG-Bench), Caballero et al. (power laws), and Michaud et al. (data assumptions). Shows how this work converts discussion into testable predictions.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No code release mentioned. GPT-3 experiments require API access (not shareable), BIG-Bench analysis uses public data, vision experiments on standard datasets but no code provided.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": false,
    128           "justification": "Uses public benchmarks (BIG-Bench, CIFAR-100, Omniglot, MNIST) but custom arithmetic test data for GPT-3 is not released or made available.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No environment specifications (requirements.txt, Python version, dependencies) provided anywhere in paper.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step instructions to reproduce results. GPT-3 experiments require paid API access; vision experiments lack code.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Figures 3-4 and 6-10 show single lines with no error bars, confidence intervals, or uncertainty bands. All results presented as point estimates.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests (t-tests, p-values) reported. Predictions confirmed visually and qualitatively, not quantitatively.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": false,
    160           "justification": "Emergence score (Eq. 1) calculated but not reported as effect size with baseline context. Accuracy percentages shown but not contextualized as effect sizes.",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "Number of test examples for GPT-3 arithmetic tasks not specified. BIG-Bench has 220+ tasks but sample sizes per task not justified. No power analysis.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No standard deviations, variance, or multiple runs reported. All results are single point values.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Multiple metrics compared as baselines (accuracy vs edit distance, multiple choice grade vs Brier score). Published emergent ability claims serve as baseline for comparison.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Analyzes recent emergent ability claims (Wei et al. 2022, Srivastava et al. 2022, Ganguli et al. 2022). Uses current GPT-3 family as available in 2023.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Metric ablations present: changing from nonlinear to linear (accuracy→edit distance), discontinuous to continuous (multiple choice→Brier score). Shows which metric components drive emergence.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "Core contribution compares accuracy, token edit distance, multiple choice grade, Brier score, ROUGE-L-Sum. Section 4 analyzes all 39 BIG-Bench metrics.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "Computational analysis only, no human evaluation needed.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "GPT-3 experiments use held-out test data (Figures 3-4). BIG-Bench has train/test splits. Vision experiments use standard dataset splits.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "GPT-3: breakdown by sequence length and temperature. BIG-Bench: breakdown by metric type and task. Vision: breakdown by model architecture and dataset.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Shows which metrics fail to produce emergence (Figure 5A: 34/39 metrics have zero emergent abilities). Demonstrates cases where emergent abilities disappear with metric change.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Figure 5A shows most BIG-Bench metrics (34/39) display zero emergent abilities, and linear metrics consistently fail to show emergence patterns.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "GPT-3 version date given (2023-03-15) with parameter counts, but exact model names/IDs not specified. LaMDA and vision models lack version details.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "Describes task setup (2-shot arithmetic) but does not provide actual prompt text or system instructions used with GPT-3.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Temperature shown varying (0.0, 1.0 in figures) but no other hyperparameters (top-p, max_tokens, etc.) reported.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": false,
    254           "justification": "Mentions 2-shot prompting for arithmetic but does not describe exact scaffolding structure or the two demonstration examples used.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "No documentation of how arithmetic tasks were generated, how BIG-Bench data was processed, or how vision datasets were preprocessed.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "Raw data (GPT-3 outputs, BIG-Bench results, vision task data) not made available for independent verification.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": false,
    274           "justification": "For GPT-3: generated arithmetic tasks but collection procedure not detailed. For BIG-Bench: analyzed published results but collection not described.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human subjects involved.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "Full pipeline from raw outputs to metrics to emergence scores not documented. BIG-Bench pipeline unclear.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Paper does not state GPT-3 training cutoff date, nor does it discuss whether arithmetic tasks (common on internet) may have been in training data.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether BIG-Bench tasks or arithmetic problems overlapped with GPT-3 training data.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No analysis of whether models were pre-trained on benchmark examples. BIG-Bench creation date vs model training dates not compared.",
    307           "source": "haiku"
    308         }
    309       },
    310       "cost_and_practicality": {
    311         "inference_cost_reported": {
    312           "applies": true,
    313           "answer": false,
    314           "justification": "Paper does not report GPT-3 API costs, latency, or computational requirements for running experiments.",
    315           "source": "haiku"
    316         },
    317         "compute_budget_stated": {
    318           "applies": true,
    319           "answer": false,
    320           "justification": "Total computational budget (GPU hours, API costs, runtime) not stated anywhere.",
    321           "source": "haiku"
    322         }
    323       }
    324     }
    325   },
    326   "claims": [
    327     {
    328       "claim": "Over 92% of claimed emergent abilities on BIG-Bench appear under just two metrics: Multiple Choice Grade and Exact String Match",
    329       "evidence": "Meta-analysis of hand-annotated emergent abilities in [32], Figure 5C showing 92% concentration in these two discontinuous/nonlinear metrics",
    330       "supported": "strong"
    331     },
    332     {
    333       "claim": "Changing from nonlinear accuracy to linear token edit distance removes apparent emergent ability in GPT-3 arithmetic without changing model outputs",
    334       "evidence": "Figure 3 (top vs bottom) shows same GPT-3 outputs scored differently produce sharp vs smooth curves. Predictions 1 and 3 confirmed.",
    335       "supported": "strong"
    336     },
    337     {
    338       "claim": "Increasing test dataset resolution reveals smooth above-chance performance even on accuracy metric for GPT-3 arithmetic",
    339       "evidence": "Figure 4 with higher resolution test data shows all models achieve above-chance accuracy, confirming Prediction 2",
    340       "supported": "strong"
    341     },
    342     {
    343       "claim": "Emergent abilities can be artificially induced in vision models (autoencoders, CNNs, Transformers) by choosing appropriately nonlinear/discontinuous metrics",
    344       "evidence": "Figures 7-10: CIFAR-100 reconstruction, Omniglot classification, MNIST classification all show metric-induced emergence with smooth underlying performance",
    345       "supported": "strong"
    346     },
    347     {
    348       "claim": "The phenomenon is metric-dependent, not task or model-dependent: most BIG-Bench metrics (34/39) show zero emergent abilities",
    349       "evidence": "Figure 5A analysis showing emergence score distribution heavily skewed toward zero across BIG-Bench metrics",
    350       "supported": "strong"
    351     }
    352   ],
    353   "methodology_tags": [
    354     "meta-analysis",
    355     "benchmark-eval",
    356     "theoretical"
    357   ],
    358   "key_findings": "The paper demonstrates that claimed emergent abilities of large language models are primarily artifacts of researchers' choice of nonlinear or discontinuous evaluation metrics rather than fundamental changes in model behavior with scale. When identical model outputs are evaluated using linear or continuous metrics (e.g., token edit distance vs. accuracy), performance improvements appear smooth and predictable. This phenomenon is not unique to language models—the authors artificially induce apparently emergent abilities in vision tasks across diverse architectures. The analysis of BIG-Bench shows that 92% of claimed emergent abilities concentrate in just two metrics (Multiple Choice Grade and Exact String Match), and changing metrics removes the emergence phenomenon entirely.",
    359   "red_flags": [
    360     {
    361       "flag": "No uncertainty quantification",
    362       "detail": "All figures show single point estimates with no error bars, confidence intervals, or statistical error measures"
    363     },
    364     {
    365       "flag": "No significance testing",
    366       "detail": "Results presented visually; no formal statistical tests comparing predictions to null hypotheses or quantifying surprise"
    367     },
    368     {
    369       "flag": "Sample sizes not justified",
    370       "detail": "Number of test examples for arithmetic tasks not specified; no power analysis or sample size justification provided"
    371     },
    372     {
    373       "flag": "No reproducible artifacts",
    374       "detail": "GPT-3 experiments require paid API access; vision experiments lack released code; no raw data made public for verification"
    375     },
    376     {
    377       "flag": "Training data contamination not addressed",
    378       "detail": "Paper does not discuss whether GPT-3 was pre-trained on arithmetic problems or BIG-Bench tasks; potential data leakage unexamined"
    379     },
    380     {
    381       "flag": "Key assumption not justified",
    382       "detail": "Token independence assumption (footnote 1, p.4) acknowledged as empirically false but used in mathematical model without justification"
    383     },
    384     {
    385       "flag": "Limited model coverage",
    386       "detail": "Only GPT-3 directly tested (publicly queryable); other model families analyzed only through published aggregate results from [32]"
    387     }
    388   ],
    389   "cited_papers": [
    390     {
    391       "title": "Emergent abilities of large language models",
    392       "authors": "Wei et al.",
    393       "year": 2022,
    394       "relevance": "Defines emergent abilities as sharp, unpredictable transitions; primary target of this paper's critique"
    395     },
    396     {
    397       "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models",
    398       "authors": "Srivastava et al.",
    399       "year": 2022,
    400       "relevance": "BIG-Bench benchmark with emergence claims; primary source for meta-analysis"
    401     },
    402     {
    403       "title": "Predictability and surprise in large generative models",
    404       "authors": "Ganguli et al.",
    405       "year": 2022,
    406       "relevance": "Key paper claiming abrupt, hard-to-predict emergent abilities in large generative models"
    407     },
    408     {
    409       "title": "Language models are few-shot learners",
    410       "authors": "Brown et al.",
    411       "year": 2020,
    412       "relevance": "GPT-3 paper; first major claim of emergent arithmetic abilities"
    413     },
    414     {
    415       "title": "Broken neural scaling laws",
    416       "authors": "Caballero et al.",
    417       "year": 2022,
    418       "relevance": "Alternative explanation via piecewise power-law; briefly discussed as competing theory"
    419     },
    420     {
    421       "title": "137 emergent abilities of large language models",
    422       "authors": "Wei, J.",
    423       "year": 2022,
    424       "relevance": "Hand-annotated list used for meta-analysis in Section 4"
    425     }
    426   ],
    427   "engagement_factors": {
    428     "practical_relevance": {
    429       "score": 2,
    430       "justification": "Informs benchmark design practices but does not directly enable practitioners to build new capabilities or systems"
    431     },
    432     "surprise_contrarian": {
    433       "score": 3,
    434       "justification": "Directly challenges the major claim in Wei et al. (2000+ citations) that LLMs possess unpredictable emergent abilities; high-profile contrarian argument"
    435     },
    436     "fear_safety": {
    437       "score": 0,
    438       "justification": "Reduces AI safety concerns by undermining claims about unpredictable capability emergence; no new risks or safety implications raised"
    439     },
    440     "drama_conflict": {
    441       "score": 3,
    442       "justification": "Direct contradiction with Wei et al., Srivastava et al., and Ganguli et al. claims; central debate in LLM capabilities discourse"
    443     },
    444     "demo_ability": {
    445       "score": 1,
    446       "justification": "Requires GPT-3 API access to reproduce arithmetic experiments; vision experiments reproducible only with released code (not provided)"
    447     },
    448     "brand_recognition": {
    449       "score": 3,
    450       "justification": "Stanford CS affiliation, NeurIPS 2023 Outstanding Paper, challenges claims from OpenAI/DeepMind/Google"
    451     }
    452   },
    453   "hn_data": {
    454     "threads": [
    455       {
    456         "hn_id": "35768824",
    457         "title": "Are emergent abilities of large language models a mirage?",
    458         "points": 154,
    459         "comments": 130,
    460         "url": "https://news.ycombinator.com/item?id=35768824",
    461         "created_at": "2023-05-01T03:32:48Z"
    462       },
    463       {
    464         "hn_id": "37380462",
    465         "title": "Large language models converge toward human-like concept organization",
    466         "points": 3,
    467         "comments": 0,
    468         "url": "https://news.ycombinator.com/item?id=37380462",
    469         "created_at": "2023-09-04T13:49:33Z"
    470       },
    471       {
    472         "hn_id": "36931866",
    473         "title": "Universal and Transferable Adversarial Attacks on LLM",
    474         "points": 3,
    475         "comments": 0,
    476         "url": "https://news.ycombinator.com/item?id=36931866",
    477         "created_at": "2023-07-30T15:04:08Z"
    478       },
    479       {
    480         "hn_id": "37938665",
    481         "title": "The Surveillance AI Pipeline",
    482         "points": 2,
    483         "comments": 1,
    484         "url": "https://news.ycombinator.com/item?id=37938665",
    485         "created_at": "2023-10-19T05:00:48Z"
    486       },
    487       {
    488         "hn_id": "38280492",
    489         "title": "Ghostbuster: Detecting Text Ghostwritten by Large Language Models",
    490         "points": 2,
    491         "comments": 0,
    492         "url": "https://news.ycombinator.com/item?id=38280492",
    493         "created_at": "2023-11-15T18:36:51Z"
    494       },
    495       {
    496         "hn_id": "37675002",
    497         "title": "Reproducing Failures in Fault Signatures",
    498         "points": 2,
    499         "comments": 0,
    500         "url": "https://news.ycombinator.com/item?id=37675002",
    501         "created_at": "2023-09-27T14:17:13Z"
    502       },
    503       {
    504         "hn_id": "47174839",
    505         "title": "Are Emergent Abilities of Large Language Models a Mirage? (2023)",
    506         "points": 1,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=47174839",
    509         "created_at": "2026-02-27T01:00:02Z"
    510       },
    511       {
    512         "hn_id": "35659049",
    513         "title": "Finding Bug-Inducing Program Environments",
    514         "points": 1,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=35659049",
    517         "created_at": "2023-04-21T19:48:54Z"
    518       },
    519       {
    520         "hn_id": "36955679",
    521         "title": "A LLM Assisted Exploitation of AI-Guardian",
    522         "points": 1,
    523         "comments": 1,
    524         "url": "https://news.ycombinator.com/item?id=36955679",
    525         "created_at": "2023-08-01T13:28:45Z"
    526       },
    527       {
    528         "hn_id": "36903968",
    529         "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    530         "points": 1,
    531         "comments": 0,
    532         "url": "https://news.ycombinator.com/item?id=36903968",
    533         "created_at": "2023-07-28T07:30:39Z"
    534       }
    535     ],
    536     "top_points": 154,
    537     "total_points": 170,
    538     "total_comments": 132
    539   }
    540 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs