scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (37395B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Are Emergent Abilities of Large Language Models a Mirage?",
      6     "authors": [
      7       "Schaeffer, R.",
      8       "Miranda, B.",
      9       "Koyejo, S."
     10     ],
     11     "year": 2023,
     12     "venue": "NeurIPS 2023",
     13     "arxiv_id": "2304.15004",
     14     "doi": null
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The abstract claims that emergent abilities appear due to metric choice, and the paper provides three lines of evidence: GPT-3 arithmetic (Sec. 3), BIG-Bench meta-analysis (Sec. 4), and induced emergence in vision (Sec. 5), all supporting this claim.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper claims metric choice 'creates' emergent abilities. This causal claim is justified by a controlled manipulation: holding model outputs fixed and varying only the metric, which is a valid single-variable manipulation. The mathematical model (Sec. 2) provides the theoretical grounding.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The Discussion (Sec. 7) explicitly states: 'nothing in this paper should be interpreted as claiming that large language models cannot display emergent abilities; rather, our message is that previously claimed emergent abilities in [3, 8, 28, 33] might likely be a mirage.' The claims are bounded to the specific prior work analyzed.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 6 discusses alternative explanations: Caballero et al.'s piece-wise power law model (where emergence is real), Michaud et al.'s strong data assumptions, and Srivastava et al.'s original hypothesis. The paper positions itself against these alternatives.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The entire paper is about the distinction between what metrics measure (per-token error rate) and what researchers claim they show (emergent abilities). The paper explicitly argues that the proxy (discontinuous/nonlinear metric) distorts the underlying phenomenon (smooth improvement).",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "There is no dedicated limitations section. The Discussion (Sec. 7) includes one caveat ('nothing in this paper should be interpreted as claiming that large language models cannot display emergent abilities') but no structured limitations discussion.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No specific threats to validity are discussed. For example, the paper does not address whether its mathematical model's independence assumption (footnote 1: 'the independence assumption is not true') might affect the conclusions, or whether the GPT-3 analysis with only 4 model sizes is sufficient.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "The Discussion states specific scope boundaries: 'nothing in this paper should be interpreted as claiming that large language models cannot display emergent abilities' and that the claims are specifically about 'previously claimed emergent abilities in [3, 8, 28, 33].'",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding sources are mentioned in the paper. There is no acknowledgments section listing grants or sponsors.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are clearly stated: all three authors are from Computer Science, Stanford University.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding is disclosed, so independence cannot be assessed.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement is provided.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Core term 'emergent abilities' defined: 'abilities that are not present in smaller-scale models but are present in large-scale models' with two properties (sharpness, unpredictability). Technical terms like nonlinear/discontinuous metrics explained in context and illustrated mathematically.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Contribution explicitly stated: present alternative explanation that emergent abilities are metric artifacts not fundamental properties, test three predictions, show evidence that alleged emergent abilities 'may not be a fundamental property of scaling AI models.' Intent is unambiguous.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 6 engages with Srivastava et al. (observation converted to predictions), Caballero et al. (alternative piecewise power law explanation), Michaud et al. (quantization model). Paper cites foundational emergence work and shows how this work differentiates from prior explanations.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No repository URL or code archive is mentioned in the paper. The paper is listed as 'Preprint. Under review' with no link to reproduction code.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The paper uses publicly available data: BIG-Bench is public, GPT-3/InstructGPT outputs were collected via a public API, and the hand-annotated emergent abilities list from Wei (2022) is public. CIFAR100, MNIST, and Omniglot are standard public datasets.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No environment specifications, requirements files, or dependency details are provided.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions are provided. The experimental setup is described in prose but no scripts or detailed reproduction steps are given.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "The paper reports point estimates for metrics across model scales. No confidence intervals or error bars are shown on the main figures (Figs. 2-8).",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper makes comparative claims (e.g., emergent abilities disappear with different metrics) but does not use any statistical significance tests. Comparisons are made visually via plots.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "The paper reports concrete effect sizes: '>92% of emergent abilities on BIG-Bench appear under either of these two metrics' (Sec. 1), and provides quantitative breakdowns of which metrics produce emergence across all 39 BIG-Bench metrics (Fig. 5).",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No justification is given for the number of tasks analyzed, the number of test examples generated for GPT-3 experiments, or the model family sizes chosen for vision experiments.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance or standard deviation is reported across experimental runs for any of the three analysis tracks (GPT-3, BIG-Bench meta-analysis, vision experiments).",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The paper directly compares against the original emergent abilities claims (Wei et al. 2022, Ganguli et al. 2022, Srivastava et al. 2022), using the same model families and tasks but with different metrics.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The baselines are the original emergent abilities papers (2022), which are the most recent and relevant prior work on this topic.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "The core methodology is effectively an ablation: holding model outputs fixed while varying the metric (Accuracy vs Token Edit Distance, Multiple Choice Grade vs Brier Score) to isolate the metric's contribution to the emergence phenomenon.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "The paper explicitly uses multiple metrics (Accuracy, Token Edit Distance, Multiple Choice Grade, Brier Score, cross-entropy, Reconstruction_c) and shows how results change across them.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "Human evaluation is not relevant to the claims, which are about mathematical properties of metrics applied to model outputs.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "For GPT-3 experiments, the authors generate new test data for arithmetic tasks. For BIG-Bench, standard test splits are used. For vision experiments, standard test sets (CIFAR100, MNIST, Omniglot) are used.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Fig. 5 provides per-metric breakdowns across all 39 BIG-Bench metrics. Figs. 3-4 break down by target string length (1-5 digits). Fig. 6 shows per-task LaMDA results.",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "The paper does not discuss cases where changing the metric did NOT remove the emergent ability, or tasks where the alternative explanation might not fully apply.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": false,
    228           "justification": "Every experiment confirms the authors' hypothesis. No failed attempts, counterexamples, or cases where the alternative explanation was insufficient are reported.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "The paper states '4 models with 350M, 1.3B, 6.7B, 175B parameters are available via the OpenAI API' (footnote 3) but does not specify exact model version identifiers or API snapshot dates. The LaMDA analysis uses published BIG-Bench outputs.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": false,
    242           "justification": "The paper mentions '2-shot multiplication between two 2-digit integers and 2-shot addition between two 4-digit integers' but does not provide the actual prompt text used to query the GPT-3 models.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": false,
    248           "justification": "Fig. 3 shows results at temperature 0.0 and 1.0 but no other API parameters (top-p, max tokens) are reported. Vision experiment hyperparameters (learning rates, optimizers, training epochs) are not reported in the main text.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used. The paper queries models directly and analyzes outputs.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": false,
    260           "justification": "For GPT-3 experiments, how arithmetic test examples were generated is not described. For the BIG-Bench meta-analysis, the filtering from all tasks to those with claimed emergent abilities is described but the exact criteria for the emergence score threshold used in Fig. 5A are not stated.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "The GPT-3 model outputs collected by the authors are not released. BIG-Bench data is public but the authors' specific analysis scripts and intermediate data are not available.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 3 describes collecting outputs from InstructGPT/GPT-3 via the OpenAI API on arithmetic tasks. Section 4 describes using BIG-Bench published outputs and hand-annotated emergence data from Wei (2022). Section 5 describes training vision models on standard datasets.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Data sources are standard benchmarks and public APIs.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "The pipeline from raw BIG-Bench outputs to the emergence score analysis is not fully documented. How many task-metric-model family triplets were analyzed, how many were excluded, and the exact filtering steps are not stated.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "The paper uses GPT-3/InstructGPT models but does not state their training data cutoff dates. For vision models trained by the authors, training data is standard benchmarks so cutoff is less relevant, but for the LLM evaluation it matters.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of whether GPT-3 may have seen arithmetic problems similar to the test set in its training data. The arithmetic tasks are generated, but the format may overlap with training data.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "BIG-Bench tasks were publicly available before GPT-3's training. No contamination analysis is performed. However, contamination would actually strengthen the paper's argument (if models saw answers, smooth improvement would be even more expected), so this is a lesser concern.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants in this study.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants in this study.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants in this study.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants in this study.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants in this study.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants in this study.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": false,
    358           "justification": "The paper queries GPT-3/InstructGPT via API and trains vision models but does not report API costs or compute costs.",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No computational budget is stated for any of the three experimental tracks.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "No seed sensitivity analysis for vision experiments (autoencoders, LeNet, transformers). GPT-3 API queries may have used temperature but no multi-seed analysis is shown.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": false,
    378           "justification": "The number of experimental runs is not stated for any experiment. It is unclear whether vision models were trained once or multiple times.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "No hyperparameter search budget is reported for the vision experiments. The mathematical model parameters (c, α) are not described as searched.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "The paper does not describe how hyperparameters for vision experiments were selected or whether the shown configurations were cherry-picked from a larger set.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": false,
    395           "answer": false,
    396           "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable. Interestingly, the paper notes the multiple comparisons problem in BIG-Bench (Sec. 7: '~10^6 task-metric-model family triplets') but as a criticism of prior work, not as something requiring correction in their own analysis.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors define their own metrics (Token Edit Distance, Reconstruction_c) and their own mathematical model to test their hypothesis. They do not acknowledge that their choices of alternative metrics and model parameters could be tuned to support their thesis.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": false,
    407           "answer": false,
    408           "justification": "The paper compares metrics at fixed model scales, not compute budgets. Compute differences are not the focus.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": true,
    414           "justification": "The entire paper is a construct validity critique: it argues that benchmarks using discontinuous/nonlinear metrics do not validly measure emergence because the metric creates the appearance of a phase transition. This is the core contribution.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No scaffolding is involved in any of the experiments.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "No discussion of whether BIG-Bench tasks or arithmetic formats appeared in GPT-3's training data before the evaluation.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of whether the 2-shot examples provide leakage or whether the evaluation format matches training distribution.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of independence between training and test data for any of the evaluated models.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No leakage detection or prevention method is applied.",
    447           "source": "opus"
    448         }
    449       },
    450       "survey_methodology": {
    451         "prisma_or_structured_protocol": {
    452           "applies": true,
    453           "answer": false,
    454           "justification": "The meta-analysis of BIG-Bench uses a quantitative emergence score (Eq. 1, from Srivastava et al. 2022) but does not follow PRISMA or any structured review protocol for selecting which emergent abilities claims to analyze.",
    455           "source": "opus"
    456         },
    457         "quality_assessment_of_sources": {
    458           "applies": true,
    459           "answer": false,
    460           "justification": "The paper does not assess the methodological quality of the source papers claiming emergent abilities. It takes the claims at face value and re-analyzes the metrics, but does not evaluate whether the original experiments were well-designed.",
    461           "source": "opus"
    462         },
    463         "publication_bias_discussed": {
    464           "applies": true,
    465           "answer": true,
    466           "justification": "Section 7 discusses a form of publication bias: 'emergent abilities claims are possibly infected by a failure to control for multiple comparisons. In BIG-Bench alone, there are ≥220 tasks, ~40 metrics per task, ~10 model families, for a total of ~10^6 task-metric-model family triplets.'",
    467           "source": "opus"
    468         }
    469       }
    470     }
    471   },
    472   "claims": [
    473     {
    474       "claim": "Emergent abilities in LLMs are caused primarily by the researcher's choice of nonlinear or discontinuous metrics, not by fundamental changes in model behavior with scale.",
    475       "evidence": "Three complementary analyses: (1) GPT-3 shows smooth performance on linear metrics (token edit distance) but sharp on nonlinear (accuracy) for same model outputs; (2) >92% of BIG-Bench emergent abilities occur with two discontinuous/nonlinear metrics (Multiple Choice Grade, Exact String Match); (3) Vision models show induced emergent abilities by intentional metric choice.",
    476       "supported": "strong"
    477     },
    478     {
    479       "claim": "Changing evaluation metrics on fixed model outputs removes apparent emergence.",
    480       "evidence": "Figures 3 and 6 directly show: (1) GPT-3 arithmetic shows emergence on Accuracy but not Token Edit Distance on identical outputs; (2) LaMDA shows emergence on Multiple Choice Grade but not on Brier Score on same tasks. Metric change alone eliminates emergence.",
    481       "supported": "strong"
    482     },
    483     {
    484       "claim": "Increasing test data resolution reveals smooth, continuous improvement even on nonlinear metrics where small models appear to have zero ability.",
    485       "evidence": "Figure 4 shows GPT-3 on Accuracy metric: with more test data, all models achieve above-chance performance and improvement is smooth. Without sufficient data resolution, small models appear to have completely failed on the task.",
    486       "supported": "strong"
    487     },
    488     {
    489       "claim": "Emergent abilities can be artificially induced in vision models by choosing discontinuous metrics despite smooth underlying performance improvements.",
    490       "evidence": "Figures 7-10 demonstrate: shallow autoencoders on CIFAR100 and Transformers on Omniglot show smooth MSE improvements but display sharp emergent abilities when metrics require all samples correct (subset accuracy).",
    491       "supported": "moderate"
    492     },
    493     {
    494       "claim": "Most metrics in BIG-Bench (34/39) do not show any emergent abilities for any task-model family pair.",
    495       "evidence": "Figure 5A shows the emergence score distribution across all 39 BIG-Bench metrics: the vast majority have zero task-model family pairs exhibiting emergence; at most 5 metrics show any emergence.",
    496       "supported": "strong"
    497     },
    498     {
    499       "claim": "The phenomenon of emergence is metric-specific, not task-specific or model-family-specific.",
    500       "evidence": "Analysis of BIG-Bench shows the same tasks display emergence under some metrics but not others (e.g., LaMDA on specific tasks shows emergence with a discontinuous metric but not with a continuous one). Emergence tracks the choice of metric, not the task or model family.",
    501       "supported": "strong"
    502     }
    503   ],
    504   "methodology_tags": [
    505     "benchmark-eval",
    506     "meta-analysis"
    507   ],
    508   "key_findings": "Claimed emergent abilities of LLMs are primarily metric artifacts rather than fundamental model properties. When nonlinear or discontinuous metrics are used (accuracy, exact string match, multiple choice grade), smooth underlying improvements in per-token error appear as sharp, unpredictable transitions across model scales. By switching to linear or continuous metrics (token edit distance, Brier score) or increasing test data resolution, the same models display smooth, continuous, predictable improvement with scale. Analysis of 39 BIG-Bench metrics shows >92% of published emergent abilities use just two discontinuous metrics. The authors induce never-before-seen emergent abilities in vision models using identical metric manipulation, demonstrating the mechanism's generality across domains and architectures.",
    509   "red_flags": [
    510     {
    511       "flag": "No formal statistical significance testing",
    512       "detail": "Claims about emergence and metric effects are descriptive statistics (percentages, emergence scores) without hypothesis tests, p-values, or confidence intervals on key comparisons. Emergence score (Eq. 1) is a detection metric, not a significance test."
    513     },
    514     {
    515       "flag": "Sample sizes not justified",
    516       "detail": "No power analysis or a priori justification for test dataset sizes. For GPT-3 experiments, exact number of test examples not specified. Choices appear pragmatic rather than statistically principled."
    517     },
    518     {
    519       "flag": "Token independence assumption violated",
    520       "detail": "Mathematical model assumes independent per-token errors (footnote 1: 'the independence assumption is not true'), which breaks the foundation of the geometric decay analysis. Approximation works 'qualitatively' but quantitative claims rely on a false assumption."
    521     },
    522     {
    523       "flag": "Multiple comparisons problem not addressed",
    524       "detail": "BIG-Bench contains ~10^6 task-metric-model family triplets. Some emergence expected by chance. No multiple comparison correction (Bonferroni, FDR) applied when claiming >92% of emergent abilities use specific metrics."
    525     },
    526     {
    527       "flag": "No code or data release",
    528       "detail": "Analysis code not provided. GPT-3 API outputs not released. Reproducibility limited, especially for proprietary model experiments. Methods described but not implementable without significant reverse-engineering."
    529     },
    530     {
    531       "flag": "Prompts not fully specified",
    532       "detail": "Exact few-shot examples, system prompts, and formatting not provided. Task descriptions like '2-shot multiplication' leave ambiguity about actual prompt structure, limiting reproducibility."
    533     },
    534     {
    535       "flag": "Alternative mechanisms not deeply explored",
    536       "detail": "Paper argues against piecewise power laws (Caballero et al.) and data-dependent emergence (Michaud et al.) but doesn't comprehensively compare these models to metric artifacts. Dismissal somewhat brief."
    537     },
    538     {
    539       "flag": "Vision experiments somewhat artificial",
    540       "detail": "Induced emergence in vision relies on specifically crafted metrics (Reconstruction_c, subset accuracy requiring all N correct). While demonstrating mechanism, these aren't naturally-occurring metrics and may not represent real research decisions."
    541     },
    542     {
    543       "flag": "Contamination not addressed",
    544       "detail": "No discussion of whether arithmetic tasks or BIG-Bench examples appear in training data of evaluated models. GPT-3 training cutoff (2021) may overlap with benchmark creation."
    545     }
    546   ],
    547   "cited_papers": [
    548     {
    549       "title": "Emergent abilities of large language models",
    550       "authors": "Wei et al.",
    551       "year": 2022,
    552       "arxiv_id": "2206.07682",
    553       "relevance": "Foundational empirical claim paper; directly defines emergent abilities and provides the primary evidence that the present paper critiques."
    554     },
    555     {
    556       "title": "Predictability and surprise in large generative models",
    557       "authors": "Ganguli et al.",
    558       "year": 2022,
    559       "relevance": "Cited work making emergent ability claims about unpredictability. Emphasizes sharp transitions at specific model scales that the present paper argues are metric artifacts."
    560     },
    561     {
    562       "title": "Beyond the imitation game: Quantifying and extrapolating the capabilities of language models",
    563       "authors": "Srivastava et al.",
    564       "year": 2022,
    565       "arxiv_id": "2206.04615",
    566       "relevance": "BIG-Bench benchmark paper; source of the 200+ tasks and 39 metrics used in the meta-analysis. Original authors noted metrics may affect emergence."
    567     },
    568     {
    569       "title": "Broken neural scaling laws",
    570       "authors": "Caballero et al.",
    571       "year": 2022,
    572       "arxiv_id": "2210.14891",
    573       "relevance": "Alternative explanation: piecewise power laws cause real emergence. Present paper argues metric artifacts explain observations without needing a piecewise model."
    574     },
    575     {
    576       "title": "The quantization model of neural scaling",
    577       "authors": "Michaud et al.",
    578       "year": 2023,
    579       "relevance": "Proposes emergence under quantization; argues emergent abilities may be real under strong data distributional assumptions. Represents alternative hypothesis to metric artifacts."
    580     },
    581     {
    582       "title": "137 emergent abilities of large language models",
    583       "authors": "Wei, J.",
    584       "year": 2022,
    585       "relevance": "Catalog of emergent abilities; hand-annotated subset used to analyze which metrics produce emergence. Key data source for meta-analysis."
    586     },
    587     {
    588       "title": "Scaling laws for neural language models",
    589       "authors": "Kaplan et al.",
    590       "year": 2020,
    591       "arxiv_id": "2001.08361",
    592       "relevance": "Foundational scaling laws work; demonstrates smooth power-law scaling of loss. Provides theoretical foundation for smooth underlying improvement claim."
    593     }
    594   ],
    595   "engagement_factors": {
    596     "practical_relevance": {
    597       "score": 2,
    598       "justification": "Researchers choosing benchmarks and metrics can use findings to evaluate LLMs more carefully, but paper primarily critiques existing methods rather than providing new evaluation tools."
    599     },
    600     "surprise_contrarian": {
    601       "score": 3,
    602       "justification": "Directly challenges major claims from Wei et al., Ganguli et al., and OpenAI's GPT papers about emergent abilities. Won NeurIPS 2023 Outstanding Paper for disrupting widely-accepted narrative."
    603     },
    604     "fear_safety": {
    605       "score": 1,
    606       "justification": "Paper dampens AI safety concerns about emergent undesirable capabilities appearing unpredictably at scale. Argues previous emergence evidence was overstated, reducing urgency of some safety concerns."
    607     },
    608     "demo_ability": {
    609       "score": 1,
    610       "justification": "Cannot easily demonstrate metric artifact claims without access to large models and benchmarks. Visuals are post-hoc analysis of published results rather than interactive demonstrations."
    611     },
    612     "drama_conflict": {
    613       "score": 3,
    614       "justification": "High drama: directly disputes widely-celebrated findings about emergent abilities from major labs (OpenAI, Google, Meta). Received significant attention and pushback from the community."
    615     },
    616     "brand_recognition": {
    617       "score": 2,
    618       "justification": "Stanford University authors, published at NeurIPS 2023 (top venue), but not household names compared to OpenAI/DeepMind. Work gained attention through contrarian position."
    619     }
    620   },
    621   "hn_data": {
    622     "threads": [
    623       {
    624         "hn_id": "35768824",
    625         "title": "Are emergent abilities of large language models a mirage?",
    626         "points": 154,
    627         "comments": 130,
    628         "url": "https://news.ycombinator.com/item?id=35768824",
    629         "created_at": "2023-05-01T03:32:48Z"
    630       },
    631       {
    632         "hn_id": "37380462",
    633         "title": "Large language models converge toward human-like concept organization",
    634         "points": 3,
    635         "comments": 0,
    636         "url": "https://news.ycombinator.com/item?id=37380462",
    637         "created_at": "2023-09-04T13:49:33Z"
    638       },
    639       {
    640         "hn_id": "36931866",
    641         "title": "Universal and Transferable Adversarial Attacks on LLM",
    642         "points": 3,
    643         "comments": 0,
    644         "url": "https://news.ycombinator.com/item?id=36931866",
    645         "created_at": "2023-07-30T15:04:08Z"
    646       },
    647       {
    648         "hn_id": "37938665",
    649         "title": "The Surveillance AI Pipeline",
    650         "points": 2,
    651         "comments": 1,
    652         "url": "https://news.ycombinator.com/item?id=37938665",
    653         "created_at": "2023-10-19T05:00:48Z"
    654       },
    655       {
    656         "hn_id": "38280492",
    657         "title": "Ghostbuster: Detecting Text Ghostwritten by Large Language Models",
    658         "points": 2,
    659         "comments": 0,
    660         "url": "https://news.ycombinator.com/item?id=38280492",
    661         "created_at": "2023-11-15T18:36:51Z"
    662       },
    663       {
    664         "hn_id": "37675002",
    665         "title": "Reproducing Failures in Fault Signatures",
    666         "points": 2,
    667         "comments": 0,
    668         "url": "https://news.ycombinator.com/item?id=37675002",
    669         "created_at": "2023-09-27T14:17:13Z"
    670       },
    671       {
    672         "hn_id": "47174839",
    673         "title": "Are Emergent Abilities of Large Language Models a Mirage? (2023)",
    674         "points": 1,
    675         "comments": 0,
    676         "url": "https://news.ycombinator.com/item?id=47174839",
    677         "created_at": "2026-02-27T01:00:02Z"
    678       },
    679       {
    680         "hn_id": "35659049",
    681         "title": "Finding Bug-Inducing Program Environments",
    682         "points": 1,
    683         "comments": 0,
    684         "url": "https://news.ycombinator.com/item?id=35659049",
    685         "created_at": "2023-04-21T19:48:54Z"
    686       },
    687       {
    688         "hn_id": "36955679",
    689         "title": "A LLM Assisted Exploitation of AI-Guardian",
    690         "points": 1,
    691         "comments": 1,
    692         "url": "https://news.ycombinator.com/item?id=36955679",
    693         "created_at": "2023-08-01T13:28:45Z"
    694       },
    695       {
    696         "hn_id": "36903968",
    697         "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    698         "points": 1,
    699         "comments": 0,
    700         "url": "https://news.ycombinator.com/item?id=36903968",
    701         "created_at": "2023-07-28T07:30:39Z"
    702       }
    703     ],
    704     "top_points": 154,
    705     "total_points": 170,
    706     "total_comments": 132
    707   }
    708 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs