scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (20087B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "FrontierMath: A Benchmark for Evaluating Advanced Mathematical Reasoning in AI",
      6     "authors": [
      7       "Elliott S. Glazer",
      8       "Ege Erdil",
      9       "T. Besiroglu",
     10       "Diego Chicharro",
     11       "Evan Chen"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2411.04872",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims are substantiated: hundreds of problems from 60+ mathematicians (Section 2), 70% of MSC2020 subjects covered (Section 3), <2% solve rate confirmed in Section 4.2, and contamination-prevention measures documented in Section 2.4.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "The paper does not make causal claims requiring controlled study design; it describes a benchmark and presents observational evaluation results.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Claims about model performance are bounded to the six specific models tested, and the paper explicitly cautions that 'the precise ordering of model performance should be interpreted with significant caution' due to low absolute success rates.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not systematically consider alternative explanations for the <2% solve rate—e.g., that the code-submission evaluation format, 10,000-token limits (exceeded in 45%+ of Claude/GPT-4o attempts), or prompting choices may have suppressed performance independent of mathematical ability.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes the measured proxy (correct numerical/computable answers) from the claimed capability (mathematical reasoning), acknowledging in Section 7 that 'the practical focus on automatically verifiable and numerical answers excludes proof-writing and open-ended exploration.'",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 contains multiple dedicated paragraphs on limitations including exclusion of proof-writing, the gap from real research timescales, low current solve rates limiting discrimination, and an approximately 10% problem error rate.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats are discussed: the ~10% estimated answer error rate (derived from 2/35 second-reviewed problems having errors), subjective and inconsistent difficulty ratings between reviewers, and the benchmark's inability to rank models meaningfully when solve rates are near zero.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The paper explicitly states the benchmark 'excludes proof-writing and open-ended exploration' and that problems 'fall short of typical mathematical research, which often spans weeks, months or even years of sustained investigation.'",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The acknowledgments mention 'OpenAI for their support in creating the benchmark' but there is no formal funding disclosure or funding statement in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All author affiliations are listed in the header (Epoch AI, MIT, UC Berkeley, Harvard, Cornell, etc.).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "OpenAI is acknowledged as supporting the benchmark's creation, while three OpenAI models (o1-preview, o1-mini, GPT-4o) are directly evaluated and ranked on it—a clear funder-outcome conflict.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests appears anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "The central claimed capability—'advanced mathematical reasoning'—is never formally defined; the paper characterizes it only through examples and expert opinions rather than a precise definition of what cognitive or computational skills constitute 'mathematical reasoning.'",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The contribution is unambiguously stated: a benchmark of hundreds of original, expert-crafted, automatically verifiable mathematics problems targeting research-level difficulty to evaluate frontier AI models.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 5 systematically reviews and distinguishes FrontierMath from MATH, GSM8K, ARB, GHOSTS, OlympiadBench, OmniMATH, Putnam-AXIOM, AIMO, and MiniF2F, explaining how FrontierMath addresses their limitations.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues that problems requiring 'deep theoretical understanding, creative insight, and specialized expertise' from research mathematicians measure genuine mathematical reasoning, and this is qualitatively validated by three Fields Medalists confirming problem difficulty.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "A three-dimensional difficulty system is described (Background 1–5, Creativity in hours, Execution in hours), and Section A presents five sample problems, one from each difficulty quintile, with their ratings.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "The paper explicitly shows a <2% solve rate (no ceiling effect), and Figure 2 compares FrontierMath to near-saturated benchmarks (MATH, GSM8K, MMLU) to motivate why a harder benchmark is needed.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper collects only qualitative expert opinions from four mathematicians; no quantitative human solve rate or completion time distribution is measured or reported.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "The binary correct/incorrect scoring via automated verification is justified as eliminating human bias and enabling scalable evaluation; edge cases (non-unique solutions, symbolic outputs) are addressed with SymPy and custom verification scripts.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Contamination resistance is a primary design goal: all problems are new and unpublished, submitted via encrypted channels, reviewed for originality against known problems, and checked with plagiarism detection tools.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Section 6 quotes Tao estimating the benchmark 'will resist AIs for several years at least,' and Section 8 commits to ongoing addition of rigorously vetted new problems to maintain relevance.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The paper discusses failure modes including guessing susceptibility (addressed via large nonobvious answers), the ~10% problem error rate (Section 2.3), shortcut strategies that undermine difficulty estimates, and the format's exclusion of proof-based problems.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "The full benchmark is not publicly accessible (requires contacting Epoch AI), so the reported evaluation numbers cannot be independently reproduced; only 5 sample problems with solutions are publicly released.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "While the collection methodology is described in detail, there is no formal data card, and the dataset itself is not publicly available, making complete documentation inaccessible to the research community.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "Access is described only as 'reach out to math_evals@epochai.org' with no stated terms, license, or restrictions on use—others cannot determine under what conditions they can use the benchmark.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 7 explicitly states what the benchmark does and does not measure (no proof-writing, no long-horizon research skills), and the Discussion clarifies that <2% solve rates limit model discrimination, guiding appropriate interpretation.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "Current state-of-the-art AI models solve under 2% of FrontierMath problems",
    203       "evidence": "Evaluation of six frontier models (o1-preview, o1-mini, GPT-4o, Claude 3.5 Sonnet, Grok 2 Beta, Gemini 1.5 Pro 002) with 8 runs each; best model achieved ~2% mean accuracy (Figure 6)",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "FrontierMath problems require multiple hours of effort from expert research mathematicians",
    208       "evidence": "Difficulty ratings assign Creativity and Execution in hours; Fields Medalists (Tao, Gowers, Borcherds) unanimously characterized problems as 'exceptionally challenging' requiring 'deep domain expertise'",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "Existing mathematics benchmarks (MATH, GSM8K, MMLU) are near saturation with current AI models",
    213       "evidence": "Figure 2 shows >94% solve rates for top models on MATH, >96% on GSM8K, >98% on MMLU College Math subset",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "FrontierMath effectively prevents data contamination through novel, unpublished problems",
    218       "evidence": "Problems created specifically for the benchmark, handled via encrypted channels, checked with Quetext and Copyscape with no significant matches found",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "The benchmark has an estimated ~10% critical error rate in problem correctness",
    223       "evidence": "Second reviews of 35 problems found 2 with incorrect answers; Jeffreys prior yields posterior error rate of ~6.9%, rounded up to ~10% accounting for undetected errors",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "o1-preview demonstrates strongest performance on problems solved at least once",
    228       "evidence": "Repeated trial results (Table 2) show o1-preview solved one problem 5/5 times and outperformed other models on 3 of 4 repeated-trial problems",
    229       "supported": "moderate"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval"
    234   ],
    235   "key_findings": "FrontierMath introduces hundreds of original, research-level mathematics problems where no current frontier AI model achieves more than 2% accuracy, revealing a substantial gap between AI capabilities and expert mathematical ability. The benchmark uses automated verification via code execution to enable scalable, reproducible evaluation while covering 70% of MSC2020 top-level subjects. Three Fields Medalists (Tao, Gowers, Borcherds) qualitatively confirmed the problems as exceptionally difficult, requiring deep domain expertise. The authors acknowledge an estimated ~10% problem error rate, the lack of a quantitative human baseline, and that the benchmark's access-controlled nature limits independent reproducibility.",
    236   "red_flags": [
    237     {
    238       "flag": "Funder's own models are evaluated",
    239       "detail": "OpenAI is acknowledged as supporting benchmark creation, yet three OpenAI models are evaluated and ranked on the benchmark, creating a direct conflict of interest that is not disclosed or addressed."
    240     },
    241     {
    242       "flag": "No quantitative human baseline",
    243       "detail": "Human performance is characterized only qualitatively through four expert interviews; no systematic human solve rate, completion time distribution, or formal human evaluation is reported."
    244     },
    245     {
    246       "flag": "~10% estimated error rate",
    247       "detail": "Only 35 of hundreds of problems received second review; 2/35 had incorrect answers, yielding an estimated ~10% critical error rate—higher than the authors' own comparisons to ImageNet (6%) and MMLU (9%)."
    248     },
    249     {
    250       "flag": "Benchmark not publicly available",
    251       "detail": "The full problem set requires contacting Epoch AI; no license terms are stated. Reported evaluation numbers cannot be independently reproduced or verified."
    252     },
    253     {
    254       "flag": "Inconsistent difficulty ratings",
    255       "detail": "Section 2.5 explicitly acknowledges that first and second reviewers 'rarely matched and often showed substantial differences' in difficulty ratings, undermining the reliability of the difficulty characterization."
    256     },
    257     {
    258       "flag": "Evan Chen is both co-author and interviewee",
    259       "detail": "Section 6 presents interviews with mathematicians validating difficulty, but Evan Chen—listed as a core contributor and problem author—is one of four interviewees, creating a circularity in the validation."
    260     },
    261     {
    262       "flag": "Alternative evaluation explanations unexplored",
    263       "detail": "The <2% solve rate is attributed to mathematical difficulty, but token limits were exceeded in 45%+ of attempts by three models, suggesting evaluation framework constraints may partially explain poor performance."
    264     }
    265   ],
    266   "cited_papers": [
    267     {
    268       "title": "Measuring Mathematical Problem Solving with the MATH Dataset",
    269       "relevance": "Foundational math benchmark that FrontierMath directly supersedes; demonstrates saturation at near-perfect accuracy by frontier models"
    270     },
    271     {
    272       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    273       "relevance": "Elementary-level math benchmark used as comparison point for benchmark saturation analysis"
    274     },
    275     {
    276       "title": "ARB: Advanced Reasoning Benchmark for Large Language Models",
    277       "relevance": "Prior university/contest-level benchmark that FrontierMath aims to exceed in difficulty"
    278     },
    279     {
    280       "title": "Omni-MATH: A Universal Olympiad Level Mathematic Benchmark",
    281       "relevance": "Direct competitor benchmark at olympiad level; used in difficulty comparison figures"
    282     },
    283     {
    284       "title": "Putnam-AXIOM: A Functional and Static Benchmark for Measuring Higher Level Mathematical Reasoning",
    285       "relevance": "Related approach using programmatic variant generation for contamination resistance; demonstrates 50%→34% drop from contamination effects"
    286     },
    287     {
    288       "title": "MiniF2F: A Cross-System Benchmark for Formal Olympiad-Level Mathematics",
    289       "relevance": "Formal theorem-proving benchmark; contrasted with FrontierMath's informal numerical approach"
    290     },
    291     {
    292       "title": "Mathematical Capabilities of ChatGPT (GHOSTS dataset)",
    293       "relevance": "Graduate-level mathematical capabilities evaluation; related prior work on hard math benchmarks"
    294     },
    295     {
    296       "title": "Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks",
    297       "relevance": "Used to contextualize FrontierMath's ~10% error rate estimate against ImageNet's >6% label error rate"
    298     },
    299     {
    300       "title": "Investigating Data Contamination in Modern Benchmarks for Large Language Models",
    301       "relevance": "Motivates FrontierMath's contamination-prevention design by documenting prevalence of contamination in existing benchmarks"
    302     }
    303   ],
    304   "engagement_factors": {
    305     "practical_relevance": {
    306       "score": 2,
    307       "justification": "Directly useful to AI labs and researchers needing a non-saturated math evaluation, but access is gated through contact with Epoch AI."
    308     },
    309     "surprise_contrarian": {
    310       "score": 2,
    311       "justification": "The <2% solve rate for frontier models including o1-preview is striking given recent AI progress on competition math, and challenges optimistic narratives about AI mathematical capability."
    312     },
    313     "fear_safety": {
    314       "score": 1,
    315       "justification": "Frames advanced mathematical reasoning as a key capability milestone for AI, with implications for AI's potential contributions to scientific research, but does not raise direct safety concerns."
    316     },
    317     "drama_conflict": {
    318       "score": 2,
    319       "justification": "Features quotes from three Fields Medalists (Tao, Gowers, Borcherds); the OpenAI funding/evaluation conflict adds tension, and Tao's dual role as benchmark contributor and expert evaluator adds color."
    320     },
    321     "demo_ability": {
    322       "score": 1,
    323       "justification": "Five sample problems with solutions are publicly released and transcripts are downloadable, but the full benchmark requires institutional access."
    324     },
    325     "brand_recognition": {
    326       "score": 3,
    327       "justification": "Epoch AI is a known AI forecasting organization; OpenAI's support is acknowledged; Fields Medalists Terence Tao, Timothy Gowers, and Richard Borcherds are quoted; all major frontier model families are evaluated."
    328     }
    329   },
    330   "hn_data": {
    331     "threads": [
    332       {
    333         "hn_id": "45873709",
    334         "title": "The Drain of Scientific Publishing",
    335         "points": 1,
    336         "comments": 0,
    337         "url": "https://news.ycombinator.com/item?id=45873709",
    338         "created_at": "2025-11-10T08:21:43Z"
    339       },
    340       {
    341         "hn_id": "39032813",
    342         "title": "Adapting Standard Retrieval Benchmarks to Evaluate Generated Answers",
    343         "points": 1,
    344         "comments": 0,
    345         "url": "https://news.ycombinator.com/item?id=39032813",
    346         "created_at": "2024-01-17T20:13:37Z"
    347       },
    348       {
    349         "hn_id": "25074836",
    350         "title": "Principles of Quantum Communication Theory: A Modern Approach",
    351         "points": 1,
    352         "comments": 0,
    353         "url": "https://news.ycombinator.com/item?id=25074836",
    354         "created_at": "2020-11-12T20:53:00Z"
    355       }
    356     ],
    357     "top_points": 1,
    358     "total_points": 3,
    359     "total_comments": 0
    360   }
    361 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs