scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (21971B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "FrontierMath: A Benchmark for Evaluating Advanced Mathematical Reasoning in AI",
      6     "authors": [
      7       "Elliott S. Glazer",
      8       "Ege Erdil",
      9       "T. Besiroglu",
     10       "Diego Chicharro",
     11       "Evan Chen"
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2411.04872",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract claims 'current state-of-the-art AI models solve under 2% of problems' — supported by Figure 6 and Section 4.2.1. 'Exceptionally challenging mathematics problems crafted and vetted by expert mathematicians' — supported by Section 2, Section 6 (interviews). 'Cover most major branches of modern mathematics' — supported by Table 1 and Figure 4.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "The paper makes descriptive claims about benchmark difficulty and model performance levels, not causal claims. It does not claim that any intervention causes improved performance.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title 'Evaluating Advanced Mathematical Reasoning in AI' and abstract phrase 'revealing a vast gap between AI capabilities and the prowess of the mathematical community' generalize from 6 specific models to 'AI' broadly. While the tested models were state-of-the-art at the time, the claim extends beyond the tested set without explicit bounding.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "Section 7 discusses several alternative factors: the numerical answer format excludes proof problems, the hours-scale difficulty doesn't capture months-long research, token limits constrain model performance. Section 4.2.1 notes that correct answers don't imply correct reasoning and that guessing strategies sometimes work.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly distinguishes between what it measures (ability to produce correct numerical answers) and broader mathematical reasoning, acknowledging in Section 7 that 'the practical focus on automatically verifiable and numerical answers excludes proof-writing and open-ended exploration, which are significant parts of modern math research.' Borcherds notes the problems 'aren't quite the same as coming up with original proofs' (Section 6).",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Section 7 (Discussion) contains substantive discussion of limitations: numerical answer format excludes proofs, difficulty limited to hours not months, models too weak for fine-grained comparison, and the practical constraints of automated verification.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Section 2.3 quantifies a specific error rate (~10% estimated) with detailed analysis of 35 second-reviewed problems. Section 7 discusses specific threats: exclusion of proof problems, time-scale limitations, and that the <2% success rate 'temporarily limits FrontierMath's usefulness in evaluating relative performance of models.'",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Section 7 explicitly states: 'we cannot include problems that require mathematical proofs or formal reasoning steps,' 'they still fall short of typical mathematical research, which often spans weeks, months or even years,' and that current model weakness limits relative performance evaluation. These are specific statements about what FrontierMath does NOT test.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The paper acknowledges 'OpenAI for their support in creating the benchmark' and notes one position 'supported by SwissMAP' (ETH Zurich). Various individuals are thanked in the acknowledgments.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed (Epoch AI, various universities). The paper notes that Evan Chen is both a co-author and interviewee (Section 6), and that Terence Tao contributed problems and is interviewed.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "OpenAI supported the benchmark creation and OpenAI's models (o1-preview, o1-mini, GPT-4o) are among those evaluated. OpenAI has a commercial interest in AI evaluation and the perception of their models' capabilities. The funder is not independent of the outcome.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests statement is present in the paper. Epoch AI is a research organization that may have interests in AI evaluation as a field, but no formal declaration is made.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: difficulty dimensions (background, creativity, execution) are precisely defined with scales in Section 2.5 and Appendix C; 'automated verification' and 'guessproofness' are defined operationally in Section 2.2; 'data contamination' is explained in the introduction.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly states it contributes a benchmark of hundreds of original research-level mathematics problems with automated verification, addressing contamination and saturation limitations of existing benchmarks (Introduction and Section 5).",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 5 (Related Work) systematically reviews GSM8K, MATH, ARB, GHOSTS, OlympiadBench, PutnamBench, OmniMATH, Putnam-AXIOM, AIMO, and MiniF2F, explaining how each differs from and is addressed by FrontierMath.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues that requiring hours of expert mathematician effort, covering research-level MSC2020 topics, and demanding deep theoretical understanding and creative insight together measure 'advanced mathematical reasoning' beyond competition-level pattern matching (Introduction and Section 2).",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Difficulty is characterized along three rated dimensions (background 1-5, creativity in hours, execution in hours), five sample problems span the difficulty quintiles (Appendix A), and Table 1 shows MSC distribution; however, the full distribution histogram is not shown and reviewer agreement on ratings was poor.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": true,
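    136           "justification": "The paper explicitly documents that all evaluated models score below 2% (no ceiling effect) and Figure 2 compares against saturated benchmarks; the design explicitly targets ceiling (saturation) avoidance through expert-level difficulty calibration. The resulting floor effect is acknowledged: Section 7 notes that the <2% success rate temporarily limits the benchmark's usefulness for comparing models.",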
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No systematic human solve rate is reported; qualitative interviews with Fields Medalists characterize problems as 'exceptionally challenging' but no quantitative human performance baseline (e.g., % solved by expert mathematicians within time limit) is provided.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Binary correct/incorrect scoring via automated verification is justified in Section 2.2 as enabling 'rapid, objective evaluation' while 'eliminating potential human bias'; edge cases (non-unique solutions, SymPy objects) are explicitly addressed with custom verification scripts.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Contamination resistance is a primary design goal: all problems are new and unpublished, submission used encrypted channels, plagiarism detection tools (Quetext, Copyscape) were applied to all problem statements, and expert review checked against existing literature (Section 2.4).",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Section 8 commits to ongoing development ('introducing new, rigorously vetted problems') and Section 7 acknowledges that the benchmark's utility will grow as AI improves, though no formal update schedule or obsolescence timeline is specified.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "The paper discusses benchmark failure modes including: exclusion of proof-writing, ~10% estimated problem error rate, guessproofness violations found during review, possible unexpected model shortcuts, and inconsistent difficulty ratings between reviewers (Sections 2.3, 2.5, 7).",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "GitHub links for sample problem solution code are provided, and evaluation prompts are included in Appendix B.1, but the full benchmark is not publicly available (access requires emailing math_evals@epochai.org), so reported numbers cannot be independently reproduced.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "The paper documents collection methodology, metadata schema (Table 3), review process, and MSC classification, but there is no formal data card, and the dataset itself is not publicly released — only 5 of hundreds of problems are available as samples.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "Access requires emailing math_evals@epochai.org with no stated terms, timelines, or license; Section 8 describes FrontierMath as a 'public resource' but provides no license specification or usage terms.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Section 7 specifies FrontierMath is intended to evaluate progress toward research-level mathematical capability, and explicitly states what it does NOT cover (proof-writing, long-horizon research); the interview section (Section 6) further contextualizes appropriate interpretation of results.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "Current state-of-the-art AI models solve under 2% of FrontierMath problems",
    203       "evidence": "Six leading models (o1-preview, o1-mini, GPT-4o, Claude 3.5 Sonnet, Grok 2 Beta, Gemini 1.5 Pro) evaluated on the full benchmark; Figure 6 shows mean accuracy below 2% for all models across 8 runs",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "FrontierMath is resistant to data contamination because all problems are new and previously unpublished",
    208       "evidence": "Section 2.4 describes encrypted submission channels, plagiarism detection (Quetext, Copyscape showed no significant matches), and expert originality review; problems were designed to be novel adaptations requiring genuine insight",
    209       "supported": "moderate"
    210     },
    211     {
    212       "claim": "The problems require multiple hours of effort from expert mathematicians in the relevant branch",
    213       "evidence": "Qualitative interviews with three Fields Medalists (Tao, Gowers, Borcherds) in Section 6 confirm problems are 'exceptionally challenging'; sample problems in Appendix A show creativity ratings of 2-4 hours and execution ratings of 2-15 hours",
    214       "supported": "moderate"
    215     },
    216     {
    217       "claim": "FrontierMath covers most major branches of modern mathematics (70% of MSC2020 top-level subjects)",
    218       "evidence": "Table 1 shows distribution across 24+ MSC2020 classification codes; Section 3 states coverage of 70% of top-level MSC2020 subjects excluding general/history/education categories",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "Harder problems (by author difficulty rating) correlate with lower AI solve rates",
    223       "evidence": "Section 2.5 footnote acknowledges this as 'unpublished preliminary work' with GPT-4o and explicitly states 'more systematic validation would be needed to make strong claims'",
    224       "supported": "weak"
    225     },
    226     {
    227       "claim": "The benchmark has approximately 10% critical error rate in problem correctness",
    228       "evidence": "Section 2.3: 2/35 second-reviewed problems had incorrect answers from authors undetected in first review; Jeffreys prior yields posterior estimate of ~6.9%, rounded to ~10% accounting for undetected errors",
    229       "supported": "moderate"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval",
    234     "qualitative"
    235   ],
    236   "key_findings": "FrontierMath introduces hundreds of original research-level mathematics problems covering 70% of MSC2020 branches, where all current SOTA AI models (including o1-preview) fail on over 98% of problems. The benchmark uses automated verification of numerical/symbolic answers and new unpublished problems to address contamination and saturation. Expert mathematicians including three Fields Medalists confirmed the problems as exceptionally difficult, requiring hours to days from domain experts. The paper estimates ~10% problem error rate and documents significant inter-rater disagreement on difficulty ratings, revealing quality assurance limitations in large-scale expert-crafted benchmarks.",
    237   "red_flags": [
    238     {
    239       "flag": "Funder evaluated on benchmark",
    240       "detail": "OpenAI is acknowledged for 'support in creating the benchmark' yet three of its models (o1-preview, o1-mini, GPT-4o) are among the six models directly evaluated; this relationship is not flagged as a conflict of interest."
    241     },
    242     {
    243       "flag": "No human performance baseline",
    244       "detail": "No quantitative human solve rate is reported; all difficulty characterization is qualitative (expert interviews) without systematic measurement of human performance under the same evaluation framework."
    245     },
    246     {
    247       "flag": "Benchmark not publicly accessible",
    248       "detail": "The full benchmark requires emailing math_evals@epochai.org with no stated terms or license; only 5 sample problems are public, preventing independent verification of reported evaluation results."
    249     },
    250     {
    251       "flag": "10% estimated error rate",
    252       "detail": "Only 35 problems received second review; 2 had incorrect answers from authors, yielding an estimated ~10% critical error rate across the full benchmark, which is acknowledged but not remediated before publication."
    253     },
    254     {
    255       "flag": "Poor difficulty rating reliability",
    256       "detail": "Section 2.3 reports that difficulty ratings 'rarely matched and often showed substantial differences' between first and second reviewers, undermining the difficulty characterization presented as a benchmark feature."
    257     },
    258     {
    259       "flag": "Evan Chen dual role",
    260       "detail": "Evan Chen is both a co-author of the paper and one of the four expert mathematicians interviewed in Section 6 to validate the benchmark's difficulty — his interview is not independent validation."
    261     }
    262   ],
    263   "cited_papers": [
    264     {
    265       "title": "Measuring mathematical problem solving with the MATH dataset",
    266       "relevance": "Primary benchmark being compared against; FrontierMath explicitly targets problems beyond MATH's difficulty level"
    267     },
    268     {
    269       "title": "Training verifiers to solve math word problems (GSM8K)",
    270       "relevance": "Foundational math benchmark shown to be near-saturated by SOTA models, motivating FrontierMath"
    271     },
    272     {
    273       "title": "Omni-MATH: A Universal Olympiad Level Mathematic Benchmark For Large Language Models",
    274       "relevance": "Most recent high-difficulty comparison benchmark; used for saturation comparison in Figure 2"
    275     },
    276     {
    277       "title": "Benchmark Data Contamination of Large Language Models: A Survey",
    278       "relevance": "Motivates contamination-resistant design; cited as evidence that contamination inflates performance metrics"
    279     },
    280     {
    281       "title": "Putnam-AXIOM: A Functional and Static Benchmark for Measuring Higher Level Mathematical Reasoning",
    282       "relevance": "Direct predecessor attempting contamination resistance via problem variants; FrontierMath argues this approach provides limited novelty"
    283     },
    284     {
    285       "title": "Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks",
    286       "relevance": "Used to contextualize FrontierMath's ~10% estimated error rate against known benchmark error rates (ImageNet >6%, MMLU >9%)"
    287     },
    288     {
    289       "title": "Arb: Advanced reasoning benchmark for large language models",
    290       "relevance": "University/graduate-level math benchmark that FrontierMath claims to supersede in difficulty"
    291     },
    292     {
    293       "title": "Mathematical capabilities of ChatGPT (GHOSTS dataset)",
    294       "relevance": "Graduate-level evaluation with human-expert grading; represents alternative evaluation methodology to FrontierMath's automated verification"
    295     },
    296     {
    297       "title": "Solving olympiad geometry without human demonstrations (AlphaGeometry)",
    298       "relevance": "Cited as evidence of recent AI mathematical advances motivating need for harder benchmarks"
    299     },
    300     {
    301       "title": "Mathematical discoveries from program search with large language models (FunSearch)",
    302       "relevance": "Cited as evidence of AI improving on research-level combinatorics results, motivating research-level benchmark"
    303     }
    304   ],
    305   "engagement_factors": {
    306     "practical_relevance": {
    307       "score": 2,
    308       "justification": "AI developers can use FrontierMath to benchmark mathematical reasoning, but access is gated (email required) and problems aren't freely usable by practitioners."
    309     },
    310     "surprise_contrarian": {
    311       "score": 3,
    312       "justification": "The finding that o1-preview — widely perceived as near-human at math — solves under 2% of problems directly challenges common narratives about AI mathematical capability reaching expert level."
    313     },
    314     "fear_safety": {
    315       "score": 1,
    316       "justification": "Marginally relevant as a capability measurement tool; mathematician interviews speculate about AI eventually surpassing humans in mathematics, but safety framing is absent."
    317     },
    318     "drama_conflict": {
    319       "score": 2,
    320       "justification": "OpenAI acknowledged as supporter while their flagship model performs worst among reasoning models; Fields Medalists publicly assessed AI math capability as far below expert level."
    321     },
    322     "demo_ability": {
    323       "score": 2,
    324       "justification": "Five sample problems with full solutions are publicly available, and model transcripts on sample problems are downloadable, enabling partial hands-on engagement."
    325     },
    326     "brand_recognition": {
    327       "score": 3,
    328       "justification": "Features Terence Tao (Fields Medal), Timothy Gowers (Fields Medal), Richard Borcherds (Fields Medal), Epoch AI, and evaluation of OpenAI/Anthropic/Google flagship models — extremely high brand density."
    329     }
    330   },
    331   "hn_data": {
    332     "threads": [
    333       {
    334         "hn_id": "45873709",
    335         "title": "The Drain of Scientific Publishing",
    336         "points": 1,
    337         "comments": 0,
    338         "url": "https://news.ycombinator.com/item?id=45873709",
    339         "created_at": "2025-11-10T08:21:43Z"
    340       },
    341       {
    342         "hn_id": "39032813",
    343         "title": "Adapting Standard Retrieval Benchmarks to Evaluate Generated Answers",
    344         "points": 1,
    345         "comments": 0,
    346         "url": "https://news.ycombinator.com/item?id=39032813",
    347         "created_at": "2024-01-17T20:13:37Z"
    348       },
    349       {
    350         "hn_id": "25074836",
    351         "title": "Principles of Quantum Communication Theory: A Modern Approach",
    352         "points": 1,
    353         "comments": 0,
    354         "url": "https://news.ycombinator.com/item?id=25074836",
    355         "created_at": "2020-11-12T20:53:00Z"
    356       }
    357     ],
    358     "top_points": 1,
    359     "total_points": 3,
    360     "total_comments": 0
    361   }
    362 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs