ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32026B)


      1 {
      2   "paper": {
      3     "title": "FrontierMath: A Benchmark for Evaluating Advanced Mathematical Reasoning in AI",
      4     "authors": [
      5       "Elliot Glazer",
      6       "Ege Erdil",
      7       "Tamay Besiroglu",
      8       "Diego Chicharro",
      9       "Evan Chen",
     10       "Alex Gunning",
     11       "Caroline Falkman Olsson",
     12       "Jean-Stanislas Denain",
     13       "Anson Ho",
     14       "Emily de Oliveira Santos",
     15       "Olli Järviniemi",
     16       "Matthew Barnett",
     17       "Robert Sandler",
     18       "Matej Vrzala",
     19       "Jaime Sevilla"
     20     ],
     21     "year": 2024,
     22     "venue": "arXiv",
     23     "arxiv_id": "2411.04872"
     24   },
     25   "scan_version": 2,
     26   "active_modules": ["experimental_rigor", "data_leakage"],
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The benchmark and evaluation framework are deliberately kept private to prevent data contamination. GitHub links are provided only for helper code for two sample problems (artin_code, finite_field_implementation), not for the benchmark or evaluation system itself."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The benchmark problems are deliberately withheld to prevent contamination. Only 5 sample problems are publicly released (Appendix A). The paper states teams should 'reach out to math_evals@epochai.org' for evaluation access."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The evaluation uses Python with pickle and SymPy for verification (Section 2.2, Figure 3), but no requirements.txt, Dockerfile, or detailed dependency listing is provided. The exact Python version and library versions are not specified."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No step-by-step reproduction instructions are provided. The benchmark is private, so external researchers cannot reproduce the evaluation without contacting the authors."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Figure 6 shows 'mean accuracy across 8 runs' but no confidence intervals, error bars, or ± values are reported. Section 4.2.1 notes that 'the precise ordering of model performance should be interpreted with significant caution' but provides no quantified uncertainty."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No statistical significance tests are used. Model comparisons are made by comparing raw performance numbers without any hypothesis testing. The paper acknowledges ranking instability but does not test whether observed differences are statistically significant."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper reports absolute performance numbers (<2% solve rate) and compares across benchmarks (Figure 2) but does not report formal effect sizes (Cohen's d, odds ratios, etc.). The comparisons are descriptive rather than quantified with effect size measures."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The benchmark contains 'hundreds' of problems but no justification is given for this number. The repeated trials use 5 runs per model per problem on 4 selected problems, and 8 runs for the full benchmark, with no power analysis or formal justification for these sample sizes."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "While 8 runs were conducted (Figure 6) and 5 repeated trials on 4 problems (Table 2), no standard deviations, IQR, or other spread measures are reported. Table 2 shows success rates as percentages but without variance across the 5 trials. The paper mentions 'high variability across runs' qualitatively without quantifying it."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The paper compares FrontierMath performance against multiple existing benchmarks (GSM8K, MATH, AIME, Omni-MATH, MathVista, MMLU College Math) in Figure 2, and evaluates 6 different models against each other in Figure 6 and Table 2."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Models evaluated include o1-preview, o1-mini, GPT-4o (2024-08-06), Claude 3.5 Sonnet (2024-10-22), Grok 2 Beta, and Gemini 1.5 Pro 002 — all state-of-the-art at the time of evaluation (Section 4.2.1)."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No ablation study is performed. The evaluation framework has key design choices (10,000 token limit, code execution capability, prompt format) that are not systematically varied. Section 8 mentions plans to 'test the effects of increasing the token limit' as future work."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "The paper reports pass@1 accuracy (Figure 6), pass@8 accuracy (Figure 9, Appendix B.4), number of responses per problem, token usage per model, and per-problem success rates across repeated trials (Table 2, Section 4.2.2)."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Model evaluation is entirely automated via verification scripts (Section 2.2). The mathematician interviews (Section 6) evaluate the benchmark's difficulty, not the models' outputs. The paper notes 'even when a model obtained the correct answer, this does not mean that its reasoning was correct' but does not systematically evaluate reasoning quality via human review."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The entire FrontierMath benchmark consists of novel, unpublished problems (Section 2.4) that were not used for any development or tuning purpose. The benchmark is the held-out test set by design."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Despite rich MSC2020 classification of problems (Table 1, Figure 4) and difficulty ratings across three dimensions (Section 2.5), no per-category or per-difficulty-level performance breakdown is provided. The <2% solve rate is reported only as an aggregate."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 4.2.1 discusses that models often hit token limits, that o1-preview and Gemini 'typically submit a final answer before seeing any experimental results' despite being told to experiment, and that a correct answer was obtained through guessing rather than mathematical understanding. Section 4.2.2 analyzes token usage and response patterns."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The core finding is a negative result — models solve <2% of problems. The paper reports that models fail to use the code execution framework effectively (Section 4.2.2), and that correct answers sometimes come from guessing rather than understanding (Section 4.2.1)."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract claims 'current state-of-the-art AI models solve under 2% of problems' — supported by Figure 6 and Section 4.2.1. 'Exceptionally challenging mathematics problems crafted and vetted by expert mathematicians' — supported by Section 2, Section 6 (interviews). 'Cover most major branches of modern mathematics' — supported by Table 1 and Figure 4."
    129       },
    130       "causal_claims_justified": {
    131         "applies": false,
    132         "answer": false,
    133         "justification": "The paper makes descriptive claims about benchmark difficulty and model performance levels, not causal claims. It does not claim that any intervention causes improved performance."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title 'Evaluating Advanced Mathematical Reasoning in AI' and abstract phrase 'revealing a vast gap between AI capabilities and the prowess of the mathematical community' generalize from 6 specific models to 'AI' broadly. While the tested models were state-of-the-art at the time, the claim extends beyond the tested set without explicit bounding."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Section 7 discusses several alternative factors: the numerical answer format excludes proof problems, the hours-scale difficulty doesn't capture months-long research, token limits constrain model performance. Section 4.2.1 notes that correct answers don't imply correct reasoning and that guessing strategies sometimes work."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper explicitly distinguishes between what it measures (ability to produce correct numerical answers) and broader mathematical reasoning, acknowledging in Section 7 that 'the practical focus on automatically verifiable and numerical answers excludes proof-writing and open-ended exploration, which are significant parts of modern math research.' Borcherds notes the problems 'aren't quite the same as coming up with original proofs' (Section 6)."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 4.2.1 specifies: 'GPT-4o (2024-08-06 version)', 'Claude 3.5 Sonnet (2024-10-22 version)', 'Gemini 1.5 Pro 002', 'o1-preview', 'o1-mini', and 'Grok 2 Beta' with citations to the specific model releases."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix B.1 provides the complete text of the initial prompt, continuation prompt, and final prompt used in evaluation. These are full verbatim prompts, not summaries."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "The token limit is stated as 10,000 (Section 4.1), but temperature, top-p, and other sampling parameters for the evaluated models are not reported."
    166       },
    167       "scaffolding_described": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 4.1 and Figure 5 describe the evaluation framework in detail: models can write Python code, receive execution results, iterate multiple times, and submit final answers via pickle format. The interaction loop, feedback mechanism, and answer submission protocol are clearly documented."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "The data collection pipeline is documented in Sections 2.1-2.5: problem creation guidelines (originality, verifiability, guessproofness, computational tractability), peer review process, originality checks (plagiarism detection), difficulty rating system, and validation procedures."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 7 (Discussion) contains substantive discussion of limitations: numerical answer format excludes proofs, difficulty limited to hours not months, models too weak for fine-grained comparison, and the practical constraints of automated verification."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 2.3 quantifies a specific error rate (~10% estimated) with detailed analysis of 35 second-reviewed problems. Section 7 discusses specific threats: exclusion of proof problems, time-scale limitations, and that the <2% success rate 'temporarily limits FrontierMath's usefulness in evaluating relative performance of models.'"
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "Section 7 explicitly states: 'we cannot include problems that require mathematical proofs or formal reasoning steps,' 'they still fall short of typical mathematical research, which often spans weeks, months or even years,' and that current model weakness limits relative performance evaluation. These are specific statements about what FrontierMath does NOT test."
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The benchmark problems are deliberately kept private to prevent data contamination. Only 5 sample problems are released publicly (Appendix A). Teams must contact the authors for evaluation access."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Section 2.1 describes the collection process in detail: collaboration with 60+ mathematicians, guidelines for problem creation, four requirements (originality, verifiability, guessproofness, computational tractability), submission metadata, and peer review workflow."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "The paper describes who the mathematicians are (60+ from universities in 12+ countries, 14 IMO gold medals, graduate students to faculty) but does not describe how they were recruited — through what channels, invitation process, or whether the recruitment method could bias the benchmark toward certain mathematical subfields."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The pipeline is documented: problem creation with guidelines → metadata submission → blind peer review → revision of issues → second review for subset (35 problems) → originality/plagiarism checks. Section 2.3 details types of issues found and error rate estimation."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "The paper acknowledges 'OpenAI for their support in creating the benchmark' and notes one position 'supported by SwissMAP' (ETH Zurich). Various individuals are thanked in the acknowledgments."
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Author affiliations are listed (Epoch AI, various universities). The paper notes that Evan Chen is both a co-author and interviewee (Section 6), and that Terence Tao contributed problems and is interviewed."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "OpenAI supported the benchmark creation and OpenAI's models (o1-preview, o1-mini, GPT-4o) are among those evaluated. OpenAI has a commercial interest in AI evaluation and the perception of their models' capabilities. The funder is not independent of the outcome."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is present in the paper. Epoch AI is a research organization that may have interests in AI evaluation as a field, but no formal declaration is made."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "While the paper extensively discusses contamination prevention by using novel problems, it does not state the training data cutoff dates for any of the 6 evaluated models."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "This is the core innovation. Section 2.4 describes extensive measures: problems are exclusively new and unpublished, submitted through encrypted channels, checked with plagiarism detection tools (Quetext, Copyscape), and reviewed by expert mathematicians for originality."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": true,
    253         "justification": "Contamination prevention is a primary contribution. Section 2.4 details: novel unpublished problems, encrypted communication channels, plagiarism detection tools, expert review for originality. Section 1 frames contamination as a key motivation for the benchmark."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human subjects study is conducted. The mathematicians are problem contributors and interviewees, not experimental participants."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human subjects study is conducted. Mathematician collaborators and interviewees are not experimental participants."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human subjects study. Mathematician contributors are described by affiliation and expertise (Section 2.1) but this is collaborator description, not participant demographics."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human subjects study is conducted."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human subjects study is conducted."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human subjects study is conducted."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human subjects study is conducted."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Section 4.2.2 reports token usage (6,000-17,000 tokens per question) and percentage of questions hitting token limits, but does not report API costs in dollars, wall-clock evaluation time, or per-problem latency."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No total computational budget is stated. The paper does not report total API spend, GPU hours, or overall evaluation time for the benchmark runs."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Eight runs were conducted on the full benchmark (Figure 6) and 5 repeated trials on 4 problems (Table 2), showing variability exists. However, no standard deviations or quantified variance across runs is reported — only mean accuracy and per-problem success rates."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "Section 4.2.1 explicitly states '8 runs' for the full benchmark evaluation and 'five runs per model per problem' for the detailed repeated trials on 4 problems."
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The paper sets a 10,000 token limit and uses specific prompts but does not report whether these were tuned or what alternatives were considered. Section 8 mentions future plans to 'test the effects of increasing the token limit,' implying the current choice was not validated."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The 10,000 token limit and prompt design choices are not justified with systematic comparison. No explanation is given for why this specific configuration was chosen over alternatives."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": false,
    328         "answer": false,
    329         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors designed the benchmark and evaluate third-party models on it. The potential bias of benchmark creators evaluating on their own benchmark (e.g., problem selection favoring difficulty patterns that disadvantage certain model types) is not discussed."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "Section 4.2.2 notes substantial differences in token usage between models (6,000 to 17,000 tokens per question) but does not analyze whether these compute differences affect performance or present performance-compute tradeoff curves."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": true,
    344         "justification": "Section 6 features interviews with three Fields Medalists and an IMO coach who assess the benchmark's validity. Section 7 acknowledges the gap between numerical answers and mathematical research: 'we cannot include problems that require mathematical proofs.' Borcherds notes the problems 'aren't quite the same as coming up with original proofs.'"
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "All models use the same Python code execution framework, but Section 4.2.2 reveals very different interaction patterns (o1-preview: 1.29 responses, Grok: 3.81 responses; token limit hit rates vary from 16.8% to >45%). This suggests the scaffold may advantage certain models, but the confound is not explicitly addressed."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": true,
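    356         "justification": "The entire benchmark design addresses temporal leakage: all problems are 'exclusively new, previously unpublished' (Section 2.4) and were created specifically for this benchmark, so they should not appear in any model's training data regardless of cutoff date (the paper does not state the evaluated models' training cutoffs). This is the primary anti-contamination strategy."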
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether the evaluation setup could leak answer information. The prompt structure (Appendix B.1), MSC classification tags, or problem format conventions are not analyzed for potential information leakage."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of whether structural similarities between benchmark problems (e.g., shared mathematical techniques, similar problem formats) could allow models to transfer knowledge between problems within the benchmark."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": true,
    371         "justification": "Section 2.4 describes concrete methods: Quetext and Copyscape plagiarism detection tools were run on all problem statements, expert review verified originality against 'popular mathematics websites, online repositories, and academic publications,' and problems were handled through encrypted channels."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "Current state-of-the-art AI models solve under 2% of FrontierMath problems",
    378       "evidence": "Figure 6 and Section 4.2.1 show all 6 evaluated models (o1-preview, o1-mini, GPT-4o, Claude 3.5 Sonnet, Grok 2 Beta, Gemini 1.5 Pro 002) achieve less than 2% mean accuracy across 8 runs on the full benchmark.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "FrontierMath problems are exceptionally challenging, requiring hours from expert mathematicians",
    383       "evidence": "Section 6 reports interviews with Fields Medalists Tao, Gowers, and Borcherds, who 'unanimously characterized the problems as exceptionally challenging, requiring deep domain expertise and significant time investment.' Section 2.5 describes creativity and execution ratings in hours.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "The benchmark covers most major branches of modern mathematics",
    388       "evidence": "Table 1 shows distribution across 24+ MSC2020 classification codes. Figure 4 visualizes interconnections. The paper claims coverage of '70% of the top-level subjects in the MSC2020 classification.'",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "FrontierMath is resistant to data contamination",
    393       "evidence": "Section 2.4 describes originality measures (encrypted channels, plagiarism detection, expert review). However, the paper acknowledges 'in some cases, standard email clients were used when communicating about a subset of the problems' (footnote 6).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "The estimated benchmark error rate is approximately 10%",
    398       "evidence": "Section 2.3: 2 of 35 second-reviewed problems had incorrect answers, yielding a Bayesian posterior estimate of ~6.9% for the error rate. The paper estimates ~10% after accounting for possible undetected errors.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "FrontierMath maintains >98% unsolved rate compared to near-saturation of other benchmarks",
    403       "evidence": "Figure 2 compares FrontierMath (>98% unsolved) against GSM8K (~4% unsolved), MATH (~5% unsolved), AIME (~26% unsolved), and Omni-MATH (~40% unsolved).",
    404       "supported": "strong"
    405     }
    406   ],
    407   "methodology_tags": ["benchmark-eval", "qualitative"],
    408   "key_findings": "FrontierMath is a benchmark of hundreds of original, expert-crafted mathematics problems spanning most branches of modern mathematics, designed to resist data contamination through novel unpublished problems and automated verification. Current state-of-the-art AI models solve under 2% of problems, revealing a vast gap between AI capabilities and research-level mathematics. The authors estimate a ~10% error rate among the benchmark's problems, based on a second review of 35 problems. Fields Medalists interviewed characterized the problems as exceptionally challenging, requiring deep domain expertise beyond current AI capabilities.",
    409   "red_flags": [
    410     {
    411       "flag": "Funder conflict of interest",
    412       "detail": "OpenAI supported the benchmark creation ('We gratefully acknowledge OpenAI for their support') and three OpenAI models (o1-preview, o1-mini, GPT-4o) are among the six models evaluated. This conflict is not explicitly acknowledged."
    413     },
    414     {
    415       "flag": "Limited error rate validation",
    416       "detail": "Only 35 of hundreds of accepted problems received second-review scrutiny (Section 2.3). The ~10% estimated error rate is based on this small sample, and the paper acknowledges 'we must account for potential undetected errors that even the second review might have missed.'"
    417     },
    418     {
    419       "flag": "No statistical rigor in model comparisons",
    420       "detail": "With <2% solve rate on hundreds of problems, individual model successes have outsized impact on rankings. The paper acknowledges this but reports model comparisons without confidence intervals, significance tests, or variance measures despite running 8 trials."
    421     },
    422     {
    423       "flag": "No per-category performance breakdown",
    424       "detail": "Despite rich MSC2020 classification (Table 1, Figure 4) and three-dimensional difficulty ratings (Section 2.5), no performance breakdown by mathematical domain or difficulty level is provided. This hides whether the <2% is uniform or concentrated."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Training verifiers to solve math word problems",
    430       "authors": ["Karl Cobbe"],
    431       "year": 2021,
    432       "arxiv_id": "2110.14168",
    433       "relevance": "GSM8K benchmark — a foundational LLM math evaluation dataset now approaching saturation."
    434     },
    435     {
    436       "title": "Measuring Mathematical Problem Solving With the MATH Dataset",
    437       "authors": ["Dan Hendrycks", "Collin Burns", "Saurav Kadavath"],
    438       "year": 2021,
    439       "arxiv_id": "2103.03874",
    440       "relevance": "MATH dataset benchmark for mathematical reasoning, now near-saturated by frontier models."
    441     },
    442     {
    443       "title": "Investigating data contamination in modern benchmarks for large language models",
    444       "authors": ["Chunyuan Deng"],
    445       "year": 2023,
    446       "arxiv_id": "2311.09783",
    447       "relevance": "Directly addresses the benchmark contamination problem that FrontierMath aims to solve."
    448     },
    449     {
    450       "title": "Mathematical Capabilities of ChatGPT",
    451       "authors": ["Simon Frieder"],
    452       "year": 2024,
    453       "relevance": "Evaluates LLM mathematical capabilities through 709 graduate-level problems with human-expert evaluation."
    454     },
    455     {
    456       "title": "Omni-MATH: A Universal Olympiad Level Mathematic Benchmark For Large Language Models",
    457       "authors": ["Bofei Gao"],
    458       "year": 2024,
    459       "arxiv_id": "2410.07985",
    460       "relevance": "Olympiad-level math benchmark with 4,428 competition-sourced problems, representing the difficulty frontier before FrontierMath."
    461     },
    462     {
    463       "title": "Benchmark Data Contamination of Large Language Models: A Survey",
    464       "authors": ["Cheng Xu"],
    465       "year": 2024,
    466       "arxiv_id": "2406.04244",
    467       "relevance": "Survey of benchmark contamination in LLM evaluation, directly relevant to FrontierMath's anti-contamination design."
    468     },
    469     {
    470       "title": "Pervasive Label Errors in Test Sets Destabilize Machine Learning Benchmarks",
    471       "authors": ["Curtis G. Northcutt", "Anish Athalye", "Jonas Mueller"],
    472       "year": 2021,
    473       "arxiv_id": "2103.14749",
    474       "relevance": "Documents >6% label error rates in ML benchmarks including ImageNet, contextualizing FrontierMath's ~10% error estimate."
    475     },
    476     {
    477       "title": "Are We Done with MMLU?",
    478       "authors": ["Aryo Pradipta Gema"],
    479       "year": 2024,
    480       "arxiv_id": "2406.04127",
    481       "relevance": "Documents >9% error rate in MMLU benchmark, relevant to benchmark quality and evaluation reliability."
    482     },
    483     {
    484       "title": "Solving olympiad geometry without human demonstrations",
    485       "authors": ["Trieu H. Trinh"],
    486       "year": 2024,
    487       "relevance": "AlphaGeometry achieving olympiad-level geometry performance, representing the frontier of AI mathematical reasoning."
    488     },
    489     {
    490       "title": "Mathematical discoveries from program search with large language models",
    491       "authors": ["Bernardino Romera-Paredes"],
    492       "year": 2024,
    493       "relevance": "FunSearch demonstrating LLMs can make novel mathematical discoveries, relevant to AI's mathematical capability frontier."
    494     },
    495     {
    496       "title": "OlympiadBench: A Challenging Benchmark for Promoting AGI with Olympiad-Level Bilingual Multimodal Scientific Problems",
    497       "authors": ["Chaoqun He"],
    498       "year": 2024,
    499       "arxiv_id": "2402.14008",
    500       "relevance": "8,476 olympiad problems for AI evaluation, representing the prior difficulty standard FrontierMath aims to surpass."
    501     },
    502     {
    503       "title": "Measuring Massive Multitask Language Understanding",
    504       "authors": ["Dan Hendrycks", "Collin Burns", "Steven Basart"],
    505       "year": 2021,
    506       "arxiv_id": "2009.03300",
    507       "relevance": "MMLU benchmark including college mathematics, now near-saturated — motivating harder benchmarks like FrontierMath."
    508     }
    509   ]
    510 }

Impressum · Datenschutz