scan.json (33058B)
1 { 2 "paper": { 3 "title": "RealMath: A Continuous Benchmark for Evaluating Language Models on Research-Level Mathematics", 4 "authors": [ 5 "Jie Zhang", 6 "Cezara Petrui", 7 "Kristina Nikolić", 8 "Florian Tramèr" 9 ], 10 "year": 2025, 11 "venue": "NeurIPS 2025", 12 "arxiv_id": "2505.12575", 13 "doi": "10.48550/arXiv.2505.12575" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "RealMath is a continuously refreshable benchmark of 1,200+ QA pairs extracted from arXiv papers and Math Stack Exchange, testing LLMs on research-level mathematics with verifiable fixed answers. Frontier models (o3 at 49.1% on Math.arXiv, 70.7% on StackExchange) perform surprisingly well on research math compared to competition benchmarks like FrontierMath (~2%), suggesting LLMs may already serve as useful mathematical assistants. However, 62% of Math.arXiv questions are classified as hard, where o3 drops to 27.9%. Fine-tuning GPT-4o-mini on 500 samples yielded no improvement, suggesting the difficulty stems from missing mathematical knowledge rather than distribution mismatch.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper states 'The code and dataset for REALMATH are publicly available' with footnote links to GitHub (code) and Huggingface (dataset)." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "Dataset is released on Huggingface. The NeurIPS checklist confirms 'We provide access to all three datasets and the code used for their retrieval.'" 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No environment specification (requirements.txt, Dockerfile, or dependency list) is provided in the paper. The NeurIPS checklist answer on compute resources says 'Our experiments all involve querying APIs' but no environment details are given." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": true, 39 "justification": "Full prompts for theorem filtering (Appendix A.1), QA generation (Appendix A.1), and evaluation (Appendix A.2) are provided. The pipeline steps are detailed in Section 3.2 with code released on GitHub." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "Main results in Table 3 report only point estimates (e.g., '49.1', '43.4') with no confidence intervals or error bars. The NeurIPS checklist question #7 answers 'No' and states results are 'mainly to illustrate the difficulty composition of our dataset.'" 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "No statistical significance tests are used anywhere. Claims like 'o3 achieved the highest accuracy' are based solely on comparing raw numbers without any statistical test." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Table 3 reports absolute accuracy for 9 models across 3 datasets, providing full comparative context (e.g., o3 at 49.1% vs GPT-4o-mini at 12.5% on Math.arXiv). Context ablation reports 21.6% vs 42.3%. Effect magnitudes are clear from the tables." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "Dataset sizes (633 Math.arXiv, 111 CS.arXiv, 542 Math.StackExchange) are reported but not justified. No power analysis or justification for why these sample sizes are adequate, particularly for the small CS.arXiv set of 111 examples." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "Main results are single-run numbers with no variance. Only the fine-tuning experiment (Appendix B.1) reports mean and standard deviation over 5 runs. All other results in Table 3 and figures lack any spread measure." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Nine models are compared including older ones as baselines: GPT-4o-mini, Llama-3.1-405B, Claude 3.5 Sonnet alongside frontier models o3, o4-mini, Gemini 2.5 Pro, etc. (Table 3, Section 4.1)." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Baselines include contemporary frontier models released in 2024-2025: o3, o4-mini, Gemini 2.5 Pro, Grok-3, DeepSeek-R1, Claude 3.7 Sonnet (Figure 1 shows release dates)." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 4.3 'How important is a theorem's context?' ablates the context component, showing o4-mini drops from 42.3% to 21.6% on CS.arXiv without context. The fine-tuning analysis (Section 4.3, Appendix B.1) tests whether fine-tuning improves performance." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": false, 88 "justification": "Only accuracy (exact match) is used as the evaluation metric throughout. While breakdowns by difficulty and category provide different views, the underlying metric is always accuracy percentage." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "Evaluation is entirely automated via exact-match checking with an LLM judge (GPT-4o, Appendix A.2). Human review was used only for dataset curation (~6% filtered), not for evaluating model outputs. Given the paper's broader 'mathematical assistant' framing, human evaluation of model reasoning quality would be relevant." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "For main evaluation, models are evaluated zero-shot on the benchmark (no training on this data). For the fine-tuning experiment, the paper splits into 500 training and 133 test samples (Appendix B.1)." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Extensive breakdowns by math category (Figure 5, Table 4, Appendix B.3-B.4) and by difficulty level (Figures 4, 12, 13). Category-level accuracy varies from 23.8% (cs.LG) to 64.1% (math.NT) for o3." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 4.3 'Analyzing LLM's mistakes' provides an error breakdown (Figure 6) categorizing failures into reasoning errors, conceptual misunderstanding, missed insights, calculation errors, hallucinations, etc. Figure 3 also shows filtered-out examples." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The fine-tuning experiment (Section 4.3, Appendix B.1) explicitly reports 'Surprisingly, fine-tuning did not lead to an improvement in accuracy' with GPT-4o-mini going from 12.3% to 10.8% (Figure 10)." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims about 'surprising capabilities in handling research mathematics' are supported by Table 3 (o3 at 49.1% vs ~2% on FrontierMath). Claims about 'continually refreshable dataset' are supported by the pipeline description (Section 3.2). The hedge 'may already serve as valuable assistants' is appropriately tentative." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Causal claims are modest and appropriately supported. The fine-tuning claim ('fine-tuning did not lead to improvement') uses a controlled experiment. The context ablation (with/without context) is a single-variable manipulation. Language is generally hedged ('may', 'suggests')." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title claims 'Research-Level Mathematics' broadly, but the benchmark tests only constructive theorems with fixed numerical/symbolic answers — excluding proofs, inequalities, and non-constructive results. Results are dominated by combinatorics (28.9%) and number theory (10.2%) per Table 4. The conclusion claims 'current frontier models demonstrate surprisingly strong capabilities in research mathematics' without adequately bounding this to the specific subset tested." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": true, 135 "justification": "Section 4.3 discusses alternative explanations for the temporal performance pattern (easier recent math vs. training data recency). The paper also considers whether high performance reflects benchmark selection bias. The label noise discussion considers both source quality and LLM judge limitations." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper explicitly distinguishes: 'Although our approach focuses on statement verification rather than proof generation or verification, it nonetheless provides a valuable signal about LLMs' potential as mathematical assistants.' They acknowledge the gap between their measurement (fixed-answer accuracy) and the broader framing (mathematical assistant utility)." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Models are listed by marketing names only: 'o3', 'o4-mini', 'Gemini 2.5-pro', 'Claude 3.7 Sonnet', 'Grok-3', 'DeepSeek-R1', etc. No API versions, snapshot dates, or specific model identifiers (e.g., 'gpt-4o-mini-2024-07-18') are provided." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Full system prompts for theorem filtering (SYSTEM_PROMPT_THEOREM) and QA generation (SYSTEM_PROMPT_GENERATE_QA) are provided in Appendix A.1. The evaluation prompt is provided in Appendix A.2. For the evaluated models, the input format is described (context + question)." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "No temperature, top-p, max tokens, or other sampling parameters are reported for any of the 9 evaluated models or the judge model (o3-mini). The NeurIPS checklist answer on compute resources does not mention hyperparameters." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. Models receive direct prompts with context and questions; the evaluation is straightforward prompt-response without tools, retry logic, or multi-step reasoning scaffolds." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 3.2 and Table 1 detail the full pipeline with counts at each stage: 4,000 papers → 3,922 LaTeX → 14,747 theorems → 407 constructive → 401 QA pairs → 280 after filtering. Filtering criteria are explained at each stage." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": false, 174 "justification": "There is no dedicated Limitations section. The 'Discussion on label noise' subsection (Section 3.2) addresses data quality limitations only. Limitations are scattered across the paper (conclusion, NeurIPS checklist) but there is no substantive, consolidated limitations discussion." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Specific threats discussed include: LLM judge reliability (~6% low-quality samples passed filtering), label noise from Stack Exchange, the assumption that arXiv theorems are correct (not peer-reviewed), and pipeline selection bias toward combinatorics and number theory (Figure 9, Table 4)." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "While the paper notes it covers 'statement verification rather than proof generation' and excludes inequalities/non-constructive results, it does not explicitly state what the results do NOT show. The paper argues through these boundaries rather than treating them as genuine scope limitations: 'it nonetheless provides a valuable signal about LLMs' potential as mathematical assistants.'" 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": true, 191 "justification": "Dataset is released on Huggingface with code on GitHub, enabling independent verification of the QA pairs and pipeline." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3.2 describes the collection in detail: querying arXiv API for math papers in specified time windows, downloading LaTeX sources, extracting theorems via LLM, filtering, generating QA pairs. Table 2 gives time spans and counts." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data sources are public repositories (arXiv papers, Math Stack Exchange posts)." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "Table 1 documents the full pipeline with counts at each stage (4,000 → 3,922 → 14,747 → 407 → 401 → 280). Each filtering criterion is explained. Appendix A.5 documents the Stack Exchange pipeline variant. Appendix A.3 documents the manual review stage." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Section 6: 'J.Z. is funded by the Swiss National Science Foundation (SNSF) project grant 214838. K.N. is supported by an ETH AI Center Doctoral Fellowship.'" 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "All authors are listed as affiliated with ETH Zurich. They evaluate third-party models (OpenAI, Google, Meta, Anthropic, xAI, DeepSeek) with no direct conflict." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": true, 223 "justification": "SNSF and ETH AI Center are academic funders with no financial stake in the benchmark outcomes or the performance of evaluated LLMs." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is found in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "Section 4.3 references 'cutoff dates of GPT-4o-mini, Llama-3.1-405B, and Claude-3.5-Sonnet' and Figure 5 shows before/after cutoff splits, but the actual training cutoff dates for these models (and the other 6 evaluated models) are not explicitly stated." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Section 4.3 'Measuring the impact of data contamination' explicitly analyzes potential overlap by splitting Math.arXiv into papers published before vs. after model training cutoffs (2022 vs. 2025) and comparing accuracy." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": true, 245 "justification": "Contamination resistance is a core design feature. Section 3.1 criterion 3 addresses 'Continuous acquisition' to prevent contamination. Section 4.3 empirically tests contamination via temporal splits. The pipeline is designed to generate fresh data from new papers after each model release." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study evaluates LLMs on a benchmark constructed from public mathematical sources." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. NeurIPS checklist #14-15 confirm: 'This empirical study does not include any human subjects.'" 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "NeurIPS checklist #8 acknowledges 'querying APIs comes with a cost' but no actual API costs, tokens consumed, or wall-clock times are reported for any of the 9 models evaluated across 3 datasets." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No total computational budget or API spend is stated. NeurIPS checklist #8 answers 'NA' claiming experiments 'do not require computational resources' but this ignores the substantial API costs of evaluating 9 models on 1,200+ questions." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "Main evaluation results are single-run with no seed sensitivity analysis. Only the fine-tuning experiment (Appendix B.1) reports results across 5 runs. For API-based evaluation, temperature/sampling stochasticity could affect results but is not analyzed." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of runs for main experiments is not stated, implying single-run results. Only the fine-tuning experiment explicitly states 'we repeat the evaluation five times' (Appendix B.1)." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search is reported for the main evaluation or the fine-tuning experiment. For the fine-tuning, the paper says only that they used 'OpenAI's fine-tuning service' without reporting any hyperparameter settings or search budget." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "No configuration cherry-picking is evident. All 9 models are evaluated on the full dataset with results reported for each. The fine-tuning experiment uses 500 random samples. Results are reported transparently including the negative fine-tuning finding." 317 }, 318 "multiple_comparison_correction": { 319 "applies": false, 320 "answer": false, 321 "justification": "No statistical tests are performed in the paper, so multiple comparison correction is not applicable. The absence of statistical testing is captured by the significance_tests item." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors construct the benchmark, select the evaluation pipeline, and report results. No discussion of author-evaluation bias or potential systematic biases in their pipeline design (e.g., using o3-mini as judge may systematically favor or disfavor certain model families)." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": false, 331 "justification": "Models of vastly different compute costs are compared directly (e.g., GPT-4o-mini vs o3) without any discussion of compute budgets, inference costs, or performance-per-dollar comparisons." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": true, 336 "justification": "Section 1 and 3.1 explicitly discuss what the benchmark measures vs. what it claims: 'our approach focuses on statement verification rather than proof generation.' The paper compares with competition benchmarks (FrontierMath, AIME) and discusses what different performance patterns mean for the construct being measured." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No scaffolding is involved. Models receive direct prompts with context and questions; evaluation is straightforward prompt-response." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": true, 348 "justification": "Section 4.3 explicitly analyzes temporal leakage by comparing model accuracy on Math.arXiv papers published before vs. after model training cutoffs (Figure 5 right panel). This is a core design feature of the continually refreshable pipeline." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": true, 353 "justification": "The pipeline's filtering step #5 explicitly removes cases where 'the answer is obvious from the context fed to the LLM (it is common, for instance, for the paper's introduction to state a theorem's result informally).' Section 4.3 also tests the effect of context vs. no context." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether questions drawn from the same paper, same research group, or closely related subfields create non-independence. Multiple theorems can come from the same paper, potentially inflating effective sample size for some categories." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": true, 363 "justification": "Temporal splitting is used as a concrete leakage detection method: papers from before and after model training cutoffs are compared (Section 4.3, Figure 5). The refreshable pipeline design itself serves as a contamination prevention mechanism." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "Frontier LLMs achieve surprisingly strong performance on research-level mathematics, with o3 at 49.1% on Math.arXiv, 44.1% on CS.arXiv, and 70.7% on Math.StackExchange.", 370 "evidence": "Table 3 reports accuracy for 9 models across 3 datasets (Section 4.2). Performance is compared contextually with near-zero results on FrontierMath.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Hard questions remain very challenging for current LLMs, with o3 achieving only 27.9% on the hard subset of Math.arXiv.", 375 "evidence": "Figure 4 shows accuracy breakdown by difficulty level. 62.2% of Math.arXiv questions are classified as hard based on weak-model performance (Section 4.2, Appendix B.2).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "LLM capabilities vary significantly across mathematical domains, with model-specific strengths (o3 excels at theoretical topics, Gemini 2.5-pro at applied ones).", 380 "evidence": "Figure 5 shows per-category accuracy. o3 ranges from 23.8% (cs.LG) to 64.1% (math.NT). Gemini 2.5-pro's best categories (cs.LG at 61.9%, math.OC at 56.0%) are o3's worst (Section 4.2).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Fine-tuning GPT-4o-mini on 500 RealMath samples does not improve accuracy, suggesting difficulty stems from missing mathematical knowledge rather than distribution mismatch.", 385 "evidence": "Appendix B.1, Figure 10: fine-tuned GPT-4o-mini achieves 10.8% vs. 12.3% original (5 runs, mean reported). Training loss decreases but test loss fluctuates (Figure 11).", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "LLMs perform reasonably well even without theorem context, suggesting many theorems use concepts that can be understood in isolation.", 390 "evidence": "Section 4.3: o4-mini achieves 21.6% without context vs. 42.3% with context on CS.arXiv. Examples of context-free solvability are shown in Figure 7.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "The benchmark pipeline can be continuously refreshed with 70+ new samples per month, mitigating data contamination risks.", 395 "evidence": "Section 3.2 describes the automated pipeline. Table 2 shows data spans across multiple time windows. Section 4.3 provides empirical contamination analysis.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Models perform better on newer (post-cutoff) samples than older ones, possibly because LLMs are more familiar with recent mathematical topics rather than due to research becoming easier.", 400 "evidence": "Figure 5 right panel shows GPT-4o-mini at 7.9% before cutoff vs. 16.1% after, Llama-3.1-405B at 13.9% vs. 18.4%, Claude 3.5 Sonnet at 15.7% vs. 20.4% (Section 4.3).", 401 "supported": "weak" 402 } 403 ], 404 "red_flags": [ 405 { 406 "flag": "No error bars on main results", 407 "detail": "All main results (Table 3, Figures 4-5) are single-run accuracy numbers without confidence intervals, error bars, or variance. The NeurIPS checklist explicitly answers 'No' to the error bars question. Given API stochasticity and small dataset sizes (especially CS.arXiv with 111 samples), single-run results may not be stable." 408 }, 409 { 410 "flag": "LLM-as-judge circularity", 411 "detail": "The pipeline uses o3-mini to filter theorems, generate QA pairs, and classify difficulty, then uses GPT-4o to judge answer correctness. Using LLMs to both construct and evaluate a benchmark for LLMs creates potential systematic biases — questions that LLMs can process well during construction may also be ones they perform well on during evaluation." 412 }, 413 { 414 "flag": "Missing model versions and hyperparameters", 415 "detail": "No API versions, snapshot dates, or sampling parameters (temperature, top-p) are reported for any of the 9 evaluated models. Results may not be reproducible given that model behavior changes across API versions." 416 }, 417 { 418 "flag": "Sampling bias toward constructive theorems", 419 "detail": "The pipeline filters for constructive theorems with fixed numerical/symbolic answers — only 407 of 14,747 theorems (2.8%) pass filtering. This heavily biased subset (dominated by combinatorics at 28.9%) is used to make claims about 'research-level mathematics' broadly. The paper's claim that LLMs 'may already serve as valuable assistants for working mathematicians' extrapolates well beyond the tested subset." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "Swe-bench: Can language models resolve real-world github issues?", 425 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"], 426 "year": 2023, 427 "arxiv_id": "2310.06770", 428 "relevance": "Key benchmark for evaluating LLMs on real-world software engineering tasks, cited as parallel example of 'in the wild' evaluation." 429 }, 430 { 431 "title": "BaxBench: Can LLMs generate correct and secure backends?", 432 "authors": ["Mark Vero", "Niels Mündler", "Victor Chibotaru", "Veselin Raychev", "Maximilian Baader", "Nikola Jovanović", "Jingxuan He", "Martin Vechev"], 433 "year": 2025, 434 "relevance": "Benchmark evaluating LLMs on real-world backend code generation, cited as example of practical capability evaluation." 435 }, 436 { 437 "title": "Swe-lancer: Can frontier LLMs earn 1 million from real-world freelance software engineering?", 438 "authors": ["Samuel Miserendino", "Michele Wang", "Tejal Patwardhan", "Johannes Heidecke"], 439 "year": 2025, 440 "relevance": "Evaluates LLM capabilities on real freelance software engineering tasks, cited as parallel real-world evaluation benchmark." 441 }, 442 { 443 "title": "FrontierMath: A benchmark for evaluating advanced mathematical reasoning in AI", 444 "authors": ["Elliot Glazer", "Ege Erdil", "Tamay Besiroglu"], 445 "year": 2024, 446 "relevance": "Expert-crafted benchmark of extremely challenging math problems where models score ~2%; primary comparison point for RealMath's research-level approach." 447 }, 448 { 449 "title": "Humanity's last exam", 450 "authors": ["Long Phan", "Alice Gatti", "Ziwen Han"], 451 "year": 2025, 452 "arxiv_id": "2501.14249", 453 "relevance": "Extremely challenging expert-curated evaluation benchmark, cited as comparison for research-level difficulty evaluation." 454 }, 455 { 456 "title": "MathConstruct: Challenging LLM reasoning with constructive proofs", 457 "authors": ["Mislav Balunović", "Jasper Dekoninck", "Nikola Jovanović", "Ivo Petrov", "Martin Vechev"], 458 "year": 2025, 459 "arxiv_id": "2502.10197", 460 "relevance": "Related benchmark using constructive mathematical proofs to evaluate LLM mathematical reasoning." 461 }, 462 { 463 "title": "Training verifiers to solve math word problems (GSM8K)", 464 "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"], 465 "year": 2021, 466 "arxiv_id": "2110.14168", 467 "relevance": "Foundational math reasoning benchmark for LLMs, cited as example of basic-level evaluation approaching saturation." 468 }, 469 { 470 "title": "Measuring mathematical problem solving with the MATH dataset", 471 "authors": ["Dan Hendrycks", "Collin Burns", "Saurav Kadavath"], 472 "year": 2021, 473 "relevance": "Standard competition-level math benchmark for LLMs, cited as example of evaluation approaching saturation." 474 }, 475 { 476 "title": "LeanDojo: Theorem proving with retrieval-augmented language models", 477 "authors": ["Kaiyu Yang", "Aidan M Swope", "Alex Gu"], 478 "year": 2023, 479 "relevance": "Benchmark for LLM-based formal theorem proving, representing the formal verification approach to evaluating mathematical capabilities." 480 }, 481 { 482 "title": "Proof or bluff? Evaluating LLMs on 2025 USA math olympiad", 483 "authors": ["Ivo Petrov", "Jasper Dekoninck", "Lyuben Baltadzhiev"], 484 "year": 2025, 485 "arxiv_id": "2503.21934", 486 "relevance": "Evaluates LLMs on contemporary math olympiad problems, addressing competition-level mathematical reasoning." 487 }, 488 { 489 "title": "LLM agents can autonomously exploit one-day vulnerabilities", 490 "authors": ["Richard Fang", "Rohan Bindu", "Akul Gupta", "Daniel Kang"], 491 "year": 2024, 492 "relevance": "Example of evaluating LLM capabilities on real-world exploits rather than curated competitions, parallel to RealMath's 'in the wild' philosophy." 493 }, 494 { 495 "title": "AutoAdvExBench: Benchmarking autonomous exploitation of adversarial example defenses", 496 "authors": ["Nicholas Carlini", "Javier Rando", "Edoardo Debenedetti", "Milad Nasr", "Florian Tramèr"], 497 "year": 2025, 498 "relevance": "Benchmark for evaluating real LLM exploit capabilities, cited as example of 'in the wild' capability evaluation over curated proxies." 499 } 500 ], 501 "engagement_factors": { 502 "practical_relevance": { 503 "score": 1, 504 "justification": "Primarily a benchmark/evaluation paper useful for researchers studying LLM math capabilities, not a directly usable tool for practitioners." 505 }, 506 "surprise_contrarian": { 507 "score": 2, 508 "justification": "Challenges the narrative that LLMs are poor at research-level math by showing 49% accuracy on real theorems, contrasting with ~2% on FrontierMath." 509 }, 510 "fear_safety": { 511 "score": 0, 512 "justification": "No safety or security concerns raised; purely evaluates mathematical reasoning capabilities." 513 }, 514 "drama_conflict": { 515 "score": 1, 516 "justification": "Mild implicit critique of competition-focused benchmarks and FrontierMath's private test set, but framed diplomatically." 517 }, 518 "demo_ability": { 519 "score": 2, 520 "justification": "Dataset on Huggingface and code on GitHub allow anyone to run evaluations, though requires API access to frontier models." 521 }, 522 "brand_recognition": { 523 "score": 1, 524 "justification": "ETH Zurich is well-known in academia; evaluates prominent models (o3, Gemini, Claude) but the paper itself is not from a major AI lab." 525 } 526 } 527 }