scan-v5.json (26009B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Formula-One Prompting: Adaptive Reasoning Through Equations For Applied Mathematics", 6 "authors": [ 7 "Natapong Nitarach", 8 "Pittawat Taveekitworachai", 9 "Kunat Pipatanakul" 10 ], 11 "year": 2026, 12 "venue": "arXiv.org", 13 "arxiv_id": "2601.19302", 14 "doi": "10.48550/arXiv.2601.19302" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "Abstract claims (+5.76% over CoT, +8.42% over PoT, +13.30% on FinanceMath) are directly supported by Table 4 macro-averaged results across five models.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Causal claim that equation formalization drives gains is supported by ablation study (Table 6) removing components one at a time; ablation is the appropriate design for prompting research.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": true, 33 "justification": "Paper explicitly excludes arithmetic benchmarks (GSM8K), acknowledges results apply only to equation-centric applied mathematics, and limitations section explicitly states scope boundaries.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "The verification step ('Verify your solution') included in F-1 but absent from all baselines is an uncontrolled confound that is never isolated or discussed as an alternative explanation for observed gains.", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "Claims are about benchmark accuracy on mathematical problems, which is exactly what is measured; the paper does not conflate benchmark performance with broader real-world mathematical ability.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "A dedicated Limitations section appears before the Ethics Statement, covering model scale, domain scope, small benchmark sizes (AICrypto n=18), and the deliberate single-call design constraint.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "Specific threats are named: AICrypto n=18 and OlympiadBench TP_physics n=25 are flagged as too small; testing no models below 30B is explicitly noted as a limitation.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "Paper explicitly states F-1 is not evaluated on GSM8K or simple arithmetic, not tested on models below 30B, and generalization beyond equation-centric domains is explicitly out of scope.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding source is disclosed anywhere in the paper; there is no acknowledgments section or funding statement despite authors being employed at a commercial institution.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "Author affiliations with SCB 10X, SCBX Group (a financial technology arm of a major bank) are listed on the title page.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": true, 84 "answer": false, 85 "justification": "Authors are employed by SCB 10X (banking/fintech), and the largest claimed gain (+13.30%) is on FinanceMath; this potential interest alignment is not acknowledged or discussed.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests statement, patent declarations, or financial interests declaration appears anywhere in the paper.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Key terms are operationally defined: 'governing equations,' 'equation formalization,' 'adaptive solving,' CoT, PoT, and Direct strategies are all explained with examples in Section 3.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "Three explicit contributions are bulleted in the introduction: the F-1 method itself, ablation showing formalization is the key component, and strategy selection accuracy analysis.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Related work section systematically positions F-1 against single-call and multi-call methods with Table 1 showing design-level differences across seven prior approaches.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": false, 122 "justification": "No code repository, GitHub link, or code release is mentioned anywhere in the paper.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "All four evaluation benchmarks (IMO-Bench, OlympiadBench, FinanceMath, AICrypto) are standard public benchmarks with citations to their original papers.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "No requirements.txt, Dockerfile, or environment specification is provided; the sandboxed code execution environment is only described as '30s timeout, standard libraries' without specifics.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "Complete prompts are in Appendix A and temperature=0 is stated, but no code, API wrappers, or step-by-step pipeline instructions are provided to actually run experiments.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "No confidence intervals or error bars are reported for any result; greedy decoding eliminates run-to-run variance but LLM judge variance and benchmark sampling variance are not quantified.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "No statistical significance tests are conducted for any comparative claim; performance differences are reported as raw percentages only.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Percentage improvements over baselines are reported consistently (+5.76% over CoT, +8.42% over PoT, +13.30% on FinanceMath) with per-benchmark and per-model breakdowns.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "No power analysis or prospective sample size justification is provided; AICrypto's n=18 is flagged after the fact as a limitation rather than addressed in study design.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "No variance or standard deviation is reported across any dimension; single-run greedy decoding is used, but judge variability and benchmark sampling variance remain unquantified.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Three baselines are evaluated across all benchmarks and models: Zero-Shot, CoT, and PoT, covering the relevant single-call prompting space.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "CoT and PoT are the established standards for single-call mathematical prompting; multi-call methods (ToT, GoT) are excluded with explicit justification of different compute assumptions.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 6.1 ablates three F-1 components (adaptive selection, equation formulation, givens/targets identification) across three benchmarks, though only using GPT-5.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Both accuracy (primary metric) and token efficiency ratio (accuracy/tokens) are reported across all benchmarks and models in Tables 9-15.", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": false, 203 "answer": false, 204 "justification": "Human evaluation of system outputs is not applicable for automated mathematical benchmark evaluation; LLM-as-Judge is used for proof-based problems.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": true, 210 "justification": "All four benchmarks are fixed test sets used without any training; F-1 is a prompting technique with no learned components.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Results are broken down by OlympiadBench subtask (OE/TP, math/physics), FinanceMath category, and AICrypto category in Appendix C tables.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Table 7 quantifies strategy failures (Adapt× category where baselines succeed but F-1 fails), and Section 6.3 provides qualitative failure analysis for all baseline methods.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "F-1 underperforms in 2 of 20 benchmark-model combinations (explicitly noted); IMO-Bench shows near-zero average gain (+0.78%) with specific models performing worse than CoT.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": false, 236 "justification": "GPT-5 and Gemini 2.5 Pro are referenced without snapshot dates or API version identifiers, making exact reproduction impossible for two of five evaluated models.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "Complete verbatim prompts for all methods across all four benchmarks are provided in Appendix A, including system prompts, user prompts, and benchmark-specific adaptations.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "Only temperature=0 is explicitly stated; all other hyperparameters (top-p, max tokens, presence penalty, etc.) are described only as 'default values' without specifying what those defaults are.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": false, 253 "answer": false, 254 "justification": "This is a single-call prompting study with no agentic scaffolding or multi-step orchestration; the sandboxed code execution for PoT is briefly described.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "Evaluation protocol is documented in Appendix E: regex extraction with tolerance ε=10⁻⁶ for numerical answers, LLM-as-Judge with specific judge models and full prompts for proof-based problems.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": false, 268 "justification": "Raw model outputs, intermediate equation generations, and LLM judge responses are not released; only summary accuracy statistics are provided.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "All four benchmarks are publicly available with citations to original papers; benchmark sizes, domains, and evaluation formats are documented in Table 3.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "Standard publicly available benchmarks are used; no human participant recruitment is involved.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "The full evaluation pipeline is described: prompt construction → model inference (temp=0, greedy) → answer extraction (regex or LLM judge) → accuracy computation, with judge prompts in Appendix E.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Training data cutoffs for GPT-5, Gemini 2.5 Pro, DeepSeek-V3.1, and Qwen3 models are not stated anywhere in the paper.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "No discussion of potential training data overlap with IMO-Bench (2024) or OlympiadBench (2024), which predate the paper by 1-2 years and may be in training corpora.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "IMO-Bench and OlympiadBench were published in 2024 and are likely in training data for frontier models; contamination is entirely unaddressed.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants involved.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants involved.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants involved.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants involved.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human participants involved.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human participants involved.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human participants involved.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": true, 358 "justification": "Token efficiency ratio (accuracy/tokens × 100) and tokens per correct answer are reported across all benchmarks and models in Tables 9-15.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "Total API cost or compute budget for running all experiments (5 models × 4 benchmarks × ~2,116 problems × 4 methods) is not stated.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "F-1 outperforms CoT by +5.76% and PoT by +8.42% on average across four benchmarks and five models", 373 "evidence": "Table 4 macro-averaged overall row: F-1 61.06% vs CoT 55.30% vs PoT 52.64%; 18 of 20 benchmark-model combinations favor F-1", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Gains are largest in applied domains: +13.30% over CoT on FinanceMath", 378 "evidence": "Table 4 FinanceMath: F-1 56.30% vs CoT 43.00%; confirmed by OlympiadBench physics (+2.55%) outpacing math (+0.44%) in Table 5", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Equation formalization is the primary driver, contributing roughly twice the improvement of adaptive selection alone", 383 "evidence": "Table 6 ablation (GPT-5 only): removing equation formulation drops FinanceMath 8.5pp vs removing adaptive selection drops 6.0pp", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "F-1 achieves 73% strategy selection accuracy on applied domains", 388 "evidence": "Section 6.2: defined as (Adapt✓ + F-1 Only)/(Adapt✓ + Adapt× + F-1 Only); FinanceMath=73.0%, OlympiadBench=69.9%", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "F-1 reaches 81-84% of the theoretical upper bound on applied domains", 393 "evidence": "Table 8: FinanceMath 80.9%, OlympiadBench 84.1%, AICrypto 82.2%; upper bound defined as 100% minus all-failed rate", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "F-1 adds only +68 prompt tokens overhead versus Zero-Shot while maintaining single-call efficiency", 398 "evidence": "Table 10: Zero-Shot 397 avg input tokens vs F-1 465 avg input tokens across all benchmarks", 399 "supported": "strong" 400 }, 401 { 402 "claim": "Even the smallest tested model (Qwen3-30B) benefits from F-1 comparably to frontier models (+5.6% over CoT)", 403 "evidence": "Table 4 Qwen3-30B: F-1 63.33% vs CoT 57.72%; macro average across benchmarks including small n=18 AICrypto", 404 "supported": "weak" 405 } 406 ], 407 "methodology_tags": [ 408 "benchmark-eval" 409 ], 410 "key_findings": "F-1 prompting, which instructs LLMs to formalize governing equations before adaptively selecting a solving strategy (CoT, PoT, or direct calculation), outperforms CoT by +5.76% and PoT by +8.42% on average across four applied mathematics benchmarks and five models ranging from 30B to frontier scale. The gains are strongly domain-specific: largest in finance (+13.30% over CoT) and cryptography (+7.24%), near-zero in competition mathematics (IMO-Bench). Ablation analysis identifies equation formalization as the primary mechanism, contributing roughly twice the performance gain of adaptive strategy selection alone. The method is computationally efficient, adding only 68 prompt tokens overhead within a single LLM call.", 411 "red_flags": [ 412 { 413 "flag": "Uncontrolled verification confound", 414 "detail": "F-1 includes an explicit verification step ('Verify your solution') absent from all three baselines (Zero-Shot, CoT, PoT); this is never isolated in ablation, making it impossible to attribute gains solely to equation formalization versus self-checking." 415 }, 416 { 417 "flag": "AICrypto micro-benchmark (n=18)", 418 "detail": "The cryptography domain claim (+7.24% over CoT) rests on 18 problems; this benchmark receives equal macro-average weight as OlympiadBench (n=1,438), inflating overall averages with high-variance estimates." 419 }, 420 { 421 "flag": "Model versions underspecified", 422 "detail": "GPT-5 and Gemini 2.5 Pro are referenced without snapshot dates or API version identifiers, making exact reproduction impossible and preventing contamination assessment for two of five models." 423 }, 424 { 425 "flag": "Ablation on single model only", 426 "detail": "Component ablation (Table 6) uses only GPT-5 (highest-performing model); the claimed hierarchy of formalization > adaptive selection may not hold for weaker models and cannot be verified from reported data." 427 }, 428 { 429 "flag": "Benchmark contamination unaddressed", 430 "detail": "IMO-Bench and OlympiadBench were published in 2024 and are likely in training data for frontier models; no training cutoffs are stated and overlap is never discussed." 431 }, 432 { 433 "flag": "Financial institution evaluating on finance benchmark", 434 "detail": "Authors are from SCB 10X (banking/fintech arm of a major Thai bank), and the largest claimed gain (+13.30%) is on FinanceMath; no conflict of interest statement addresses this potential alignment." 435 }, 436 { 437 "flag": "LLM-as-Judge variance unquantified", 438 "detail": "Proof benchmarks (OlympiadBench TP, IMO-ProofBench, AICrypto) are evaluated via LLM-as-Judge with no inter-rater reliability measurement, agreement rate, or uncertainty quantification on judge scores." 439 } 440 ], 441 "cited_papers": [ 442 { 443 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 444 "relevance": "Primary baseline and motivating prior work; F-1 is directly benchmarked against CoT across all experiments" 445 }, 446 { 447 "title": "Program of Thoughts Prompting: Disentangling Computation from Reasoning for Numerical Reasoning Tasks", 448 "relevance": "Primary baseline; F-1 incorporates PoT as one of its adaptive solving strategies and compares against it throughout" 449 }, 450 { 451 "title": "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models", 452 "relevance": "Most closely related single-call two-phase approach; explicitly differentiated from F-1 in introduction and Table 1" 453 }, 454 { 455 "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models", 456 "relevance": "Multi-call upper bound excluded from main comparison with justification; represents the compute-intensive alternative to single-call methods" 457 }, 458 { 459 "title": "OlympiadBench: A Challenging Benchmark for Promoting AGI with Olympiad-Level Bilingual Multimodal Scientific Problems", 460 "relevance": "Primary evaluation benchmark (n=1,438) enabling controlled math vs. physics domain comparison central to paper's hypothesis" 461 }, 462 { 463 "title": "FinanceMath: Knowledge-Intensive Math Reasoning in Finance Domains", 464 "relevance": "Key evaluation benchmark (n=200) for applied finance domain where F-1 shows its largest gains" 465 }, 466 { 467 "title": "AICrypto: A Comprehensive Benchmark for Evaluating Cryptography Capabilities of Large Language Models", 468 "relevance": "Evaluation benchmark for cryptographic reasoning; provides the equation-formalization-heavy domain beyond physics and finance" 469 }, 470 { 471 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 472 "relevance": "Methodological basis for the LLM-as-Judge evaluation paradigm used in proof-based benchmark assessment" 473 }, 474 { 475 "title": "Adaptive-Solver Framework for Dynamic Strategy Selection in Large Language Model Reasoning", 476 "relevance": "Multi-call adaptive baseline; motivates F-1's single-call alternative to classifier-based routing" 477 }, 478 { 479 "title": "LLM-SR: Scientific Equation Discovery via Programming with Large Language Models", 480 "relevance": "Prior work on structured equation representations supporting F-1's theoretical motivation that formalized representations improve reasoning" 481 } 482 ], 483 "engagement_factors": { 484 "practical_relevance": { 485 "score": 2, 486 "justification": "Practitioners with access to any frontier LLM API can immediately apply the verbatim prompts from Appendix A, though gains are limited to equation-centric applied math domains." 487 }, 488 "surprise_contrarian": { 489 "score": 1, 490 "justification": "Writing governing equations before solving is intuitive pedagogy; the finding that this helps LLMs is confirmatory rather than surprising or counter to expectations." 491 }, 492 "fear_safety": { 493 "score": 0, 494 "justification": "Paper improves mathematical reasoning with no AI safety, risk, or misuse implications beyond a brief ethics statement about educational cheating." 495 }, 496 "drama_conflict": { 497 "score": 0, 498 "justification": "No controversy or conflict angle; straightforward prompting technique paper with cooperative framing relative to prior work." 499 }, 500 "demo_ability": { 501 "score": 3, 502 "justification": "Anyone with API access can immediately try the verbatim prompts from Appendix A on their own math problems; no code or setup required." 503 }, 504 "brand_recognition": { 505 "score": 0, 506 "justification": "SCB 10X is not a well-known AI research lab; no famous author affiliations or high-profile institutional backing." 507 } 508 }, 509 "hn_data": { 510 "threads": [], 511 "top_points": 0, 512 "total_points": 0, 513 "total_comments": 0 514 } 515 }