scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32514B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Formula-One Prompting: Adaptive Reasoning Through Equations For Applied Mathematics",
      6     "authors": [
      7       "Natapong Nitarach",
      8       "Pittawat Taveekitworachai",
      9       "Kunat Pipatanakul"
     10     ],
     11     "year": 2026,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2601.19302",
     14     "doi": "10.48550/arXiv.2601.19302"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims are supported by Table 4: '+5.76% over CoT' (55.30→61.06), '+8.42% over PoT' (52.64→61.06), '+13.30% on FinanceMath over CoT' (43.00→56.30), physics gains larger than math in Table 5. All numbers match.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The main causal claim — 'equation formalization is the key component' — is supported by the ablation study (Table 6) using controlled single-variable removal. Each ablation variant removes one component while keeping others, constituting adequate causal identification for this type of claim.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The abstract concludes 'F-1 is more effective than CoT in applied mathematics problems,' bounded to applied math. The title specifies 'For Applied Mathematics.' Limitations section explicitly bounds scope: 'Our study focuses on mathematical reasoning in physics, finance, and cryptography.' Models below 30B are flagged as untested.",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 7 'Training Data Composition' discusses an alternative explanation: F-1's gains may reflect training data composition (applied math content naturally presents equations) rather than the prompting technique itself. The Limitations section discusses model capability dependence.",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Claims stay close to measurement granularity. The paper frames results as benchmark accuracy improvements on specific benchmarks and specific domains, not as improvements to general 'mathematical reasoning.' The abstract concludes with the bounded claim about 'applied mathematics problems.'",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "A dedicated 'Limitations' section appears after Section 8, spanning approximately one full page with multiple substantive paragraphs covering model sizes, benchmark limitations, domain scope, and method constraints.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Specific threats discussed: (1) model size floor not tested below 30B — 'equation formalization phase requires reliable symbolic abstraction,' (2) AICrypto n=18 and TP_physics n=25 are small, (3) limited model families, (4) F-1 'does not explicitly backtrack' when strategy is suboptimal.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Explicit scope boundaries: 'Our study focuses on mathematical reasoning in physics, finance, and cryptography'; 'We also do not evaluate F-1 on multiple-choice or simple arithmetic benchmarks (e.g., GSM8K)'; 'we do not include smaller models (e.g., 7B or 13B)'; 'Generalizing F-1 beyond equation-centric settings...would likely require different forms of formalization.'",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No funding source, acknowledgments section, or grant numbers are mentioned anywhere in the paper.",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are listed: 'SCB 10X, SCBX Group' — a corporate research entity. The affiliation is clearly stated under the author names.",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No funding is disclosed at all. Since the authors are from SCB 10X (a financial technology company) and one benchmark is FinanceMath, there is a potential interest in demonstrating improved financial mathematics capabilities, but this is not acknowledged.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement, financial disclosure, or conflict-of-interest declaration appears in the paper.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "F-1 Prompting is defined precisely with a formal two-phase structure; 'governing equations,' 'equation formalization,' and 'adaptive solving' are all defined with examples and formal notation.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Three explicit contributions are listed: the F-1 method, ablation evidence for equation formalization as the key component, and strategy selection analysis.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper systematically compares F-1 to CoT, PoT, Plan-and-Solve, ToT, GoT, XoT, and Adaptive-Solver in Table 1, showing how F-1 differs on formalization and call count dimensions.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No repository URL, code archive, or link to implementation is provided anywhere in the paper. Only prompt templates are given in Appendix A.",
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The paper uses four publicly available benchmarks: IMO-Bench (He et al. 2024b), OlympiadBench (He et al. 2024a), FinanceMath (Zhao et al. 2024), and AICrypto (Wang et al. 2025), all referenced with citations.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No requirements.txt, Dockerfile, or environment setup details are provided. The paper mentions model names and API-based inference but does not specify library versions, Python version, or dependency information.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The prompt templates in Appendix A and evaluation details in Appendix E provide partial information, but not enough to reproduce the full pipeline without significant effort.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "All results in Tables 4–8 and throughout the paper are reported as point estimates only. No confidence intervals, error bars, or ± notation appear anywhere.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "The paper claims 'F-1 outperforms CoT by +5.76%' and similar comparative claims throughout, but no statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are reported for any comparison.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Effect sizes are reported with baseline context throughout: e.g., '+5.76% over CoT', '+13.30% on FinanceMath over CoT', with full baseline numbers in Table 4 (CoT 43.00% → F-1 56.30% on FinanceMath). The reader can assess magnitude.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "No power analysis or formal justification for benchmark sizes. The Limitations section acknowledges small sizes ('AICrypto contains only 18 problems... OlympiadBench TP_physics includes 25 problems') but provides no justification beyond data scarcity.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or spread measures are reported. Section 4.2 states 'we use a sampling temperature of 0' and 'each question is evaluated using a single generated output' — single-run deterministic results only.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Three single-call prompting baselines are compared: Zero-Shot (Wei et al. 2022a), Chain-of-Thought (Kojima et al. 2022), and Program-of-Thought (Chen et al. 2023b). Results in Table 4.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "The baselines (CoT, PoT, Zero-Shot) are the standard contemporary single-call prompting methods. The paper explicitly scopes its comparison to single-call methods (Table 1) and justifies excluding multi-call methods due to different compute budgets.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Table 6 presents an ablation study removing individual F-1 components: adaptive selection, equation formulation, and givens/targets identification, each tested on GPT-5 across three benchmarks.",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "The paper reports accuracy (Table 4), efficiency ratio (accuracy/tokens, Table 9), tokens per correct answer (Table 15), and strategy selection accuracy (Section 6.2). Multiple evaluation dimensions are covered.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "No human evaluation is included. Proof-based problems use LLM-as-Judge (Appendix E) with Gemini-3-Pro and GPT-5.1 as judges. All evaluation is automated — either regex-based or LLM-based.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "Results are reported on standard benchmark test sets (IMO-Bench, OlympiadBench, FinanceMath, AICrypto). The prompts are fixed templates (Appendix A), not tuned on test data. No dev set is needed or used since the prompting approach is not learned.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Extensive per-category breakdowns in Appendix C: IMO-Bench by AnswerBench/ProofBench (Table 16), OlympiadBench by OE/TP math/physics (Table 17), FinanceMath by 7 financial categories (Table 18), AICrypto by 4 categories (Table 19).",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 6.2 analyzes strategy selection failures ('Adapt×' category in Table 7). Section 6.3 provides qualitative examples including baseline failure modes. The paper notes IMO-Bench's 90.7% failure rate for all methods and discusses where F-1 doesn't help.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper reports several unfavorable results for F-1: it underperforms CoT on Risk Management in FinanceMath (Table 18: F-1 gets 11.11% vs CoT 22.22% for GPT-5), shows only minimal gains on IMO-Bench (+0.78% over CoT overall) and near-zero gains on competition math (+0.44% OE_maths), and loses to Zero-Shot on IMO-Bench for GPT-5 (55.58 vs 56.26).",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Models are listed as 'GPT-5', 'Gemini 2.5 Pro', 'DeepSeek-V3.1', 'Qwen3-235B', 'Qwen3-30B' — marketing names without API versions, snapshot dates, or specific model identifiers. No version pinning is described.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Complete prompt templates for all four benchmarks and all four methods (Zero-Shot, CoT, PoT, F-1) are provided in Appendix A (Sections A.1–A.4), including both system and user prompts with the actual text used.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Section 4.2 states: 'we use a sampling temperature of 0 and leave all other inference hyperparameters at their default values. Since we use greedy decoding, each question is evaluated using a single generated output.'",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "No agentic scaffolding is used. F-1 is a single-call prompting method — the LLM generates the full response in one generation with no tools, retry logic, or multi-step orchestration.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Section 4.2 describes the evaluation protocol: temperature=0, greedy decoding, regex-based answer extraction with numerical tolerance ε=10⁻⁶, and LLM-as-Judge for proof problems. Appendix E documents evaluation prompts and methodology for each benchmark.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "No raw data (individual model outputs, per-problem results, judge scores) is made available. Only aggregate results in tables are provided.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 4.1 describes the four benchmarks with citations, domain descriptions, sizes (Table 3), and evaluation types. Section 4.2 describes inference protocol. The benchmarks are well-established public datasets.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. All data comes from standard public benchmarks (IMO-Bench, OlympiadBench, FinanceMath, AICrypto).",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The pipeline from input to evaluation is documented: problem text → prompt construction (Appendix A) → model inference at temp=0 → answer extraction (regex or LLM judge, Section 4.2 and Appendix E) → scoring with specified tolerance. Code execution for PoT is described (30s timeout, sandboxed, Appendix E.1).",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "No training data cutoff dates are stated for any of the five models used (GPT-5, Gemini 2.5 Pro, DeepSeek-V3.1, Qwen3-235B, Qwen3-30B). This is critical since the benchmarks are publicly available.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "No discussion of potential train/test overlap. OlympiadBench problems come from past olympiads, IMO-Bench from IMO problems — both long public and likely in training data of frontier models. No analysis or acknowledgment.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "No discussion of benchmark contamination. OlympiadBench (2024), IMO-Bench (2024), and FinanceMath (2023) were all published before the likely training cutoffs of the frontier models used. No canary strings, temporal analysis, or contamination assessment.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study. The paper evaluates prompting methods on automated benchmarks.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants. Benchmark evaluation study only.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Appendix B provides detailed token efficiency analysis: average tokens per method/model/benchmark (Tables 11–14), efficiency ratios (Table 9), tokens per correct answer (Table 15), and prompt overhead comparison (Table 10: F-1 adds +68 tokens over Zero-Shot).",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "No total computational budget is stated — no total API spend, total tokens consumed across all experiments, GPU hours, or wall-clock time for the full evaluation. Only per-problem averages are reported.",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": false,
    372           "justification": "Results are from single deterministic runs (temperature=0, greedy decoding). No sensitivity analysis across different temperatures, sampling strategies, or repeated runs is performed.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": true,
    378           "justification": "Section 4.2 explicitly states: 'Since we use greedy decoding, each question is evaluated using a single generated output.' The number of runs (1) is clearly stated.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "No mention of how the F-1 prompt design was developed — how many prompt variations were tried, what alternatives were considered, or how the final prompt was selected. The prompt appears hand-designed but no search budget is reported.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": false,
    390           "justification": "The paper presents a single prompt design (Section 3.3) without explaining how it was selected among alternatives. No validation set is mentioned for prompt development.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "The paper makes numerous comparisons across 4 methods × 5 models × 4 benchmarks (80 cells in Table 4) plus subcategory breakdowns, without any correction for multiple comparisons.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors implement all baselines (Zero-Shot, CoT, PoT) themselves and compare against their own F-1 method. No acknowledgment of self-comparison bias (Lucic et al. 2018) or independent evaluation is discussed.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": true,
    408           "justification": "Tables 9–10 and Appendix B compare performance relative to computational cost across all methods. Efficiency ratio (accuracy/tokens) and prompt overhead are reported, showing F-1 achieves comparable or better accuracy with similar token costs.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": true,
    414           "justification": "Section 4.1 discusses why selected benchmarks test equation formalization and why GSM8K/MATH are excluded: 'they test sequential calculation rather than equation identification' and frontier models are at ceiling (96.8%). The paper justifies benchmark selection relative to what F-1 claims to improve.",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": false,
    419           "answer": false,
    420           "justification": "No scaffolding is involved — all methods are single-call prompting with no tools or orchestration.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": true,
    427           "answer": false,
    428           "justification": "No discussion of temporal leakage. OlympiadBench problems are from past olympiads, IMO-Bench from past IMO competitions — both available online well before frontier model training cutoffs. This is not addressed.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": true,
    433           "answer": false,
    434           "justification": "No discussion of feature leakage. The paper does not consider whether evaluation setup (e.g., problem formatting, answer type hints in OlympiadBench prompts) leaks information not available in real usage.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": true,
    439           "answer": false,
    440           "justification": "No discussion of whether benchmark problems share structural similarities with model training data or with each other.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": true,
    445           "answer": false,
    446           "justification": "No leakage detection or prevention method is applied — no canary strings, membership inference, temporal splits, or decontamination analysis.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "F-1 outperforms CoT by +5.76% and PoT by +8.42% on average across five models and four benchmarks",
    455       "evidence": "Table 4 macro-averaged overall row: F-1 61.06% vs CoT 55.30% vs PoT 52.64%",
    456       "supported": "moderate"
    457     },
    458     {
    459       "claim": "Equation formalization is the primary driver of F-1's gains, contributing roughly twice the improvement of adaptive selection on applied domains",
    460       "evidence": "Table 6 ablation on GPT-5: full F-1 64.00% vs −adaptive 58.00% (−6pp) vs −equation 55.50% (−8.5pp) on FinanceMath",
    461       "supported": "weak"
    462     },
    463     {
    464       "claim": "F-1 gains are largest on applied domains: +13.30% on FinanceMath over CoT",
    465       "evidence": "Table 4: FinanceMath F-1 average 56.30% vs CoT 43.00%",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "F-1 achieves 73% strategy selection accuracy on applied domains and reaches 81–84% of the upper bound",
    470       "evidence": "Tables 7–8: FinanceMath selection accuracy computed as (Adapt✓ + F-1 Only)/(differentiable problems); upper bound comparison in Table 8",
    471       "supported": "moderate"
    472     },
    473     {
    474       "claim": "The smallest tested model (Qwen3-30B) achieves +5.6% improvement over CoT, comparable to frontier models",
    475       "evidence": "Table 4 overall row: Qwen3-30B F-1 63.33% vs CoT 57.72% = +5.61pp",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "F-1 achieves 81–84% of the upper bound performance while maintaining single-call efficiency",
    480       "evidence": "Table 8: FinanceMath F-1/UB = 80.9%, OlympiadBench 84.1%, AICrypto 82.2%",
    481       "supported": "moderate"
    482     }
    483   ],
    484   "methodology_tags": [
    485     "benchmark-eval"
    486   ],
    487   "key_findings": "F-1 Prompting adds an explicit equation formalization phase before adaptive solving strategy selection (Direct/CoT/PoT), all within a single LLM call. Across five models and four benchmarks, F-1 outperforms CoT by +5.76% and PoT by +8.42% on average, with largest gains on applied domains (+13.30% on FinanceMath). An ablation on GPT-5 indicates equation formalization is the primary driver, contributing roughly twice the gain of adaptive strategy selection alone. F-1 achieves 73% strategy selection accuracy on applied domains and reaches 81–84% of the post-hoc performance upper bound, while adding only +68 prompt tokens over zero-shot.",
    488   "red_flags": [
    489     {
    490       "flag": "Ablation on single model only",
    491       "detail": "The ablation study supporting the key causal claim ('equation formalization is the primary driver') is conducted only on GPT-5, not validated across all five evaluated models; this finding may not generalize."
    492     },
    493     {
    494       "flag": "AICrypto n=18 — underpowered",
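    495       "detail": "AICrypto contains only 18 problems; results on this benchmark (e.g., F-1 87.54% vs CoT 80.30%) are treated as supporting evidence for applied domain gains. The Limitations section acknowledges the small benchmark size, but no confidence intervals or variance estimates are reported, so the extreme sampling variance at n=18 is never quantified."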
    496     },
    497     {
    498       "flag": "No statistical significance tests",
    499       "detail": "All comparative claims are made as raw percentage differences with no significance tests, confidence intervals, or variance estimates; it is unknown whether any observed differences exceed noise."
    500     },
    501     {
    502       "flag": "Proprietary model versions unspecified",
    503       "detail": "GPT-5 and Gemini 2.5 Pro are named without snapshot dates or specific model IDs, making results non-reproducible for two of the five tested models."
    504     },
    505     {
    506       "flag": "Benchmark selection excludes unfavorable domains",
    507       "detail": "Standard benchmarks (GSM8K, MATH) are excluded with the stated rationale that frontier models hit ceiling performance, but this also conveniently excludes settings where F-1 may not improve over CoT."
    508     },
    509     {
    510       "flag": "Contamination completely unaddressed",
    511       "detail": "No training cutoffs are stated for any model; FinanceMath (2023) and OlympiadBench (2024) likely appeared in training data of some models tested, potentially inflating all methods' performance."
    512     },
    513     {
    514       "flag": "No code released",
    515       "detail": "Reproduction requires reimplementing the full evaluation pipeline from prose descriptions; no scripts for inference, answer extraction, or LLM judging are provided."
    516     }
    517   ],
    518   "cited_papers": [
    519     {
    520       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    521       "relevance": "Primary baseline; F-1 is designed to outperform CoT by adding an equation formalization phase before reasoning."
    522     },
    523     {
    524       "title": "Program of Thoughts Prompting: Disentangling Computation from Reasoning for Numerical Reasoning Tasks",
    525       "relevance": "Second primary baseline; PoT generates executable code instead of natural language reasoning steps."
    526     },
    527     {
    528       "title": "Plan-and-Solve Prompting: Improving Zero-Shot Chain-of-Thought Reasoning by Large Language Models",
    529       "relevance": "Closest structural comparator — also single-call two-phase, but structures execution rather than mathematical representation."
    530     },
    531     {
    532       "title": "Tree of Thoughts: Deliberate Problem Solving with Large Language Models",
    533       "relevance": "Multi-call alternative requiring 10–100+ calls; contrasted with F-1's single-call approach to justify efficiency claims."
    534     },
    535     {
    536       "title": "OlympiadBench: A Challenging Benchmark for Promoting AGI with Olympiad-Level Bilingual Multimodal Scientific Problems",
    537       "relevance": "Primary evaluation benchmark providing both competition math and applied physics problems for controlled domain comparison."
    538     },
    539     {
    540       "title": "FinanceMath: Knowledge-Intensive Math Reasoning in Finance Domains",
    541       "relevance": "Key applied domain benchmark where F-1 shows its largest gains (+13.30% over CoT)."
    542     },
    543     {
    544       "title": "Adaptive-Solver Framework for Dynamic Strategy Selection in Large Language Model Reasoning",
    545       "relevance": "Multi-call adaptive routing comparator; F-1 claims to achieve similar adaptivity without multiple calls."
    546     },
    547     {
    548       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    549       "relevance": "Referenced in upper bound analysis methodology; used to frame the performance ceiling comparison in Table 8."
    550     }
    551   ],
    552   "engagement_factors": {
    553     "practical_relevance": {
    554       "score": 2,
    555       "justification": "F-1 is a plug-and-play prompting technique requiring no training; practitioners can immediately apply the provided prompt templates to any LLM for applied math tasks."
    556     },
    557     "surprise_contrarian": {
    558       "score": 1,
    559       "justification": "The finding that explicit equation formalization helps applied math is intuitive rather than counterintuitive; the main novelty is the empirical quantification, not a surprising reversal of expectations."
    560     },
    561     "fear_safety": {
    562       "score": 0,
    563       "justification": "No AI safety or risk concerns are raised; the ethics section briefly mentions potential for academic cheating but frames it as a minor concern."
    564     },
    565     "drama_conflict": {
    566       "score": 0,
    567       "justification": "No controversy, replication crisis angle, or adversarial framing; the paper is a straightforward prompting technique paper."
    568     },
    569     "demo_ability": {
    570       "score": 3,
    571       "justification": "Full prompt templates are provided in Appendix A; anyone can immediately reproduce F-1 by pasting the template into any LLM interface with a math problem."
    572     },
    573     "brand_recognition": {
    574       "score": 1,
    575       "justification": "Authors are from SCB 10X (Siam Commercial Bank venture arm), not a prominent AI research lab; the paper does evaluate GPT-5 and Gemini 2.5 Pro, which provide name recognition."
    576     }
    577   },
    578   "hn_data": {
    579     "threads": [],
    580     "top_points": 0,
    581     "total_points": 0,
    582     "total_comments": 0
    583   }
    584 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs