scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31643B)
      1 {
      2   "paper": {
      3     "title": "Formula-One Prompting: Adaptive Reasoning Through Equations For Applied Mathematics",
      4     "authors": [
      5       "Natapong Nitarach",
      6       "Pittawat Taveekitworachai",
      7       "Kunat Pipatanakul"
      8     ],
      9     "year": 2026,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2601.19302",
     12     "doi": "10.48550/arXiv.2601.19302"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "Formula-One Prompting (F-1) uses equation formalization as an intermediate representation before adaptive solving strategy selection, all within a single LLM call. Across 5 models and 4 math benchmarks (2,116 problems), F-1 outperforms Chain-of-Thought by +5.76% and Program-of-Thought by +8.42% on average, with largest gains on applied domains (+13.30% on FinanceMath, +7.24% on AICrypto). Ablation on GPT-5 shows equation formalization contributes roughly twice the improvement of adaptive selection, and strategy selection achieves 73% accuracy on applied domains, reaching 81–84% of the theoretical upper bound.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL, code archive, or link to implementation is provided anywhere in the paper. Only prompt templates are given in Appendix A."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses four publicly available benchmarks: IMO-Bench (He et al. 2024b), OlympiadBench (He et al. 2024a), FinanceMath (Zhao et al. 2024), and AICrypto (Wang et al. 2025), all referenced with citations."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, or environment setup details are provided. The paper mentions model names and API-based inference but does not specify library versions, Python version, or dependency information."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The prompt templates in Appendix A and evaluation details in Appendix E provide partial information, but not enough to reproduce the full pipeline without significant effort."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables 4–8 and throughout the paper are reported as point estimates only. No confidence intervals, error bars, or ± notation appear anywhere."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims 'F-1 outperforms CoT by +5.76%' and similar comparative claims throughout, but no statistical significance tests (p-values, t-tests, bootstrap tests, etc.) are reported for any comparison."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Effect sizes are reported with baseline context throughout: e.g., '+5.76% over CoT', '+13.30% on FinanceMath over CoT', with full baseline numbers in Table 4 (CoT 43.00% → F-1 56.30% on FinanceMath). The reader can assess magnitude."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No power analysis or formal justification for benchmark sizes. The Limitations section acknowledges small sizes ('AICrypto contains only 18 problems... OlympiadBench TP_physics includes 25 problems') but provides no justification beyond data scarcity."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported. Section 4.2 states 'we use a sampling temperature of 0' and 'each question is evaluated using a single generated output' — single-run deterministic results only."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Three single-call prompting baselines are compared: Zero-Shot (Wei et al. 2022a), Chain-of-Thought (Kojima et al. 2022), and Program-of-Thought (Chen et al. 2023b). Results in Table 4."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baselines (CoT, PoT, Zero-Shot) are the standard contemporary single-call prompting methods. The paper explicitly scopes its comparison to single-call methods (Table 1) and justifies excluding multi-call methods due to different compute budgets."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Table 6 presents an ablation study removing individual F-1 components: adaptive selection, equation formulation, and givens/targets identification, each tested on GPT-5 across three benchmarks."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper reports accuracy (Table 4), efficiency ratio (accuracy/tokens, Table 9), tokens per correct answer (Table 15), and strategy selection accuracy (Section 6.2). Multiple evaluation dimensions are covered."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation is included. Proof-based problems use LLM-as-Judge (Appendix E) with Gemini-3-Pro and GPT-5.1 as judges. All evaluation is automated — either regex-based or LLM-based."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are reported on standard benchmark test sets (IMO-Bench, OlympiadBench, FinanceMath, AICrypto). The prompts are fixed templates (Appendix A), not tuned on test data. No dev set is needed or used since the prompting approach is not learned."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Extensive per-category breakdowns in Appendix C: IMO-Bench by AnswerBench/ProofBench (Table 16), OlympiadBench by OE/TP math/physics (Table 17), FinanceMath by 7 financial categories (Table 18), AICrypto by 4 categories (Table 19)."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 6.2 analyzes strategy selection failures ('Adapt×' category in Table 7). Section 6.3 provides qualitative examples including baseline failure modes. The paper notes IMO-Bench's 90.7% failure rate for all methods and discusses where F-1 doesn't help."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several unfavorable results are reported: F-1 underperforms CoT on Risk Management in FinanceMath (Table 18: F-1 gets 11.11% vs CoT 22.22% for GPT-5), shows minimal gains on IMO-Bench (+0.78% over CoT overall), and shows near-zero gains on competition math (+0.44% OE_maths). F-1 also loses to Zero-Shot on IMO-Bench for GPT-5 (55.58 vs 56.26)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims are supported by Table 4: '+5.76% over CoT' (55.30→61.06), '+8.42% over PoT' (52.64→61.06), '+13.30% on FinanceMath over CoT' (43.00→56.30), physics gains larger than math in Table 5. All numbers match."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The main causal claim — 'equation formalization is the key component' — is supported by the ablation study (Table 6) using controlled single-variable removal. Each ablation variant removes one component while keeping others, constituting adequate causal identification for this type of claim."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The abstract concludes 'F-1 is more effective than CoT in applied mathematics problems,' bounded to applied math. The title specifies 'For Applied Mathematics.' Limitations section explicitly bounds scope: 'Our study focuses on mathematical reasoning in physics, finance, and cryptography.' Models below 30B are flagged as untested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 7 'Training Data Composition' discusses an alternative explanation: F-1's gains may reflect training data composition (applied math content naturally presents equations) rather than the prompting technique itself. The Limitations section discusses model capability dependence."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Claims stay close to measurement granularity. The paper frames results as benchmark accuracy improvements on specific benchmarks and specific domains, not as improvements to general 'mathematical reasoning.' The abstract concludes with the bounded claim about 'applied mathematics problems.'"
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Models are listed as 'GPT-5', 'Gemini 2.5 Pro', 'DeepSeek-V3.1', 'Qwen3-235B', 'Qwen3-30B' — marketing names without API versions, snapshot dates, or specific model identifiers. No version pinning is described."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Complete prompt templates for all four benchmarks and all four methods (Zero-Shot, CoT, PoT, F-1) are provided in Appendix A (Sections A.1–A.4), including both system and user prompts with the actual text used."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 4.2 states: 'we use a sampling temperature of 0 and leave all other inference hyperparameters at their default values. Since we use greedy decoding, each question is evaluated using a single generated output.'"
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. F-1 is a single-call prompting method — the LLM generates the full response in one generation with no tools, retry logic, or multi-step orchestration."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 4.2 describes the evaluation protocol: temperature=0, greedy decoding, regex-based answer extraction with numerical tolerance ε=10⁻⁶, and LLM-as-Judge for proof problems. Appendix E documents evaluation prompts and methodology for each benchmark."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "A dedicated 'Limitations' section appears after Section 8, spanning approximately one full page with multiple substantive paragraphs covering model sizes, benchmark limitations, domain scope, and method constraints."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Specific threats discussed: (1) model size floor not tested below 30B — 'equation formalization phase requires reliable symbolic abstraction,' (2) AICrypto n=18 and TP_physics n=25 are small, (3) limited model families, (4) F-1 'does not explicitly backtrack' when strategy is suboptimal."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Explicit scope boundaries: 'Our study focuses on mathematical reasoning in physics, finance, and cryptography'; 'We also do not evaluate F-1 on multiple-choice or simple arithmetic benchmarks (e.g., GSM8K)'; 'we do not include smaller models (e.g., 7B or 13B)'; 'Generalizing F-1 beyond equation-centric settings...would likely require different forms of formalization.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw data (individual model outputs, per-problem results, judge scores) is made available. Only aggregate results in tables are provided."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 4.1 describes the four benchmarks with citations, domain descriptions, sizes (Table 3), and evaluation types. Section 4.2 describes inference protocol. The benchmarks are well-established public datasets."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. All data comes from standard public benchmarks (IMO-Bench, OlympiadBench, FinanceMath, AICrypto)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline from input to evaluation is documented: problem text → prompt construction (Appendix A) → model inference at temp=0 → answer extraction (regex or LLM judge, Section 4.2 and Appendix E) → scoring with specified tolerance. Code execution for PoT is described (30s timeout, sandboxed, Appendix E.1)."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source, acknowledgments section, or grant numbers are mentioned anywhere in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are listed: 'SCB 10X, SCBX Group' — a corporate research entity. The affiliation is clearly stated under the author names."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed at all. Since the authors are from SCB 10X (a financial technology company) and one benchmark is FinanceMath, there is a potential interest in demonstrating improved financial mathematics capabilities, but this is not acknowledged."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement, financial disclosure, or conflict-of-interest declaration appears in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff dates are stated for any of the five models used (GPT-5, Gemini 2.5 Pro, DeepSeek-V3.1, Qwen3-235B, Qwen3-30B). This is critical since the benchmarks are publicly available."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of potential train/test overlap. OlympiadBench problems come from past olympiads and IMO-Bench problems from IMO problems — both have long been public and are likely in the training data of frontier models. The paper offers no analysis or acknowledgment of this."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "No discussion of benchmark contamination. OlympiadBench (2024), IMO-Bench (2024), and FinanceMath (2023) were all published before the likely training cutoffs of the frontier models used. No canary strings, temporal analysis, or contamination assessment."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. The paper evaluates prompting methods on automated benchmarks."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. Benchmark evaluation study only."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Appendix B provides detailed token efficiency analysis: average tokens per method/model/benchmark (Tables 11–14), efficiency ratios (Table 9), tokens per correct answer (Table 15), and prompt overhead comparison (Table 10: F-1 adds +68 tokens over Zero-Shot)."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No total computational budget is stated — no total API spend, total tokens consumed across all experiments, GPU hours, or wall-clock time for the full evaluation. Only per-problem averages are reported."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Results are from single deterministic runs (temperature=0, greedy decoding). No sensitivity analysis across different temperatures, sampling strategies, or repeated runs is performed."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Section 4.2 explicitly states: 'Since we use greedy decoding, each question is evaluated using a single generated output.' The number of runs (1) is clearly stated."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No mention of how the F-1 prompt design was developed — how many prompt variations were tried, what alternatives were considered, or how the final prompt was selected. The prompt appears hand-designed but no search budget is reported."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The paper presents a single prompt design (Section 3.3) without explaining how it was selected among alternatives. No validation set is mentioned for prompt development."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The paper makes numerous comparisons across 4 methods × 5 models × 4 benchmarks (80 cells in Table 4) plus subcategory breakdowns, without any correction for multiple comparisons."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement all baselines (Zero-Shot, CoT, PoT) themselves and compare against their own F-1 method. No acknowledgment of self-comparison bias (Lucic et al. 2018) or independent evaluation is discussed."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Tables 9–10 and Appendix B compare performance relative to computational cost across all methods. Efficiency ratio (accuracy/tokens) and prompt overhead are reported, showing F-1 achieves comparable or better accuracy with similar token costs."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Section 4.1 discusses why selected benchmarks test equation formalization and why GSM8K/MATH are excluded: 'they test sequential calculation rather than equation identification' and frontier models are at ceiling (96.8%). The paper justifies benchmark selection relative to what F-1 claims to improve."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved — all methods are single-call prompting with no tools or orchestration."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of temporal leakage. OlympiadBench problems are from past olympiads, IMO-Bench from past IMO competitions — both available online well before frontier model training cutoffs. This is not addressed."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of feature leakage. The paper does not consider whether evaluation setup (e.g., problem formatting, answer type hints in OlympiadBench prompts) leaks information not available in real usage."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether benchmark problems share structural similarities with model training data or with each other."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is applied — no canary strings, membership inference, temporal splits, or decontamination analysis."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "F-1 outperforms CoT by +5.76% and PoT by +8.42% on average across four benchmarks and five models",
    369       "evidence": "Table 4 shows overall macro-averaged accuracy: F-1 61.06% vs CoT 55.30% (+5.76%) and PoT 52.64% (+8.42%). F-1 is best in 18 of 20 benchmark-model combinations.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "Gains are largest on applied domains: +13.30% on FinanceMath over CoT and +7.24% on AICrypto",
    374       "evidence": "Table 4: FinanceMath average F-1 56.30% vs CoT 43.00% (+13.30%), AICrypto F-1 87.54% vs CoT 80.30% (+7.24%). Consistent pattern across models.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Within OlympiadBench, F-1 gains are larger on physics than pure math, supporting the hypothesis that equation formalization benefits applied domains most",
    379       "evidence": "Table 5 (GPT-5 only): physics OE +2.55%, math OE +0.44%; physics TP +4.00% (n=25), math TP +3.77%.",
    380       "supported": "weak"
    381     },
    382     {
    383       "claim": "Equation formalization is the most critical component, contributing roughly twice the improvement of adaptive selection on applied domains",
    384       "evidence": "Table 6 ablation on GPT-5: removing equation formulation drops FinanceMath by 8.5pp (64.0→55.5), while removing adaptive selection drops it by 6.0pp (64.0→58.0). Pattern consistent across three benchmarks.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "F-1 achieves 73% strategy selection accuracy on applied domains and reaches 81–84% of the theoretical upper bound",
    389       "evidence": "Section 6.2 and Tables 7–8: FinanceMath selection accuracy 73.0%, reaching 80.9% of upper bound. OlympiadBench 69.9% accuracy, 84.1% of upper bound.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "F-1 maintains single-call efficiency with only +68 additional input tokens over Zero-Shot",
    394       "evidence": "Table 10: F-1 averages 465 input tokens vs Zero-Shot 397 (+68). Single API call confirmed. Efficiency ratio in Table 9 shows F-1 achieves 1.51 vs CoT 1.26 and PoT 1.24.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "The smallest model (Qwen3-30B) benefits comparably to frontier models from F-1, with +5.6% improvement over CoT",
    399       "evidence": "Table 4: Qwen3-30B overall F-1 63.33% vs CoT 57.72% (+5.61%). Gains are comparable to frontier models (GPT-5 +6.24%, Gemini +2.33%).",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "Tiny benchmark with outsized claims",
    406       "detail": "AICrypto has only 18 problems. Results vary enormously (e.g., GPT-5 scores 98.50% vs Qwen3-235B 76.50%). The paper reports +7.24% average gains on AICrypto in the abstract without prominently noting this is based on 18 items. Table 3 footnotes this, and Limitations acknowledges it, but it still appears as a headline claim."
    407     },
    408     {
    409       "flag": "No statistical significance testing",
    410       "detail": "All comparative claims ('outperforms by +5.76%') are based on raw accuracy differences across the 80 method × model × benchmark cells of Table 4, with no significance tests, confidence intervals, or multiple comparison corrections. Given single-run deterministic decoding, there is no measure of result stability."
    411     },
    412     {
    413       "flag": "Ablation only on one model",
    414       "detail": "The ablation study (Table 6) showing equation formalization as the key component is conducted only on GPT-5. It is unclear whether the component contribution pattern holds for other models, especially the open-source ones that show different absolute performance profiles."
    415     },
    416     {
    417       "flag": "No contamination analysis",
    418       "detail": "Frontier models (GPT-5, Gemini 2.5 Pro) may have been trained on OlympiadBench and IMO-Bench problems, which are publicly available competition math/physics problems. If models memorized solutions, prompting method differences could reflect different ways of triggering memorized answers rather than genuine reasoning improvements."
    419     },
    420     {
    421       "flag": "LLM-as-Judge without human validation",
    422       "detail": "Proof-based evaluation (IMO-ProofBench, AICrypto, OlympiadBench TP) relies on LLM judges (Gemini-3-Pro, GPT-5.1) with no human validation of judge accuracy. The qualitative examples in Appendix D show cases where judges give 7/10 vs 8/10 — small score differences that can decide whether an answer counts as correct. Judge reliability is assumed, not demonstrated."
    423     },
    424     {
    425       "flag": "Corporate affiliation without COI disclosure",
    426       "detail": "Authors are from SCB 10X (a financial technology company under SCBX Group). One of the four benchmarks is FinanceMath, where F-1 shows its largest gains (+13.30%). No conflict-of-interest statement or funding disclosure is provided."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    432       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Brian Ichter", "Fei Xia", "Ed Chi", "Quoc Le", "Denny Zhou"],
    433       "year": 2022,
    434       "relevance": "Foundational prompting technique for LLM reasoning; the primary baseline that F-1 improves upon."
    435     },
    436     {
    437       "title": "Program of thoughts prompting: Disentangling computation from reasoning for numerical reasoning tasks",
    438       "authors": ["Wenhu Chen", "Xueguang Ma", "Xinyi Wang", "William W Cohen"],
    439       "year": 2023,
    440       "relevance": "Code-based reasoning alternative to CoT; baseline method showing different intermediate representation for math reasoning."
    441     },
    442     {
    443       "title": "Plan-and-solve prompting: Improving zero-shot chain-of-thought reasoning by large language models",
    444       "authors": ["Lei Wang", "Wanyu Xu", "Yihuai Lan"],
    445       "year": 2023,
    446       "relevance": "Single-call two-phase prompting method closest to F-1's architecture; comparison point for planning vs formalization."
    447     },
    448     {
    449       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    450       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao", "Izhak Shafran", "Thomas L Griffiths", "Yuan Cao", "Karthik Narasimhan"],
    451       "year": 2023,
    452       "arxiv_id": "2305.10601",
    453       "relevance": "Multi-call reasoning method exploring multiple paths; represents the compute-heavy alternative to F-1's single-call approach."
    454     },
    455     {
    456       "title": "Graph of thoughts: Solving elaborate problems with large language models",
    457       "authors": ["Maciej Besta", "Nils Blach", "Ales Kubicek"],
    458       "year": 2024,
    459       "relevance": "Graph-based multi-call reasoning framework; another high-compute reasoning approach contrasted with F-1's efficiency."
    460     },
    461     {
    462       "title": "Adaptive-solver framework for dynamic strategy selection in large language model reasoning",
    463       "authors": ["Jianpeng Zhou", "Wanjun Zhong", "Yanlin Wang", "Jiahai Wang"],
    464       "year": 2024,
    465       "arxiv_id": "2310.01446",
    466       "relevance": "Classifier-based routing across multiple LLM calls for math; closest prior work to F-1's adaptive selection but multi-call."
    467     },
    468     {
    469       "title": "Self-consistency improves chain of thought reasoning in language models",
    470       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    471       "year": 2023,
    472       "relevance": "Multiple reasoning path sampling technique; upper bound analysis methodology referenced by F-1."
    473     },
    474     {
    475       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    476       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    477       "year": 2023,
    478       "relevance": "Cost-efficient LLM usage framework; F-1 references its token efficiency metrics."
    479     },
    480     {
    481       "title": "OlympiadBench: A challenging benchmark for promoting AGI with olympiad-level bilingual multimodal scientific problems",
    482       "authors": ["Chaoqun He", "Renjie Luo", "Yuzhuo Bai"],
    483       "year": 2024,
    484       "arxiv_id": "2402.14008",
    485       "relevance": "Primary evaluation benchmark containing both math and physics olympiad problems; enables controlled applied vs pure math comparison."
    486     },
    487     {
    488       "title": "FinanceMath: Knowledge-intensive math reasoning in finance domains",
    489       "authors": ["Yilun Zhao", "Hongjun Liu", "Yitao Long", "Rui Zhang", "Chen Zhao", "Arman Cohan"],
    490       "year": 2024,
    491       "arxiv_id": "2311.09797",
    492       "relevance": "Applied mathematics benchmark in finance domain; shows largest F-1 improvements and tests equation-based reasoning in real-world domain."
    493     },
    494     {
    495       "title": "AICrypto: A comprehensive benchmark for evaluating cryptography capabilities of large language models",
    496       "authors": ["Yu Wang", "Yijian Liu", "Liheng Ji"],
    497       "year": 2025,
    498       "arxiv_id": "2507.09580",
    499       "relevance": "Cryptography capabilities benchmark for LLMs; tests formal proof reasoning where F-1 shows strong gains."
    500     },
    501     {
    502       "title": "LLM-SR: Scientific equation discovery via programming with large language models",
    503       "authors": ["Parshin Shojaee", "Kazem Meidani", "Amir Barati Farimani", "Chandan K Reddy"],
    504       "year": 2024,
    505       "relevance": "Scientific equation discovery using LLMs; supports the broader idea that structured equation representations improve LLM reasoning."
    506     },
    507     {
    508       "title": "Large language models for mathematical reasoning: Progresses and challenges",
    509       "authors": ["Janice Ahn", "Rishu Verma", "Renze Lou", "Di Liu", "Rui Zhang", "Wenpeng Yin"],
    510       "year": 2024,
    511       "arxiv_id": "2402.00157",
    512       "relevance": "Survey of LLM mathematical reasoning techniques; provides context for the landscape of prompting approaches F-1 contributes to."
    513     }
    514   ]
    515 }
	ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs