scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24694B)
      1 {
      2   "paper": {
      3     "title": "Scaling Test-Time Compute Without Verification or RL is Suboptimal",
      4     "authors": ["Amrith Setlur", "Nived Rajaraman", "Sergey Levine", "Aviral Kumar"],
      5     "year": 2025,
      6     "venue": "International Conference on Machine Learning",
      7     "arxiv_id": "2502.12118",
      8     "doi": "10.48550/arXiv.2502.12118"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["theoretical", "benchmark-eval"],
     13   "key_findings": "The paper proves that verifier-based (VB) methods for scaling test-time compute outperform verifier-free (VF) methods by a factor of √H when the base LLM is heterogeneous and anti-concentrated. Empirically, VB best-of-N search scales compute by 8× and data by 6× over VF SFT on MATH with 3B/8B Llama models. The s1 model trained with verifier-free distillation is outperformed by simple best-of-N search with a trained verifier. The theoretical separation grows super-linearly when both data and compute are scaled.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL or code archive link is provided in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses publicly available datasets: MATH benchmark and MATH500 evaluation set. The didactic planted subsequence problem is fully specified in the paper and appendix."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment setup listing library versions is provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions or scripts are provided. Experimental details are given in the appendix, but no runnable instructions are included."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Main results in Figures 5, 7, and 8 show point estimates without confidence intervals or error bars."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims VB outperforms VF based on comparing accuracy numbers (e.g., '8× scaling') without statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported with context: '8× compute scaling', '6× data efficiency', accuracy gaps of 9%, 17%, 20% (Figure 7c), and specific accuracy differences between VB and VF across budgets."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why specific data budgets (2^12 to 2^16) or evaluation set sizes were chosen."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviations, variance across seeds, or spread measures are reported for the experimental results."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares VB (BoN, RL) against VF (SFT) methods, and also compares against the base policy and expert policy (Figures 5, 7, 8)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include the s1 model (2025) and SFT following the approaches of Snell et al. (2024) and Qu et al. (2024), all of which are recent and relevant."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper ablates over: data budget scaling (Figure 5b, 7b), compute budget scaling (Figure 5a, 7a), base policy heterogeneity (Figure 6, 9), and combined scaling (Figure 5c, 7c)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses test-time efficiency (normalized J_r), accuracy on MATH500, accuracy on AIME24, and verifier accuracy as metrics."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant to the claims — the paper evaluates mathematical reasoning with ground-truth answers."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Training on MATH training set, evaluation on MATH500 benchmark and AIME24 (Appendix D). The didactic setup also separates train and test prompts."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results broken down by heterogeneity buckets (low/medium/high, Figure 9), easy/hard problem sets (Figure 10), and separate MATH500/AIME24 results (Figure 8)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses when VF outperforms VB: low heterogeneity settings (Figure 6), and identifies verifier accuracy decline as a source of RL performance degradation (Appendix C, Figure 12)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Figure 6 shows VF SFT outperforming VB RL when heterogeneity is low. Figure 9 shows VF SFT outperforming BoN on low-heterogeneity problem buckets. The paper explicitly discusses when its thesis does not hold."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims about √H separation, VB outperforming VF, and empirical corroboration on 3/8/32B models are supported by theorems (5.1, 5.8) and Figures 5, 7, 8."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims about verification enabling better scaling are supported by both formal proofs (Theorems 5.4, 5.7, 5.8) and controlled experiments varying one variable at a time (heterogeneity, data budget, compute budget)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper bounds its empirical findings to specific models (Llama-3.1/3.2 3B/8B, s1, GPT2-xl), specific benchmarks (MATH, AIME, planted subsequence), and acknowledges theoretical conditions (heterogeneity, anti-concentration) required for the separation."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses verifier accuracy as an alternative explanation for performance gaps (Figure 12), discusses when heterogeneity is low and VF outperforms VB (Figure 6), and discusses reward hacking as a failure mode of VB methods."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly defines what it measures (bi-level reward capturing both correctness and token efficiency) and discusses in Remark 5.10 whether improvements come from solving more problems vs. solving existing problems faster."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific models are named: Llama-3.1-8B-Instruct, Llama-3.2-3B-Instruct, GPT2-xl, and the s1 model with reference to its paper."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting in the traditional sense — it finetunes models with RL/SFT and evaluates math problem solving. The SFT data construction is described procedurally."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix C and D report: batch size 64, learning rates (1e-4 for RL, 2e-4 for SFT), KL penalty weight 0.2, Adam optimizer, 20k iterations for RL, 10000 iterations for MATH, weight decay 0.01, temperature 1.0 for sampling."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The approach is direct model finetuning and inference."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The SFT data construction is documented: rejection sampling from base policy, stitching incorrect + correct traces (Appendix D, with examples). Verifier training data collection is described: sampling from base LLM, 0/1 annotation."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8 'Discussion, Limitations, and Future Work' includes substantive discussion of limitations."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 8 identifies specific limitations: the analysis only separates VB vs VF (not sparse vs dense verification), bi-level rewards may not capture all reward types, and scaling to >32k contexts is not studied. Also discusses verifier accuracy as a practical concern."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 8 explicitly states what is NOT covered: dense/process-based rewards, generative rewards, scaling to very long contexts (>32k). The theoretical conditions (heterogeneity, anti-concentration) bound when results apply."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (training logs, generated traces, verifier predictions) is released."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Data collection is described in detail: rejection sampling from base policy for SFT data, sampling from base policy with 0/1 annotation for verifier training, binary search for bi-level location in didactic setup (Appendix C, D)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard benchmarks (MATH) and model-generated traces."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: base LLM → sample traces → annotate with rewards → train verifier/SFT → evaluate. SFT trace construction (stitching incorrect + correct responses) is detailed with examples in Appendix D."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgements section lists: Office of Naval Research (N00014-24-12206), JP Morgan AI PhD Fellowship, NSF Grants IIS-1901252 and CCF-2211209, Google Cloud TRC program, Lambda labs."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are disclosed: Carnegie Mellon University and UC Berkeley. No conflict with evaluated products."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funders (ONR, NSF, JP Morgan fellowship) do not have a direct financial stake in whether VB or VF methods perform better. Google Cloud and Lambda provided compute resources, not research direction."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is provided in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper evaluates Llama-3.1/3.2 models finetuned on MATH and tested on MATH500/AIME24 but does not state the training data cutoff for the base models."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether MATH problems could appear in Llama pre-training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "MATH was published in 2021, and the Llama models were trained after this. No discussion of contamination risk for MATH500 or AIME24 benchmarks."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "Token budgets are varied (2^9 to 2^13) but no API costs, wall-clock times, or per-example costs are reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The acknowledgements mention Google Cloud and Lambda labs for compute but no GPU hours, total training time, or compute budget is quantified."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No reporting of results across multiple random seeds. Single-run results appear to be reported."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not explicitly stated for any experiment."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Hyperparameters are reported but no search budget or number of configurations tried is stated."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No discussion of how the reported hyperparameters were selected or whether they were tuned on a validation set."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement both VB and VF baselines themselves without acknowledging potential bias in their implementations. They do compare against the externally-trained s1 model, partially mitigating this."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Performance is explicitly plotted as a function of compute budget (test-time tokens H) in Figures 5a, 7a, 8. Compute-matched evaluation is emphasized."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "The paper defines and formalizes what 'effective test-time compute scaling' means (Definition 4.2, bi-level reward Property 4.1), explicitly discussing what the evaluation measures and why."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved — the paper compares training methods (SFT vs RL/BoN) directly."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. MATH (2021) and AIME24 benchmarks could have been seen during Llama pre-training."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between training and test sets beyond using separate MATH splits."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is applied."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Verifier-based methods scale test-time compute by √H better than any verifier-free method when the base LLM is heterogeneous and anti-concentrated.",
    365       "evidence": "Theorems 5.1, 5.4, 5.7, 5.8 with formal proofs in Appendix A. Lower bound Ω(H/√n) for VF vs upper bound O(H/n) for VB.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "VB best-of-N scales compute by 8× and data efficiency by 6× over VF SFT on MATH with Llama 3B/8B models.",
    370       "evidence": "Figure 7(a)(b): at fixed n=2^14, BoN achieves at 2^10 tokens what SFT needs 2^13 tokens for; at fixed H=2^12, BoN at 2^13 data matches SFT at 2^16.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "The gap between VB and VF grows super-linearly when both data and compute are scaled jointly.",
    375       "evidence": "Figure 7(c) shows accuracy gap growing from 9% to 17% to 20% as log H increases; Figure 5(c) shows similar trend in didactic setup.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The s1 model trained with verifier-free distillation is outperformed by simple best-of-N search with a trained verifier.",
    380       "evidence": "Figure 8: BoN outperforms budget-forced s1 on both MATH500 and AIME24 across compute budgets.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Common pre-trained LLMs (Llama-3.1-8B) exhibit heterogeneous and anti-concentrated reward distributions.",
    385       "evidence": "Figure 9 shows distribution of σ_x skewed toward higher values; Figure 10 shows ~0.25 anti-concentration coefficient on both easy and hard MATH problems.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "When base policy heterogeneity is low, VF methods can outperform VB methods.",
    390       "evidence": "Figure 6: at low σ_b, SFT outperforms RL. Figure 9: VF SFT beats BoN on low-heterogeneity problem buckets.",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No error bars or variance reporting",
    397       "detail": "All experimental results (Figures 5-10) appear to be single-run results with no error bars, confidence intervals, or variance across seeds. Given that RL training can be highly variable (as Henderson et al. 2018 demonstrated), the reported gaps could be within noise."
    398     },
    399     {
    400       "flag": "No contamination analysis",
    401       "detail": "MATH benchmark was published in 2021. Llama models were likely trained on data including MATH solutions. No contamination analysis is performed, so possible contamination could inflate baseline accuracy and affect the VB/VF comparison."
    402     },
    403     {
    404       "flag": "Authors implement their own baselines",
    405       "detail": "Both VB and VF methods are implemented by the authors. The SFT baseline follows Snell et al. and Qu et al. but is not their original code. The s1 comparison (Figure 8) partially mitigates this by comparing against an externally trained model."
    406     }
    407   ],
    408   "cited_papers": [
    409     {
    410       "title": "SFT memorizes, RL generalizes: A comparative study of foundation model post-training",
    411       "authors": ["Tianzhe Chu", "Yuexiang Zhai", "Jihan Yang"],
    412       "year": 2025,
    413       "arxiv_id": "2501.17161",
    414       "relevance": "Systematic comparison of SFT vs RL post-training, finding RL generalizes better — directly supports this paper's thesis."
    415     },
    416     {
    417       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    418       "authors": ["DeepSeek-AI"],
    419       "year": 2025,
    420       "arxiv_id": "2501.12948",
    421       "relevance": "Major RL-trained reasoning model using 0/1 outcome rewards, exemplifying VB approach at scale."
    422     },
    423     {
    424       "title": "s1: Simple test-time scaling",
    425       "authors": ["Niklas Muennighoff", "Zitong Yang", "Weijia Shi"],
    426       "year": 2025,
    427       "arxiv_id": "2501.19393",
    428       "relevance": "Key VF baseline: distills Gemini Thinking traces via supervised learning, and is compared against directly in Figure 8."
    429     },
    430     {
    431       "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters",
    432       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    433       "year": 2024,
    434       "arxiv_id": "2408.03314",
    435       "relevance": "Foundational work on optimal test-time compute scaling that this paper builds upon theoretically and empirically."
    436     },
    437     {
    438       "title": "Training verifiers to solve math word problems",
    439       "authors": ["Karl Cobbe", "Vineet Kosaraju"],
    440       "year": 2021,
    441       "arxiv_id": "2110.14168",
    442       "relevance": "Introduced verifier-based approach for math reasoning that is the basis for the best-of-N method used in this paper."
    443     },
    444     {
    445       "title": "Is behavior cloning all you need? Understanding horizon in imitation learning",
    446       "authors": ["Dylan J Foster", "Adam Block", "Dipendra Misra"],
    447       "year": 2024,
    448       "arxiv_id": "2407.15007",
    449       "relevance": "Provides theoretical lower bounds for supervised finetuning that this paper extends to multi-prompt settings."
    450     },
    451     {
    452       "title": "RL on incorrect synthetic data scales the efficiency of LLM math reasoning by eight-fold",
    453       "authors": ["Amrith Setlur", "Saurabh Garg"],
    454       "year": 2024,
    455       "arxiv_id": "2406.14532",
    456       "relevance": "Prior work by same authors on using RL with synthetic data for math reasoning, foundational to this paper's approach."
    457     },
    458     {
    459       "title": "Generative verifiers: Reward modeling as next-token prediction",
    460       "authors": ["Lunjun Zhang", "Arian Hosseini"],
    461       "year": 2024,
    462       "arxiv_id": "2408.15240",
    463       "relevance": "Proposes generative verification approach that falls under this paper's VB category."
    464     },
    465     {
    466       "title": "STaR: Bootstrapping reasoning with reasoning",
    467       "authors": ["Eric Zelikman", "Yuhuai Wu", "Jesse Mu", "Noah Goodman"],
    468       "year": 2022,
    469       "relevance": "Influential self-training approach for reasoning that uses rejection sampling — a VF method in this paper's taxonomy."
    470     },
    471     {
    472       "title": "Training language models to self-correct via reinforcement learning",
    473       "authors": ["Aviral Kumar", "Vincent Zhuang", "Rishabh Agarwal"],
    474       "year": 2024,
    475       "arxiv_id": "2409.12917",
    476       "relevance": "RL-based self-correction approach that motivates the sequential revision SFT baseline used in this paper."
    477     },
    478     {
    479       "title": "Rewarding progress: Scaling automated process verifiers for LLM reasoning",
    480       "authors": ["Amrith Setlur", "Chirag Nagpal", "Adam Fisch"],
    481       "year": 2024,
    482       "arxiv_id": "2410.08146",
    483       "relevance": "Process verification for LLM reasoning, extends the verification paradigm studied in this paper."
    484     },
    485     {
    486       "title": "DeepScaleR: Surpassing o1-preview with a 1.5B model by scaling RL",
    487       "authors": ["Michael Luo", "Sijun Tan"],
    488       "year": 2025,
    489       "relevance": "Demonstrates VB RL scaling with outcome rewards on small models, validating this paper's thesis in practice."
    490     }
    491   ]
    492 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs