scan.json (19692B)
1 { 2 "paper": { 3 "title": "Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters", 4 "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2408.03314" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL or code archive is provided in the paper." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses the publicly available MATH benchmark and PRM800k dataset from Lightman et al. Both are publicly available." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No environment specifications, requirements files, or dependency lists are provided." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions are included. The paper describes methodology but does not provide runnable scripts or a README." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results are reported as point estimates (accuracy percentages) without confidence intervals or error bars on figures." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims (e.g., '4x more efficient', 'outperform a 14x larger model') without any statistical significance tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports percentage improvements with baseline context, e.g., '+21.6%', '+16.7%' relative improvement in accuracy (Figure 1), and '4x less computation' efficiency gains." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The test set is 500 questions from MATH. No justification for why this size is sufficient for the claims made, and no power analysis." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or spread measures are reported across runs. Results appear to be single-run numbers." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple baselines are compared: best-of-N sampling, majority voting, ORM, and greedy pass@1 from a 14x larger model." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include contemporary approaches like PRM-based search, majority voting, and best-of-N weighted selection from recent work (Lightman et al. 2023, Li et al. 2023)." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Extensive ablations are provided: PRM aggregation strategies (Appendix E), PRM vs ORM (Appendix F), revision model with/without history (Appendix J), ReSTEM revision model (Appendix K), sequential-to-parallel ratio sweeps." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "Only accuracy on MATH is reported. No other metrics (e.g., calibration, cost-efficiency curves as a formal metric) are used." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is a math reasoning benchmark evaluation where ground-truth answers are available; human evaluation is not relevant." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses a 12k train / 500 test split from Lightman et al. and employs two-fold cross-validation on the test set for compute-optimal strategy selection (Section 3.2)." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are extensively broken down by difficulty level (5 bins) throughout Figures 3-9 and appendix figures." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses where test-time compute fails: hard questions (difficulty bins 4/5) show limited gains, and the ReSTEM revision model degrades with more sequential revisions (Appendix K). Example outputs show failure trajectories (Figures 17-23)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Several negative results: ReSTEM optimization hurts revision performance (Appendix K), test-time compute is less effective than pretraining on hard questions (Section 7, Figure 9), PRM trained on PRM800k data was ineffective due to distribution shift (Section 5.1)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of '4x improvement over best-of-N' and 'outperform 14x larger model' are supported by Figures 1 and 9 with appropriate caveats about difficulty-dependence." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims via ablations (e.g., removing components, varying ratios) with controlled single-variable manipulation. The compute-optimal strategy is validated via cross-validation." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title claims broad applicability ('Scaling LLM Test-Time Compute') but results are only on MATH with PaLM 2-S*. The paper acknowledges 'we believe that our findings likely transfer to similar models' (Section 4) without evidence." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper discusses alternative explanations: distribution shift affecting PRM performance (Section 5.1), difficulty bins being computed with oracle vs predicted difficulty, and the possibility that PRM training acts as representation learning rather than inference-time tool (Appendix E)." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper uses 'PaLM 2-S* (Codey)' without specifying an exact version, snapshot date, or API version." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "Appendix G describes the prompting approach (4-shot from PRM800k phase 1 training split) but does not provide the actual prompt text." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "PRM training hyperparameters are reported in Appendix D (lr 3e-5, batch size 128, dropout 0.05, Adam betas). Revision model hyperparameters in Appendix H (lr 1e-5, batch size 128). Beam search parameters detailed in Section 5.2." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used; this is a search/revision methodology study." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "PRM training data generation is documented in Appendix D (16 samples per question, 16 MC rollouts per step, filtering invalid answers). Revision model data generation in Appendix H (64 outputs per question, edit-distance-based selection)." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 8 (Discussion and Future Work) discusses specific limitations: difficulty estimation cost, lack of combined PRM+revision experiments, limited gains on hard problems." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Specific threats discussed: difficulty estimation requires non-trivial compute itself (Section 3.2), oracle difficulty bins use ground-truth not available in practice, single model family tested (Section 4)." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly states what it did NOT test: no combination of PRM tree-search with revisions, no critique-and-revise methods, and acknowledges hard questions remain unsolved (Section 8)." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "Raw experimental results (per-question predictions, generated solutions) are not released." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Data collection for PRM training and revision model training is described in detail in Appendices D and H, including sampling procedures and filtering criteria." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; uses standard benchmark (MATH)." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline from sampling model outputs to training verifiers/revision models to evaluation is documented across Sections 4-6 and Appendices D, H, J." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No explicit funding disclosure. Work done during internship at Google DeepMind is stated but no grants or funding sources are listed." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly stated: UC Berkeley and Google DeepMind, with note that work was done during internship at Google DeepMind." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "Google DeepMind is both the employer/funder and provider of the PaLM 2 models being evaluated. Google has a financial interest in demonstrating effective use of inference compute." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "No training data cutoff date is stated for PaLM 2-S*." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether MATH problems appeared in PaLM 2's pretraining data." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "MATH was published in 2021; PaLM 2 was trained after this. No contamination analysis is provided." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The paper discusses compute in terms of FLOPs ratios and generation budgets but does not report actual API costs, wall-clock time, or tokens consumed." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total GPU hours, training time, or hardware specifications are provided for the experiments." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Compute-optimal scaling of test-time compute can improve efficiency by more than 4x compared to best-of-N baseline.", 286 "evidence": "Figures 1 and 5 show compute-optimal strategy matching best-of-N performance with ~4x less compute on both revisions and PRM search settings.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "On easy and intermediate questions, test-time compute with a smaller model can outperform a 14x larger model in a FLOPs-matched evaluation.", 291 "evidence": "Figure 9 shows that on difficulty bins 1-3 with R<<1 or R~=1, the compute-optimal strategy with PaLM 2-S* exceeds greedy performance of the 14x larger model.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "The effectiveness of test-time compute scaling critically depends on prompt difficulty.", 296 "evidence": "Figures 3-9 consistently show different optimal strategies for different difficulty bins, with search/revisions helping easy questions but providing diminishing returns on hard ones.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "On the hardest questions, scaling pretraining compute is more effective than scaling test-time compute.", 301 "evidence": "Figure 9 shows on difficulty bins 4/5 with R>=1, the 14x larger model outperforms test-time compute scaling with the smaller model.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "PRM-based beam search outperforms best-of-N on easy questions but underperforms on hard questions.", 306 "evidence": "Figure 3 shows beam search exceeds best-of-N on difficulty bins 1-3 but falls below on bins 4-5.", 307 "supported": "strong" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "Test-time compute scaling effectiveness depends critically on prompt difficulty relative to the base model's capabilities. A compute-optimal strategy that adaptively selects between revision and search methods based on question difficulty achieves 4x efficiency gains over best-of-N baselines. On easy-to-medium difficulty math problems, a smaller model with additional test-time compute can outperform a 14x larger model in FLOPs-matched comparisons, but on the hardest problems, additional pretraining remains more effective.", 312 "red_flags": [ 313 { 314 "flag": "No error bars or variance reporting", 315 "detail": "All results are reported as point estimates without confidence intervals, error bars, or multi-run variance. For a paper making quantitative efficiency claims (4x improvement), this is a significant omission." 316 }, 317 { 318 "flag": "Single model family", 319 "detail": "All experiments use PaLM 2-S* only. The paper claims findings 'likely transfer to similar models' without evidence. Generalization to other model families is unknown." 320 }, 321 { 322 "flag": "Proprietary model", 323 "detail": "PaLM 2-S* is a proprietary Google model not publicly available, making independent reproduction impossible." 324 }, 325 { 326 "flag": "Benchmark contamination unaddressed", 327 "detail": "MATH benchmark was published in 2021 and PaLM 2 was trained after this date. No contamination analysis is provided, which could affect absolute performance numbers and difficulty calibration." 328 }, 329 { 330 "flag": "Company evaluating own product", 331 "detail": "Three of four authors are affiliated with Google DeepMind, and the paper evaluates Google's PaLM 2 model. No conflict of interest statement is provided." 332 } 333 ], 334 "cited_papers": [ 335 { 336 "title": "Let's verify step by step", 337 "authors": ["H. Lightman", "V. Kosaraju", "Y. Burda", "H. Edwards", "B. Baker", "T. Lee", "J. Leike", "J. Schulman", "I. Sutskever", "K. Cobbe"], 338 "year": 2023, 339 "relevance": "Foundational work on process reward models (PRMs) for step-level verification of LLM reasoning, directly extended in this paper." 340 }, 341 { 342 "title": "Self-refine: Iterative refinement with self-feedback", 343 "authors": ["A. Madaan"], 344 "year": 2023, 345 "relevance": "Key prior work on LLM self-revision that this paper builds upon and compares against." 346 }, 347 { 348 "title": "Training compute-optimal large language models", 349 "authors": ["J. Hoffmann"], 350 "year": 2022, 351 "relevance": "Chinchilla scaling laws for pretraining compute; this paper extends the compute-optimality concept to inference time." 352 }, 353 { 354 "title": "Beyond chinchilla-optimal: Accounting for inference in language model scaling laws", 355 "authors": ["N. Sardana", "J. Frankle"], 356 "year": 2023, 357 "arxiv_id": "2401.00448", 358 "relevance": "Directly relevant work on trading off training and inference compute in LLM scaling laws." 359 }, 360 { 361 "title": "Tree of thoughts: Deliberate problem solving with large language models", 362 "authors": ["S. Yao"], 363 "year": 2023, 364 "relevance": "Tree-search approach for LLM reasoning that this paper's beam search methods relate to." 365 }, 366 { 367 "title": "Reflexion: Language agents with verbal reinforcement learning", 368 "authors": ["N. Shinn"], 369 "year": 2023, 370 "relevance": "Agentic approach to LLM self-improvement through verbal feedback, related to revision mechanisms studied here." 371 }, 372 { 373 "title": "Chain-of-thought prompting elicits reasoning in large language models", 374 "authors": ["J. Wei"], 375 "year": 2023, 376 "relevance": "Foundational prompting technique for LLM reasoning that underpins the step-by-step format used in this paper." 377 }, 378 { 379 "title": "Large language models cannot self-correct reasoning yet", 380 "authors": ["J. Huang"], 381 "year": 2023, 382 "relevance": "Negative result on LLM self-correction that motivates this paper's investigation of when test-time compute helps." 383 }, 384 { 385 "title": "Beyond human data: Scaling self-training for problem-solving with language models", 386 "authors": ["A. Singh"], 387 "year": 2024, 388 "relevance": "ReSTEM method used in this paper's revision model experiments (Appendix K)." 389 }, 390 { 391 "title": "A critical evaluation of ai feedback for aligning large language models", 392 "authors": ["A. Sharma"], 393 "year": 2024, 394 "arxiv_id": "2402.12366", 395 "relevance": "Evaluates AI feedback effectiveness for LLM alignment, relevant to understanding self-improvement limitations." 396 } 397 ] 398 }