scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24993B)
      1 {
      2   "paper": {
      3     "title": "Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models",
      4     "authors": [
      5       "Yangzhen Wu",
      6       "Zhiqing Sun",
      7       "Shanda Li",
      8       "Sean Welleck",
      9       "Yiming Yang"
     10     ],
     11     "year": 2024,
     12     "venue": "ICLR 2025",
     13     "arxiv_id": "2408.00724"
     14   },
     15   "scan_version": 2,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval", "theoretical"],
     18   "key_findings": "Scaling inference compute with smaller models and advanced inference strategies can outperform larger models at the same compute budget. The proposed REBASE tree search algorithm achieves Pareto-optimal cost-performance tradeoffs, with Llemma-7B + REBASE outperforming Llemma-34B with standard methods while using 2x fewer FLOPs. Theoretical analysis shows sampling-based voting converges exponentially but saturates at a limit determined by the model's output distribution, motivating more sophisticated inference algorithms.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "A project page URL is provided (https://thu-wyz.github.io/inference-scaling/) in the author block, which serves as a release page for the work."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All benchmarks used (MATH, GSM8K, MBPP, MetaMath, Math-Shepherd) are publicly available datasets. The paper references them with citations."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. Table 2 lists fine-tuning hyperparameters but not software dependencies or library versions."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions are provided in the paper. Hyperparameters are listed in Appendix C but no runnable commands or scripts are described."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Section 4.1 states 'Each configuration is run multiple times to calculate the mean and variance, which mitigates effects from randomness.' Figures show shaded regions indicating variance."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No statistical significance tests are reported. Claims like 'REBASE outperforms sampling' are based on visual comparison of curves and point estimates without p-values or formal tests."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Effect sizes are reported with context throughout: '2× less FLOPs', '7 times less compute' (Table 1), specific accuracy numbers with baselines (e.g., 45.5% vs 46.8% MATH500 accuracy)."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification for why MATH500 (500 problems) or GSM8K test set sizes were used. No power analysis or discussion of whether sample sizes are adequate for the conclusions drawn."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Section 4.1: 'Each configuration is run multiple times to calculate the mean and variance.' Figures display variance bands."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Comprehensive baselines: greedy search, majority voting, best-of-n, weighted majority voting, and MCTS. Both sampling-based and tree-search variants are compared."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "MCTS variants from 2023-2024 are included (Zhang et al. 2023, Zhou et al. 2024, Liu et al. 2024). Self-consistency/majority voting (Wang et al. 2023) is contemporary."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No ablation study of REBASE components (e.g., effect of balance temperature Tb, reward model quality, expansion budget formula). The MATH-easy vs MATH-hard analysis is a breakdown, not an ablation."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Only accuracy/error rate is used as the evaluation metric throughout. While multiple benchmarks are tested (MATH, GSM8K, MBPP), the metric is always accuracy or pass rate."
     89       },
     90       "human_evaluation": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Human evaluation is irrelevant — the benchmarks have ground-truth mathematical answers that can be verified automatically."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "MATH500 test set and GSM8K test set are standard held-out splits. Models are fine-tuned on MetaMath training data, evaluated on separate test sets."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Figure 8 breaks results down by MATH difficulty level (easy: levels 1-2, hard: levels 3-5). Results are shown per-model and per-strategy throughout."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "MCTS failure mode is discussed: 'MCTS underperforms the sampling-based methods at each compute budget, likely due to its costly rollouts' and its production of 'many unfinished solutions.' Saturation behavior is analyzed."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "MCTS is shown to underperform simpler sampling methods — a significant negative result given MCTS's popularity. Saturation of sampling methods is documented as a limitation."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims — smaller models outperform larger ones at fixed compute, REBASE is Pareto-optimal, Llemma-7B outperforms Llemma-34B — are all supported by Figures 4-7 and Table 1."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims are supported by controlled single-variable manipulation: varying model size while fixing strategy, or varying strategy while fixing model. The inference strategy is the only variable changed in each comparison."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "Title claims 'Inference Scaling Laws' and 'Problem-Solving with Language Models' broadly, but experiments are primarily on mathematical reasoning benchmarks (MATH, GSM8K) with one small code generation experiment (MBPP, Appendix D.2). No natural language reasoning, commonsense, or other problem types tested."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "No substantive discussion of alternative explanations. For example, the reward model quality could explain much of REBASE's advantage (it uses the same Llemma-34B reward model for both 7B and 34B policy models), but this confound is not discussed."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Claims match measurement granularity. The paper measures accuracy on math benchmarks and claims compute-optimal inference for math problem-solving. No overreach to broader 'reasoning' or 'intelligence' framing."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific model names and sizes are provided: Pythia-410M/1.4B/2.8B/6.9B/12B, Llemma-7B, Llemma-34B, Mistral-7B, Llama3-8B-Instruct. These are specific open-source model checkpoints."
    148       },
    149       "prompts_provided": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "The paper uses fine-tuned models that take math problems as direct input. No custom prompting or prompt engineering is involved — inputs are benchmark questions."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Table 2 provides fine-tuning hyperparameters (LR, batch size, epochs, max seq length, dtype). Appendix C reports inference parameters: temperature 1.0, max tokens 1024, REBASE Tb=0.1, MCTS C=1 and expansion configurations."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The inference strategies (sampling, tree search) are the methods being studied, not scaffolding around an agent."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": false,
    167         "justification": "Appendix C mentions 'We preprocess the MetaMath Dataset to make the solutions in a stepwise format' but provides no detail on how this preprocessing was done."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No dedicated limitations section. The conclusion mentions some caveats about future work but does not substantively discuss limitations."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No threats to validity are discussed anywhere in the paper."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No explicit statement of what the results do not show. The paper does not bound its conclusions to math or acknowledge that inference scaling laws may differ for other task types."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Generated solutions, reward model scores, and per-problem results are not released. Only aggregated accuracy numbers and FLOPs are reported."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Data sources are clearly described: MATH500 test set, GSM8K test set, MetaMath for fine-tuning, Math-Shepherd for reward model training. All with citations."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. All data comes from standard public benchmarks."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is documented: fine-tune policy models on MetaMath → fine-tune reward model on Math-Shepherd → generate solutions via various strategies → aggregate via voting/selection → evaluate accuracy."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgments section: 'Zhiqing Sun acknowledges the support of the Google Fellowship. Sean Welleck thanks NSF SCALE (NSF DMS 2134012) and Convergent Research.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All authors' affiliations are listed: Tsinghua University and Carnegie Mellon University. No company products are being evaluated."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Funding is from Google Fellowship and NSF — neither has a direct financial interest in the specific inference scaling law findings or REBASE algorithm."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is included in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No training data cutoff dates are stated for any of the base models (Pythia, Llemma, Mistral, Llama3). The pretraining data composition is not discussed."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No discussion of whether MATH or GSM8K problems appear in the pretraining data of any model used. MetaMath is derived from these benchmarks' training splits, but overlap with test sets is not addressed."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "GSM8K (2021) and MATH (2021) predate all models used, creating contamination risk. This is not discussed. MetaMath is derived from these datasets, adding further risk of leakage."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Inference FLOPs per question are reported extensively throughout — this is the core axis of analysis. Table 1 provides specific FLOPs values for each configuration."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "Total computational budget (GPU hours, hardware, wall-clock time for all experiments) is not stated. Only per-question inference FLOPs are reported. Fine-tuning compute is not quantified."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4.1: 'Each configuration is run multiple times to calculate the mean and variance, which mitigates effects from randomness.' Variance is shown in figures."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper says 'run multiple times' but never states the exact number of runs per configuration."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "No hyperparameter search budget is reported. The REBASE balance temperature Tb=0.1 and MCTS C=1 appear tuned but no search process is described."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "REBASE uses Tb=0.1 without justification for this specific value. MCTS expansion configurations (4/8/16 children) are stated but not justified."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors propose REBASE and compare it against their own implementations of baselines (sampling, MCTS). No acknowledgment of self-comparison bias per Lucic et al. (2018)."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "This is the paper's central contribution. All main results (Figures 1, 4-7) plot performance as a function of inference FLOPs, enabling direct compute-matched comparisons."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "Brief mention that math 'allows us to accurately evaluate problem solving ability' (Section 4.1) but no substantive discussion of whether MATH/GSM8K actually measure the claimed capabilities or their limitations as benchmarks."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "The same reward model (Llemma-34B) is used across all inference strategies and model sizes, controlling for the reward model confound. Each comparison varies only one factor (model size OR strategy)."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "MATH (2021) and GSM8K (2021) predate the training of models like Llama3 (2024). No discussion of whether benchmark solutions appeared in pretraining data."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the evaluation setup leaks information. The reward model is trained on Math-Shepherd which is derived from the same problem domain."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "MetaMath training data is derived from MATH and GSM8K training splits. No verification that train and test problems are truly independent or that near-duplicates don't exist."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No leakage detection or prevention method is applied."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "Scaling inference compute with smaller models can be more computationally efficient than scaling model parameters.",
    370       "evidence": "Figure 1 shows Pythia models on GSM8K where smaller models (410M-2.8B) outperform larger ones at lower compute budgets. Figures 4-5 show Llemma-7B achieves comparable accuracy to Llemma-34B with 2x fewer FLOPs.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "REBASE is Pareto-optimal across all tested compute budgets, outperforming sampling and MCTS methods.",
    375       "evidence": "Figures 4-7 show REBASE dominates at all compute budgets across MATH, GSM8K, and multiple models (Llemma-7B/34B, Mistral-7B). Table 1 shows REBASE achieves better accuracy with 7x less compute than sampling.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Sampling-based voting converges exponentially to a limit determined by the model's output distribution, with diminishing returns.",
    380       "evidence": "Theorems 1 and 2 prove convergence bounds with formal proofs in Appendix A. Empirical saturation is visible in Figures 1, 6, and 7.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "MCTS has unfavorable cost-performance tradeoffs compared to simpler sampling methods.",
    385       "evidence": "Figure 4 shows MCTS underperforms sampling-based methods at each compute budget on MATH for both 7B and 34B models. Attributed to costly rollouts producing unfinished solutions.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "REBASE yields greater gains on hard problems than easy problems.",
    390       "evidence": "Figure 8 shows comparable performance on MATH-easy (levels 1-2) but significant advantage on MATH-hard (levels 3-5) for both Llemma-7B and 34B.",
    391       "supported": "strong"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Same reward model used for all comparisons",
    397       "detail": "All experiments use a Llemma-34B reward model. REBASE relies more heavily on the reward model than sampling-based methods. The reward model quality confound is not discussed — REBASE's advantage may partly stem from having a strong reward model rather than the search algorithm itself."
    398     },
    399     {
    400       "flag": "No limitations section",
    401       "detail": "The paper has no dedicated limitations section and does not discuss threats to validity, scope boundaries, or conditions under which the findings might not hold."
    402     },
    403     {
    404       "flag": "Contamination risk unaddressed",
    405       "detail": "MATH and GSM8K benchmarks (2021) predate all models used. MetaMath training data is derived from these benchmarks. No contamination analysis is performed."
    406     },
    407     {
    408       "flag": "Generalization beyond math unclear",
    409       "detail": "Title claims general 'Inference Scaling Laws' and 'Problem-Solving' but experiments are overwhelmingly on mathematical reasoning. The MBPP code generation experiment (Appendix D.2) is a small addition without the full scaling analysis."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Training compute-optimal large language models",
    415       "authors": ["Jordan Hoffmann", "Sebastian Borgeaud", "Arthur Mensch"],
    416       "year": 2022,
    417       "arxiv_id": "2203.15556",
    418       "relevance": "Chinchilla scaling laws for training compute — the training-side analogue to this paper's inference scaling laws."
    419     },
    420     {
    421       "title": "Scaling test-time compute optimally can be more effective than scaling LLM parameters",
    422       "authors": ["Charlie Victor Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    423       "year": 2025,
    424       "relevance": "Concurrent work on test-time compute scaling with complementary inference strategies."
    425     },
    426     {
    427       "title": "Self-consistency improves chain of thought reasoning in language models",
    428       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    429       "year": 2023,
    430       "relevance": "Foundational work on majority voting for LLM reasoning that this paper analyzes theoretically."
    431     },
    432     {
    433       "title": "Beyond chinchilla-optimal: Accounting for inference in language model scaling laws",
    434       "authors": ["Nikhil Sardana", "Jacob Portes", "Sasha Doubov", "Jonathan Frankle"],
    435       "year": 2024,
    436       "relevance": "Studies scaling laws accounting for inference cost but only considers fixed inference algorithms."
    437     },
    438     {
    439       "title": "Scaling laws for neural language models",
    440       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    441       "year": 2020,
    442       "arxiv_id": "2001.08361",
    443       "relevance": "Seminal training scaling laws paper whose FLOPs formula is used for inference compute estimation."
    444     },
    445     {
    446       "title": "Chain of thought prompting elicits reasoning in large language models",
    447       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    448       "year": 2022,
    449       "relevance": "Chain-of-thought prompting as an inference-time technique that increases compute for better reasoning."
    450     },
    451     {
    452       "title": "Let's verify step by step",
    453       "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yuri Burda"],
    454       "year": 2024,
    455       "relevance": "Process reward modeling for step-by-step verification — the PRM approach used in REBASE."
    456     },
    457     {
    458       "title": "Math-shepherd: Verify and reinforce LLMs step-by-step without human annotations",
    459       "authors": ["Peiyi Wang", "Lei Li", "Zhihong Shao"],
    460       "year": 2024,
    461       "relevance": "Synthetic process reward modeling dataset used to train the reward model in this paper's experiments."
    462     },
    463     {
    464       "title": "From decoding to meta-generation: Inference-time algorithms for large language models",
    465       "authors": ["Sean Welleck", "Amanda Bertsch", "Matthew Finlayson"],
    466       "year": 2024,
    467       "relevance": "Survey of inference-time algorithms for LLMs covering decoding strategies studied in this paper."
    468     },
    469     {
    470       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    471       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    472       "year": 2023,
    473       "relevance": "Tree search approach for LLM reasoning that combines search with LLMs."
    474     }
    475   ]
    476 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs