scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19692B)
      1 {
      2   "paper": {
      3     "title": "Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters",
      4     "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2408.03314"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided in the paper."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses the publicly available MATH benchmark and PRM800k dataset from Lightman et al. Both are publicly available."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specifications, requirements files, or dependency lists are provided."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are included. The paper describes methodology but does not provide runnable scripts or a README."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates (accuracy percentages) without confidence intervals or error bars on figures."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes comparative claims (e.g., '4x more efficient', 'outperform a 14x larger model') without any statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper reports percentage improvements with baseline context, e.g., '+21.6%', '+16.7%' relative improvement in accuracy (Figure 1), and '4x less computation' efficiency gains."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The test set is 500 questions from MATH. No justification is given for why this size is sufficient for the claims made, and no power analysis is provided."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No variance, standard deviation, or spread measures are reported across runs. Results appear to be single-run numbers."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Multiple baselines are compared: best-of-N sampling, majority voting, ORM, and greedy pass@1 from a 14x larger model."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include contemporary approaches like PRM-based search, majority voting, and best-of-N weighted selection from recent work (Lightman et al. 2023, Li et al. 2023)."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Extensive ablations are provided: PRM aggregation strategies (Appendix E), PRM vs ORM (Appendix F), revision model with/without history (Appendix J), ReSTEM revision model (Appendix K), sequential-to-parallel ratio sweeps."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "Only accuracy on MATH is reported. No other metrics (e.g., calibration, cost-efficiency curves as a formal metric) are used."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a math reasoning benchmark evaluation where ground-truth answers are available; human evaluation is not relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses a 12k train / 500 test split from Lightman et al. and employs two-fold cross-validation on the test set for compute-optimal strategy selection (Section 3.2)."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are extensively broken down by difficulty level (5 bins) throughout Figures 3-9 and appendix figures."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses where test-time compute fails: hard questions (difficulty bins 4/5) show limited gains, and the ReSTEM revision model degrades with more sequential revisions (Appendix K). Example outputs show failure trajectories (Figures 17-23)."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Several negative results: ReSTEM optimization hurts revision performance (Appendix K), test-time compute is less effective than pretraining on hard questions (Section 7, Figure 9), PRM trained on PRM800k data was ineffective due to distribution shift (Section 5.1)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of '4x improvement over best-of-N' and 'outperform 14x larger model' are supported by Figures 1 and 9 with appropriate caveats about difficulty-dependence."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper makes causal claims via ablations (e.g., removing components, varying ratios) with controlled single-variable manipulation. The compute-optimal strategy is validated via cross-validation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims broad applicability ('Scaling LLM Test-Time Compute') but results are only on MATH with PaLM 2-S*. The paper acknowledges 'we believe that our findings likely transfer to similar models' (Section 4) without evidence."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses alternative explanations: distribution shift affecting PRM performance (Section 5.1), difficulty bins being computed with oracle vs predicted difficulty, and the possibility that PRM training acts as representation learning rather than as an inference-time tool (Appendix E)."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'PaLM 2-S* (Codey)' without specifying an exact version, snapshot date, or API version."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Appendix G describes the prompting approach (4-shot from PRM800k phase 1 training split) but does not provide the actual prompt text."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "PRM training hyperparameters are reported in Appendix D (lr 3e-5, batch size 128, dropout 0.05, Adam betas). Revision model hyperparameters in Appendix H (lr 1e-5, batch size 128). Beam search parameters detailed in Section 5.2."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used; this is a search/revision methodology study."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "PRM training data generation is documented in Appendix D (16 samples per question, 16 MC rollouts per step, filtering invalid answers). Revision model data generation in Appendix H (64 outputs per question, edit-distance-based selection)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 8 (Discussion and Future Work) discusses specific limitations: difficulty estimation cost, lack of combined PRM+revision experiments, limited gains on hard problems."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats discussed: difficulty estimation requires non-trivial compute itself (Section 3.2), oracle difficulty bins use ground-truth not available in practice, single model family tested (Section 4)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states what it did NOT test: no combination of PRM tree-search with revisions, no critique-and-revise methods, and acknowledges hard questions remain unsolved (Section 8)."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw experimental results (per-question predictions, generated solutions) are not released."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Data collection for PRM training and revision model training is described in detail in Appendices D and H, including sampling procedures and filtering criteria."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants; uses standard benchmark (MATH)."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from sampling model outputs to training verifiers/revision models to evaluation is documented across Sections 4-6 and Appendices D, H, J."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No explicit funding disclosure. The paper notes that the work was done during an internship at Google DeepMind, but no grants or funding sources are listed."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are clearly stated: UC Berkeley and Google DeepMind, with note that work was done during internship at Google DeepMind."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "Google DeepMind is both the employer/funder and provider of the PaLM 2 models being evaluated. Google has a financial interest in demonstrating effective use of inference compute."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff date is stated for PaLM 2-S*."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether MATH problems appeared in PaLM 2's pretraining data."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "MATH was published in 2021; PaLM 2 was trained after this. No contamination analysis is provided."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The paper discusses compute in terms of FLOPs ratios and generation budgets but does not report actual API costs, wall-clock time, or tokens consumed."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total GPU hours, training time, or hardware specifications are provided for the experiments."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Compute-optimal scaling of test-time compute can improve efficiency by more than 4x compared to best-of-N baseline.",
    286       "evidence": "Figures 1 and 5 show the compute-optimal strategy matching best-of-N performance with ~4x less compute in both the revision and PRM search settings.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "On easy and intermediate questions, test-time compute with a smaller model can outperform a 14x larger model in a FLOPs-matched evaluation.",
    291       "evidence": "Figure 9 shows that on difficulty bins 1-3 with R<<1 or R~=1, the compute-optimal strategy with PaLM 2-S* exceeds greedy performance of the 14x larger model.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The effectiveness of test-time compute scaling critically depends on prompt difficulty.",
    296       "evidence": "Figures 3-9 consistently show different optimal strategies for different difficulty bins, with search/revisions helping easy questions but providing diminishing returns on hard ones.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "On the hardest questions, scaling pretraining compute is more effective than scaling test-time compute.",
    301       "evidence": "Figure 9 shows on difficulty bins 4/5 with R>=1, the 14x larger model outperforms test-time compute scaling with the smaller model.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "PRM-based beam search outperforms best-of-N on easy questions but underperforms on hard questions.",
    306       "evidence": "Figure 3 shows beam search exceeds best-of-N on difficulty bins 1-3 but falls below on bins 4-5.",
    307       "supported": "strong"
    308     }
    309   ],
    310   "methodology_tags": ["benchmark-eval"],
    311   "key_findings": "Test-time compute scaling effectiveness depends critically on prompt difficulty relative to the base model's capabilities. A compute-optimal strategy that adaptively selects between revision and search methods based on question difficulty achieves 4x efficiency gains over best-of-N baselines. On easy-to-medium difficulty math problems, a smaller model with additional test-time compute can outperform a 14x larger model in FLOPs-matched comparisons, but on the hardest problems, additional pretraining remains more effective.",
    312   "red_flags": [
    313     {
    314       "flag": "No error bars or variance reporting",
    315       "detail": "All results are reported as point estimates without confidence intervals, error bars, or multi-run variance. For a paper making quantitative efficiency claims (4x improvement), this is a significant omission."
    316     },
    317     {
    318       "flag": "Single model family",
    319       "detail": "All experiments use PaLM 2-S* only. The paper claims findings 'likely transfer to similar models' without evidence. Generalization to other model families is unknown."
    320     },
    321     {
    322       "flag": "Proprietary model",
    323       "detail": "PaLM 2-S* is a proprietary Google model not publicly available, making independent reproduction impossible."
    324     },
    325     {
    326       "flag": "Benchmark contamination unaddressed",
    327       "detail": "MATH benchmark was published in 2021 and PaLM 2 was trained after this date. No contamination analysis is provided, which could affect absolute performance numbers and difficulty calibration."
    328     },
    329     {
    330       "flag": "Company evaluating own product",
    331       "detail": "Three of four authors are affiliated with Google DeepMind, and the paper evaluates Google's PaLM 2 model. No conflict of interest statement is provided."
    332     }
    333   ],
    334   "cited_papers": [
    335     {
    336       "title": "Let's verify step by step",
    337       "authors": ["H. Lightman", "V. Kosaraju", "Y. Burda", "H. Edwards", "B. Baker", "T. Lee", "J. Leike", "J. Schulman", "I. Sutskever", "K. Cobbe"],
    338       "year": 2023,
    339       "relevance": "Foundational work on process reward models (PRMs) for step-level verification of LLM reasoning, directly extended in this paper."
    340     },
    341     {
    342       "title": "Self-refine: Iterative refinement with self-feedback",
    343       "authors": ["A. Madaan"],
    344       "year": 2023,
    345       "relevance": "Key prior work on LLM self-revision that this paper builds upon and compares against."
    346     },
    347     {
    348       "title": "Training compute-optimal large language models",
    349       "authors": ["J. Hoffmann"],
    350       "year": 2022,
    351       "relevance": "Chinchilla scaling laws for pretraining compute; this paper extends the compute-optimality concept to inference time."
    352     },
    353     {
    354       "title": "Beyond chinchilla-optimal: Accounting for inference in language model scaling laws",
    355       "authors": ["N. Sardana", "J. Frankle"],
    356       "year": 2023,
    357       "arxiv_id": "2401.00448",
    358       "relevance": "Directly relevant work on trading off training and inference compute in LLM scaling laws."
    359     },
    360     {
    361       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    362       "authors": ["S. Yao"],
    363       "year": 2023,
    364       "relevance": "Tree-search approach for LLM reasoning that this paper's beam search methods relate to."
    365     },
    366     {
    367       "title": "Reflexion: Language agents with verbal reinforcement learning",
    368       "authors": ["N. Shinn"],
    369       "year": 2023,
    370       "relevance": "Agentic approach to LLM self-improvement through verbal feedback, related to revision mechanisms studied here."
    371     },
    372     {
    373       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    374       "authors": ["J. Wei"],
    375       "year": 2023,
    376       "relevance": "Foundational prompting technique for LLM reasoning that underpins the step-by-step format used in this paper."
    377     },
    378     {
    379       "title": "Large language models cannot self-correct reasoning yet",
    380       "authors": ["J. Huang"],
    381       "year": 2023,
    382       "relevance": "Negative result on LLM self-correction that motivates this paper's investigation of when test-time compute helps."
    383     },
    384     {
    385       "title": "Beyond human data: Scaling self-training for problem-solving with language models",
    386       "authors": ["A. Singh"],
    387       "year": 2024,
    388       "relevance": "ReSTEM method used in this paper's revision model experiments (Appendix K)."
    389     },
    390     {
    391       "title": "A critical evaluation of ai feedback for aligning large language models",
    392       "authors": ["A. Sharma"],
    393       "year": 2024,
    394       "arxiv_id": "2402.12366",
    395       "relevance": "Evaluates AI feedback effectiveness for LLM alignment, relevant to understanding self-improvement limitations."
    396     }
    397   ]
    398 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs