scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25314B)
      1 {
      2   "paper": {
      3     "title": "The Art of Scaling Test-Time Compute for Large Language Models",
      4     "authors": ["Aradhye Agarwal", "Ayan Sengupta", "Tanmoy Chakraborty"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.02008"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "The paper states 'The source code is available at https://github.com/Aradhye2002/art_of_tts' in a footnote on page 1."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available benchmarks: AIME 2024, AIME 2025-I, AIME 2025-II, and GPQA Diamond (Rein et al., 2023). All are standard public datasets."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No environment specification, requirements.txt, Dockerfile, or detailed dependency listing is provided. The paper mentions using API-based inference via deepinfra.com but provides no environment setup details."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "While source code is released, the paper itself does not contain step-by-step reproduction instructions, a README description of commands to run, or a 'Reproducing Results' section."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Figure 3 shows '±1 std. deviation' shaded regions for mean accuracy vs. average completion tokens. However, the main results tables (Table 4) report only point estimates without uncertainty."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper makes numerous comparative claims (e.g., 'beam search performance degrades', 'LFS is always suboptimal to MV') but provides no statistical significance tests. All comparisons are based on raw accuracy differences."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Results are reported as raw accuracy percentages (e.g., 83.3% vs. 73.3%) without formal effect size measures like Cohen's d, odds ratios, or systematic baseline-contextualized improvements."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The datasets are small (AIME has 30 problems per variant, GPQA Diamond has ~198 questions) but no justification or power analysis is provided for these sample sizes. With N=8 samples per configuration and 30 test questions, the statistical power for detecting meaningful differences is limited."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Figure 3 shows ±1 standard deviation bands for accuracy vs. token length. However, variance across experimental runs (e.g., different random seeds for sampling) is not reported in the main results tables."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "The paper compares four TTS strategies (beam search, majority voting, first finish search, last finish search) against each other and against simple decoding as a baseline, across eight models."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The baselines include recently proposed methods: FFS (Agarwal et al., 2025), short-m@k (Hassid et al., 2025), beam search, and majority voting. Models used include recent ones like GPT-OSS-120B, Qwen3, and DAPO-32B."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper systematically varies k and N parameters in FFS-k@N and LFS-k@N strategies (Figures 4 and 5), effectively ablating the contributions of trace selection (k) and sample count (N) to performance."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports accuracy, total token consumption, and sequential token consumption (Section 2.4). Both compute cost and accuracy are tracked as dual metrics."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "This is a benchmark evaluation of automated TTS strategies on math/science reasoning tasks with ground-truth answers. Human evaluation of model outputs is not relevant."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses standard held-out benchmarks (AIME 2024, AIME 2025-I, AIME 2025-II, GPQA Diamond) that are not used for any tuning or selection decisions."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down per model (8 models), per dataset (4 datasets), per strategy, and per problem difficulty (easy vs. hard in Table 1). Table 4 provides per-model, per-dataset breakdowns."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses beam search's inverse scaling (Section 3.1), cases where FFS degrades accuracy, and identifies that DAPO does not improve length bias over GRPO (Section 3.2). These are failure mode discussions."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Several negative results are reported: beam search shows inverse scaling (Section 3.1), LFS is always suboptimal to MV (Section 4), DAPO fails to mitigate length bias as claimed (Section 3.2), and FFS can substantially hurt accuracy for some models."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims three findings: (1) no single TTS strategy dominates — supported by Table 4, (2) reasoning models form short-horizon and long-horizon categories — supported by Table 1, (3) optimal TTS performance scales monotonically with compute — supported by Figures 4 and 5. All are supported in the results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The paper makes causal claims about post-training algorithms causing different reasoning horizons (e.g., 'GRPO introduces length bias', 'distinct post-training methods give rise to varying reasoning horizons') but the evidence is observational — correlating model training method with behavior without controlled experiments isolating the training algorithm as the causal factor."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title 'The Art of Scaling Test-Time Compute for Large Language Models' suggests general applicability, but the study tests only 8 open-source models on 4 reasoning datasets (math and science QA). The recipe in Table 2 is presented without bounding its applicability to these specific benchmarks and models."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper attributes short-horizon vs. long-horizon behavior to post-training algorithms (GRPO vs. GSPO) but does not discuss alternative explanations such as differences in training data, model architecture, scale, or other confounds that could explain the observed behavioral differences."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are referred to by marketing names (e.g., 'DeepSeek-R1', 'QwQ-32B', 'GPT-OSS-120B', 'Qwen3-32B') without specific version identifiers, snapshot dates, or API version strings. The paper mentions using deepinfra.com for inference but provides no model version or API endpoint details."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper describes prompting models to output answers within '\\boxed{}' format but does not provide the actual prompt text used for any model or dataset."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Table 3 (Appendix A) provides detailed hyperparameters for each model and dataset, including top-p, temperature, max token limits, beam width, number of samples, and answer-reserve settings."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. The paper evaluates standard decoding strategies (beam search, majority voting, FFS, LFS) applied to model outputs without any scaffolding infrastructure."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 2.3 describes the datasets, formatting (answers within '\\boxed{}'), and evaluation approach. Section 2.5 describes how problem difficulty is measured. The data pipeline from benchmark to evaluation is clear."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion (Section 6) summarizes findings but does not discuss limitations."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity are discussed anywhere in the paper."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show or what settings/models/tasks are excluded from the conclusions. The recipe in Table 2 is presented without boundary conditions on its applicability."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper claims to have generated 'over thirty billion tokens' across experiments, but raw model outputs, individual trace data, or per-question results are not released. Only aggregated accuracy values are reported."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 2 describes the data collection procedure: models are evaluated via API (deepinfra.com), with N=8 samples per configuration, using specified hyperparameters (Table 3), on four named datasets."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The study uses standard public benchmarks and LLM APIs."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is clear: sample N traces from each model via API → extract answers from \\boxed{} → apply strategy (BS/MV/FFS/LFS) → compute accuracy. Algorithms 1 and 2 formalize the FFS and LFS strategies."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding disclosure or acknowledgments section is present in the paper. The first author is affiliated with Microsoft Research, which likely provides funding, but this is not disclosed."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed: Microsoft Research and Indian Institute of Technology Delhi. These are stated on the first page."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The first author is from Microsoft Research. Microsoft has a financial interest in LLM performance and test-time compute strategies. No discussion of potential conflict is provided. The paper does not evaluate Microsoft models directly but the funder's independence is unclear given no funding disclosure."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "The paper evaluates eight LLMs on benchmarks but does not state any model's training data cutoff date. This is important because AIME 2024 problems could be in the training data of any model whose data was collected after the problems became public in early 2024."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of potential train/test overlap. AIME 2024 problems have been publicly available since early 2024, and some models may have been trained on data including these problems."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "GPQA Diamond was published in 2023 and AIME 2024 problems have been publicly available since early 2024. Many of the models tested (e.g., Qwen3, GPT-OSS-120B) were likely trained after these benchmarks were published. No contamination analysis is provided."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants. This is a benchmark evaluation study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants. This is a benchmark evaluation study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. This is a benchmark evaluation study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. This is a benchmark evaluation study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. This is a benchmark evaluation study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants. This is a benchmark evaluation study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants. This is a benchmark evaluation study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Token consumption is reported in two ways: total tokens and sequential tokens (Section 2.4). Table 4 reports per-model, per-strategy token costs. The paper notes generating 'over thirty billion tokens' total."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "While token counts are reported, the total computational budget in terms of API cost (dollars spent on deepinfra.com), GPU hours, or wall-clock time is not stated. The paper claims 'over thirty billion tokens' were generated but does not quantify the associated cost."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "No single TTS strategy universally dominates across all models and tasks.",
    286       "evidence": "Table 4 shows different strategies (BS, MV, FFS, LFS) winning on different model-dataset combinations. MV leads most often but not always. Section 3 and Appendix B provide detailed results.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Reasoning models can be categorized into short-horizon and long-horizon types based on trace-length-accuracy correlation.",
    291       "evidence": "Table 1 shows that short-horizon models (R1, DAPO-32B, QwQ-32B) have higher accuracy on shorter traces regardless of difficulty, while long-horizon models (GPT-OSS-120B, Qwen3-32B) prefer longer traces for hard problems.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Beam search shows inverse or no scaling for reasoning-focused datasets.",
    296       "evidence": "Section 3.1 and Figure 1 show that accuracy degrades or remains flat as beam width increases for short-horizon and non-reasoning models across AIME and GPQA.",
    297       "supported": "moderate"
    298     },
    299     {
    300       "claim": "LFS is always suboptimal to majority voting at the same compute cost.",
    301       "evidence": "Section 4 states 'the maximum performance for a given amount of total compute is always achieved when k is large (which implies k=N)' for LFS, and LFS-k@N with k=N is simply majority voting over all N samples (MV-N). Supported by Figure 5.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "DAPO induces length bias to the same extent as GRPO.",
    306       "evidence": "Table 1 shows DAPO-32B has similar short-trace preference patterns to R1 (GRPO-trained). Section 3.2 notes 'any improvements in mitigating length bias over GRPO may be limited under our evaluation.'",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "The optimal TTS strategy is independent of task difficulty.",
    311       "evidence": "Table 2 shows the recipe is the same regardless of 'High/Low' difficulty for each model family. However, this is derived from aggregating results over only 4 datasets with a binary easy/hard split.",
    312       "supported": "weak"
    313     },
    314     {
    315       "claim": "The choice of post-training strategy plays a key role in determining a reasoning model's effective horizon.",
    316       "evidence": "Section 1 correlates GRPO training with short-horizon behavior and GSPO with long-horizon behavior. However, the evidence is observational with only a few models per category and no controlled experiment.",
    317       "supported": "weak"
    318     }
    319   ],
    320   "methodology_tags": ["benchmark-eval"],
    321   "key_findings": "This large-scale empirical study of test-time scaling (TTS) across eight LLMs and four reasoning benchmarks finds that no single TTS strategy universally dominates. The paper introduces a taxonomy of reasoning models into 'short-horizon' (preferring shorter traces, typically GRPO-trained) and 'long-horizon' (benefiting from longer traces on hard problems) categories. Beam search shows inverse scaling for reasoning tasks, and last finish search (LFS) is consistently suboptimal compared to majority voting. The authors provide a practical decision matrix for selecting TTS strategies based on model family and compute budget.",
    322   "red_flags": [
    323     {
    324       "flag": "Small benchmark sizes without power analysis",
    325       "detail": "AIME datasets have only 30 problems each. With N=8 samples and binary accuracy on 30 questions, differences between strategies may not be statistically meaningful. No significance tests or power analysis are provided."
    326     },
    327     {
    328       "flag": "No statistical significance tests",
    329       "detail": "All comparative claims (beam search degrades, LFS is suboptimal to MV, DAPO has same length bias as GRPO) are based on comparing raw accuracy percentages without any statistical testing. With small test sets, observed differences could easily be due to chance."
    330     },
    331     {
    332       "flag": "Causal claims from observational evidence",
    333       "detail": "The paper attributes short-horizon vs. long-horizon behavior to post-training algorithms (GRPO vs. GSPO) based on a small observational sample of models. These models differ in many ways beyond training algorithm (architecture, data, scale, etc.), making the causal claim unjustified."
    334     },
    335     {
    336       "flag": "No contamination analysis",
    337       "detail": "Multiple benchmarks used (AIME 2024, GPQA Diamond) were publicly available before several tested models were trained. No analysis of potential benchmark contamination is provided."
    338     },
    339     {
    340       "flag": "No limitations section",
    341       "detail": "The paper contains no dedicated limitations or threats-to-validity section, despite making broad claims about optimal TTS strategies based on a narrow set of benchmarks and models."
    342     },
    343     {
    344       "flag": "Broad title and claims vs. narrow evaluation",
    345       "detail": "The paper is titled 'The Art of Scaling Test-Time Compute for Large Language Models' and presents a general 'recipe', but evaluation is limited to math competition problems and science QA — two specific reasoning domains that may not generalize to other LLM applications."
    346     }
    347   ],
    348   "cited_papers": [
    349     {
    350       "title": "Scaling LLM test-time compute optimally can be more effective than scaling model parameters",
    351       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    352       "year": 2024,
    353       "arxiv_id": "2408.03314",
    354       "relevance": "Foundational work on test-time compute scaling showing compute-optimal allocation outperforms model size scaling, directly relevant to LLM evaluation methodology."
    355     },
    356     {
    357       "title": "Self-consistency improves chain of thought reasoning in language models",
    358       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    359       "year": 2023,
    360       "arxiv_id": "2203.11171",
    361       "relevance": "Introduces self-consistency (majority voting over reasoning paths) as a core TTS strategy for improving LLM reasoning accuracy."
    362     },
    363     {
    364       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    365       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    366       "year": 2023,
    367       "arxiv_id": "2201.11903",
    368       "relevance": "Foundational chain-of-thought prompting paper, establishing the basis for sequential test-time scaling in LLMs."
    369     },
    370     {
    371       "title": "s1: Simple test-time scaling",
    372       "authors": ["Niklas Muennighoff", "Zitong Yang", "Weijia Shi"],
    373       "year": 2025,
    374       "arxiv_id": "2501.19393",
    375       "relevance": "Proposes fine-tuning models for self-correction at test-time, a key sequential scaling approach evaluated in the survey scope."
    376     },
    377     {
    378       "title": "Don't overthink it: preferring shorter thinking chains for improved LLM reasoning",
    379       "authors": ["Michael Hassid", "Gabriel Synnaeve", "Yossi Adi", "Roy Schwartz"],
    380       "year": 2025,
    381       "arxiv_id": "2505.17813",
    382       "relevance": "Proposes short-m@k strategy showing shorter reasoning traces often outperform longer ones, directly relevant to test-time compute efficiency."
    383     },
    384     {
    385       "title": "Inverse scaling in test-time compute",
    386       "authors": ["Aryo Pradipta Gema", "Alexander Hägele"],
    387       "year": 2025,
    388       "arxiv_id": "2507.14417",
    389       "relevance": "Demonstrates that more test-time compute can degrade accuracy, a critical finding for LLM scaling methodology."
    390     },
    391     {
    392       "title": "Reflexion: Language agents with verbal reinforcement learning",
    393       "authors": ["Noah Shinn", "Federico Cassano", "Edward Berman"],
    394       "year": 2023,
    395       "arxiv_id": "2303.11366",
    396       "relevance": "Introduces verbal self-reflection for sequential scaling in LLM agents, relevant to agentic AI evaluation."
    397     },
    398     {
    399       "title": "Tree of thoughts: Deliberate problem solving with large language models",
    400       "authors": ["Shunyu Yao", "Dian Yu", "Jeffrey Zhao"],
    401       "year": 2023,
    402       "relevance": "Proposes structured search-based test-time scaling for LLMs, a key method in the TTS landscape relevant to agentic reasoning."
    403     },
    404     {
    405       "title": "GPQA: A graduate-level Google-proof Q&A benchmark",
    406       "authors": ["David Rein", "Betty Li Hou", "Asa Cooper Stickland"],
    407       "year": 2023,
    408       "arxiv_id": "2311.12022",
    409       "relevance": "Key reasoning benchmark used to evaluate LLM capabilities, relevant to benchmark evaluation methodology."
    410     },
    411     {
    412       "title": "DAPO: An open-source LLM reinforcement learning system at scale",
    413       "authors": ["Qiying Yu", "Zheng Zhang", "Ruofei Zhu"],
    414       "year": 2025,
    415       "arxiv_id": "2503.14476",
    416       "relevance": "Open-source RL training system for LLMs, relevant to understanding how post-training affects model behavior and evaluation."
    417     },
    418     {
    419       "title": "A survey on test-time scaling in large language models: What, how, where, and how well?",
    420       "authors": ["Qiyuan Zhang", "Fuyuan Lyu", "Zexu Sun"],
    421       "year": 2025,
    422       "arxiv_id": "2503.24235",
    423       "relevance": "Comprehensive survey of TTS methods and their evaluation, directly relevant to systematic review of LLM scaling methodology."
    424     },
    425     {
    426       "title": "Let's verify step by step",
    427       "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yura Burda"],
    428       "year": 2023,
    429       "arxiv_id": "2305.20050",
    430       "relevance": "Process reward model for step-level verification in reasoning, a key component of test-time scaling evaluation methodology."
    431     }
    432   ]
    433 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs