scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26319B)
      1 {
      2   "paper": {
      3     "title": "Can 1B LLM Surpass 405B LLM? Rethinking Compute-Optimal Test-Time Scaling",
      4     "authors": [
      5       "Runze Liu",
      6       "Junqi Gao",
      7       "Jian Zhao",
      8       "Kaiyan Zhang",
      9       "Xiu Li",
     10       "Biqing Qi",
     11       "Wanli Ouyang",
     12       "Bowen Zhou"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv",
     16     "arxiv_id": "2502.06703"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper references an open-source codebase OpenR (https://github.com/openreasoner/openr) used for experiments and provides a website at https://ryanliu112.github.io/compute-optimal-tts. The OpenR framework is explicitly cited as their codebase."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available datasets: MATH-500 (Lightman et al., 2024) and AIME24 (AI-MO, 2024, available on HuggingFace). All PRMs used are open-source and publicly available with HuggingFace links provided."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper mentions using OpenR but does not specify library versions or system requirements."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. The paper mentions using OpenR and specifies some hyperparameters (temperature, token limits, beam width) but does not provide a README or scripts to replicate experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No confidence intervals or error bars are reported for any results. All tables (Tables 3-6) and figures report only point estimates without uncertainty measures."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No statistical significance tests are used despite numerous comparative claims (e.g., '3B LLM surpasses 405B LLM'). All comparisons are based solely on comparing point estimates."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Table 5 reports percentage performance gains (e.g., '154.6% improvement over CoT') and efficiency gains (e.g., '>256.0x'). Table 4 reports FLOPS comparisons with specific magnitudes. The paper provides baseline context throughout (e.g., 'from 26.0% CoT to 66.2% compute-optimal TTS')."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification is provided for why MATH-500 (500 problems) and AIME24 (30 problems) are sufficient. AIME24 has only 30 problems, which is very small for the granular difficulty-level analyses performed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported across any experimental runs. It is unclear whether results are from single runs or averaged across multiple runs."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Multiple baselines are included: CoT (zero-shot), majority voting, and comparisons against GPT-4o, o1-preview, o1-mini, o1, DeepSeek-R1, QwQ-32B-Preview, and various long-CoT methods (rStar-Math, Eurus-2, SimpleRL, Satori)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include state-of-the-art models as of early 2025: DeepSeek-R1, o1, GPT-4o, QwQ-32B-Preview, and recent long-CoT methods like SimpleRL and Satori. These are contemporary and competitive."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The paper systematically ablates across multiple dimensions: different policy model sizes (0.5B to 72B), different PRMs (7 different PRMs), different TTS methods (BoN, beam search, DVTS), different scoring/voting methods (Table 2), and different difficulty levels (Sections 4.2-4.4)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses accuracy on MATH-500 and AIME24, Pass@k, FLOPS comparisons (Table 4), efficiency gains, and performance gains as distinct metrics."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is a benchmark evaluation paper on mathematical reasoning. Human evaluation of model outputs is not relevant since correctness is determined by matching ground-truth answers."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "MATH-500 is a standard test subset of MATH, and AIME24 is a separate competition dataset. The compute-optimal strategy is evaluated on these test sets, not on data used for tuning."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by difficulty level (easy, medium, hard) in Figures 8-9, by policy model size (0.5B to 72B) in Figures 7 and 10-11, and by PRM in Figures 4-5. Per-model and per-task breakdowns are provided in Tables 3 and 5."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Appendix C provides detailed failure case analysis of PRMs, identifying four categories: Over-Criticism (Figure 13), Error Neglect (Figures 14-15), Error Localization Bias (Figure 16), and Scoring Bias (Figures 17-18). Section 4.4 discusses PRM biases."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports several negative results: TTS performance drops on harder tasks like AIME24 (Table 6), TTS is less effective than distillation from strong reasoning models (Section 5.3), PRMs fail to generalize across policy models (Section 4.2), and TTS gains diminish with larger policy models (Table 5, Qwen2.5-32B shows only 0.8x efficiency gain)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract's claim that 'a 1B LLM can exceed a 405B LLM on MATH-500' is supported by Table 3 (Llama-3.2-1B-Inst. at N=512 achieves 72.2% vs. Llama-3.1-405B-Inst. at 71.4%). The claims about 0.5B outperforming GPT-4o, 3B surpassing 405B, and 7B beating o1 are likewise supported by Table 3."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper's causal claims are mostly about component contributions (e.g., 'the reward significantly affects the generation process'). These are supported by controlled ablations varying one factor at a time: varying PRM while holding policy model fixed, varying policy model size while holding PRM fixed, etc. The ablation design is adequate for these claims."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title 'Can 1B LLM Surpass 405B LLM?' implies general capability, but results are only on mathematical reasoning tasks (MATH-500 and AIME24). The limitations section acknowledges this ('Extending TTS to more tasks such as coding and chemistry tasks') but the title and abstract do not bound the claim to math."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper does not discuss alternative explanations for its results. For example, it does not consider whether the strong results of small models with TTS could be an artifact of PRM overfitting to MATH-style problems, or whether the compute-optimal strategy is overfit to these specific benchmarks. No threats-to-validity or confound discussion is provided."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model names with sizes are provided: Llama-3.2-1B-Instruct, Llama-3.2-3B-Instruct, Llama-3.1-8B-Instruct, Qwen2.5-0.5B-Instruct through Qwen2.5-72B-Instruct, DeepSeek-R1-Distill-Qwen-1.5B, DeepSeek-R1-Distill-Qwen-7B. PRM versions are also specified (e.g., Math-Shepherd-PRM-7B, RLHFlow-PRM-Mistral-8B). However, for proprietary models (GPT-4o, o1, o1-mini, o1-preview), no snapshot dates or API versions are given."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Appendix A (Tables 7 and 8) provides the full system prompts used for Llama 3 series and Qwen2.5 series models. These are complete prompt texts, not just descriptions."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Section 4.1 specifies: compute budgets {4, 16, 64, 256}, beam width 4 for search methods, temperature 0.0 for CoT and 0.7 for other methods, max new tokens 8192 for CoT/BoN, 2048 per step and 8192 total for search methods."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 2.2 describes the three TTS methods (Best-of-N, Beam Search, DVTS) in detail with mathematical formulations. Section 4.1 describes scoring methods (PRM-Min, PRM-Last, PRM-Avg) and voting methods (Majority Vote, PRM-Max, PRM-Vote). Figure 2 provides a visual comparison of the methods."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.2 documents how problem difficulty is categorized (easy: 50-100%, medium: 10-50%, hard: 0-10% Pass@1 accuracy). Section 4.1 describes the step division format ('\\n\\n'). The footnote explains that Qwen2.5-32B-Instruct is used to extract answers from Llama-3.2-1B outputs that don't contain \\boxed."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "A 'Limitations' subsection is present at the end of Section 7 (Conclusion & Discussion), identifying two specific limitations: results are limited to mathematical tasks, and more effective compute-optimal methods could be explored."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The limitations section mentions only two generic directions ('extending to more tasks' and 'more effective methods'). No specific threats to the validity of the current results are discussed, such as the small size of AIME24 (30 problems), potential PRM overfitting to math domains, or the fact that compute-optimal strategy selection uses oracle labels."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The limitations section mentions extending to other tasks but does not explicitly state what the results do NOT show. There is no statement like 'these results do not demonstrate that TTS is effective for non-mathematical reasoning' or 'the compute-optimal strategy was selected with oracle difficulty labels and may not transfer to real-world settings.'"
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Raw experimental outputs (model generations, PRM scores, per-problem results) are not made available. Only aggregated accuracy numbers are reported in tables and figures."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The data collection is well-described: MATH-500 is a specific 500-problem subset from MATH following prior work, AIME24 is from a HuggingFace dataset. PRM training data sources are documented in Section 4.1 and Table 1. The experimental framework (OpenR) is cited."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants are involved. Data sources are standard public benchmarks (MATH-500, AIME24)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: problems are taken from standard benchmarks, policy models generate responses using specified TTS methods, PRMs score steps/responses, voting/scoring methods select final answers, and answers are extracted (with special handling for Llama-3.2-1B via Qwen2.5-32B as noted in footnote 5)."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper. The authors are from Shanghai AI Laboratory, Tsinghua University, Harbin Institute of Technology, and BUPT, but no grants or funding sources are mentioned."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Shanghai AI Laboratory, Tsinghua University, Harbin Institute of Technology, and BUPT. The paper notes 'Work done during an internship at Shanghai AI Laboratory.'"
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed. The authors are affiliated with Shanghai AI Laboratory, which develops AI systems, creating a potential interest in demonstrating that smaller models can compete with larger ones. Without funding disclosure, this question cannot be answered affirmatively."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the models used (Llama, Qwen, DeepSeek, GPT-4o, o1). This is relevant because MATH has been publicly available since 2021, and MATH-500 is a subset of it."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of potential train/test overlap. MATH-500 is a subset of MATH (published 2021), and all models tested were trained after 2021. The paper does not address whether these models may have seen MATH problems during training."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "MATH was published in 2021 and MATH-500 is derived from it. All models used were trained after this date and could have been trained on MATH problems. AIME24 problems may have appeared online before some models' training cutoffs. None of this is addressed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants are involved in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Table 4 reports inference FLOPS for key comparisons. Table 5 reports efficiency gains (e.g., '>256.0x' more efficient than majority voting). Token counts are discussed in Section 4.4 and shown in the case studies (e.g., Figure 12 shows 890 vs 2419 tokens)."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Table 4 reports pre-training FLOPS, inference FLOPS, and total FLOPS for key model comparisons. The compute budgets N={4, 16, 64, 256} are stated in Section 4.1. The paper explicitly compares total computational cost between small and large models."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "A 1B LLM can surpass a 405B LLM on MATH-500 using compute-optimal test-time scaling",
    295       "evidence": "Table 3 shows Llama-3.2-1B-Instruct with N=512 achieves 72.2% on MATH-500 vs. Llama-3.1-405B-Instruct at 71.4%. Section 5.1.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "A 3B LLM outperforms a 405B LLM on both MATH-500 and AIME24",
    300       "evidence": "Table 3 shows Llama-3.2-3B-Instruct achieves 75.6% on MATH-500 (vs 71.4%) and 30.0% on AIME24 (vs 23.3%). Section 5.1.",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "DeepSeek-R1-Distill-Qwen-7B with compute-optimal TTS beats o1 and DeepSeek-R1",
    305       "evidence": "Table 3 shows DeepSeek-R1-Distill-Qwen-7B achieves 95.2% on MATH-500 (vs o1's 94.8%) and 83.3% on AIME24 (vs o1's 79.2% and DeepSeek-R1's 79.8%). Section 5.1.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "Compute-optimal TTS can be 256x more efficient than majority voting",
    310       "evidence": "Table 5 shows efficiency gains ranging from 0.8x to >256.0x depending on policy model size. Section 5.2.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "The compute-optimal TTS strategy depends on the choice of policy model, PRM, and problem difficulty",
    315       "evidence": "Figures 4-5 show different optimal methods across PRMs, Figure 7 shows dependence on policy model size, and Figures 8-9 show dependence on difficulty level. Sections 4.2-4.3.",
    316       "supported": "strong"
    317     },
    318     {
    319       "claim": "TTS is more effective than direct RL/SFT methods but less effective than distillation from strong reasoning models",
    320       "evidence": "Table 6 shows TTS with Qwen2.5-7B outperforms rStar-Math, Eurus-2, SimpleRL, and Satori but falls significantly behind DeepSeek-R1-Distill-Qwen-7B on AIME24 (36.7% vs 63.3%). Section 5.3.",
    321       "supported": "strong"
    322     },
    323     {
    324       "claim": "Small policy models with TTS can reduce total FLOPS by 100x-1000x compared to larger models",
    325       "evidence": "Table 4 shows Llama-3.2-3B total FLOPS is 1.62e23 vs Llama-3.1-405B at 3.65e25 (225x reduction), and DeepSeek-R1-Distill-7B is 7.56e23 vs DeepSeek-R1 at 5.96e25 (79x reduction, which falls below the claimed 100x lower bound). Section 5.1.",
    326       "supported": "moderate"
    327     }
    328   ],
    329   "methodology_tags": [
    330     "benchmark-eval"
    331   ],
    332   "key_findings": "The paper demonstrates that compute-optimal test-time scaling (TTS) strategy depends critically on the policy model, process reward model (PRM), and problem difficulty level. Using reward-aware compute-optimal TTS, small language models can outperform much larger ones on mathematical reasoning benchmarks: a 3B model surpasses a 405B model on MATH-500 and AIME24, and a 7B model beats o1 and DeepSeek-R1. However, TTS gains diminish for larger policy models, and TTS is less effective than distillation from strong reasoning models on hard tasks. The paper also identifies systematic PRM biases including over-criticism, error neglect, error localization bias, and scoring bias related to step length.",
    333   "red_flags": [
    334     {
    335       "flag": "No uncertainty quantification",
    336       "detail": "All results are reported as single point estimates without confidence intervals, error bars, or variance across runs. Given the stochastic nature of sampling-based TTS methods (temperature=0.7), the absence of uncertainty measures makes it impossible to assess whether observed differences are statistically meaningful."
    337     },
    338     {
    339       "flag": "Very small test set for AIME24",
    340       "detail": "AIME24 contains only 30 problems. Performance differences on this dataset (e.g., 30.0% vs 23.3%, which is a difference of exactly 2 problems: 9 vs 7 out of 30) may not be statistically significant. The paper makes strong claims based on these small differences."
    341     },
    342     {
    343       "flag": "Benchmark contamination risk unaddressed",
    344       "detail": "MATH-500 is derived from MATH (2021). All tested models were trained after 2021 and may have seen MATH problems during pre-training. The paper does not discuss this contamination risk, which could inflate absolute performance numbers and bias comparisons between models trained on different data."
    345     },
    346     {
    347       "flag": "Oracle difficulty labels in compute-optimal strategy",
    348       "detail": "The compute-optimal TTS strategy selects different methods for different difficulty levels, but difficulty is measured using Pass@1 accuracy which requires generating many samples. This oracle information would not be available in a real deployment, limiting the practical applicability of the 'compute-optimal' strategy."
    349     },
    350     {
    351       "flag": "Overly broad title and claims relative to scope",
    352       "detail": "The title 'Can 1B LLM Surpass 405B LLM?' implies general capability, but all experiments are on mathematical reasoning only (MATH-500 and AIME24). The limitations section briefly notes this but the abstract and title do not qualify the claims to math."
    353     }
    354   ],
    355   "cited_papers": [
    356     {
    357       "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters",
    358       "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"],
    359       "year": 2024,
    360       "arxiv_id": "2408.03314",
    361       "relevance": "Foundational work on compute-optimal test-time scaling that this paper directly extends and compares against."
    362     },
    363     {
    364       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    365       "authors": ["DeepSeek-AI"],
    366       "year": 2025,
    367       "arxiv_id": "2501.12948",
    368       "relevance": "Key baseline reasoning model; the paper compares TTS against DeepSeek-R1's distilled models and CoT performance."
    369     },
    370     {
    371       "title": "Scaling Test-Time Compute with Open Models",
    372       "authors": ["Edward Beeching", "Lewis Tunstall", "Sasha Rush"],
    373       "year": 2024,
    374       "relevance": "Introduces Diverse Verifier Tree Search (DVTS), one of the three TTS methods evaluated in this paper."
    375     },
    376     {
    377       "title": "Inference Scaling Laws: An Empirical Analysis of Compute-Optimal Inference for Problem-Solving with Language Models",
    378       "authors": ["Yangzhen Wu", "Zhiqing Sun", "Shanda Li", "Sean Welleck", "Yiming Yang"],
    379       "year": 2024,
    380       "arxiv_id": "2408.00724",
    381       "relevance": "Studies inference scaling laws for LLM problem-solving, directly related to the test-time compute scaling investigated here."
    382     },
    383     {
    384       "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling",
    385       "authors": ["Bradley Brown", "Jordan Juravsky", "Ryan Ehrlich"],
    386       "year": 2024,
    387       "arxiv_id": "2407.21787",
    388       "relevance": "Explores scaling inference compute through repeated sampling with domain verifiers, a closely related approach to Best-of-N TTS."
    389     },
    390     {
    391       "title": "Let's Verify Step by Step",
    392       "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yuri Burda"],
    393       "year": 2024,
    394       "relevance": "Introduces process reward models and PRM800K dataset used for training several PRMs evaluated in this paper."
    395     },
    396     {
    397       "title": "The Lessons of Developing Process Reward Models in Mathematical Reasoning",
    398       "authors": ["Zhenru Zhang", "Chujie Zheng", "Yangzhen Wu"],
    399       "year": 2025,
    400       "arxiv_id": "2501.07301",
    401       "relevance": "Provides guidelines for PRM development and releases the Qwen2.5-Math-PRM models evaluated as the strongest open-source PRMs in this paper."
    402     },
    403     {
    404       "title": "ProcessBench: Identifying Process Errors in Mathematical Reasoning",
    405       "authors": ["Chujie Zheng", "Zhenru Zhang", "Beichen Zhang"],
    406       "year": 2024,
    407       "arxiv_id": "2412.06559",
    408       "relevance": "Benchmark for evaluating PRMs' process supervision abilities, used to correlate PRM quality with TTS performance in Figure 6."
    409     },
    410     {
    411       "title": "rStar-Math: Small LLMs Can Master Math Reasoning with Self-Evolved Deep Thinking",
    412       "authors": ["Xinyu Guan", "Li Lyna Zhang", "Yifei Liu"],
    413       "year": 2025,
    414       "arxiv_id": "2501.04519",
    415       "relevance": "Long-CoT baseline that uses MCTS for reasoning data generation, directly compared with TTS in Table 6."
    416     },
    417     {
    418       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    419       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    420       "year": 2023,
    421       "relevance": "Introduces majority voting for LLM reasoning, which serves as a key baseline throughout this paper's experiments."
    422     },
    423     {
    424       "title": "Kimi k1.5: Scaling Reinforcement Learning with LLMs",
    425       "authors": ["Kimi Team"],
    426       "year": 2025,
    427       "arxiv_id": "2501.12599",
    428       "relevance": "Recent work on scaling reinforcement learning for LLM reasoning, representing the state of the art in internal TTS approaches."
    429     }
    430   ]
    431 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs