scan.json (29733B)
1 { 2 "paper": { 3 "title": "T1: Advancing Language Model Reasoning through Reinforcement Learning and Inference Scaling", 4 "authors": [ 5 "Zhenyu Hou", 6 "Xin Lv", 7 "Rui Lu", 8 "Jiajie Zhang", 9 "Yujiang Li", 10 "Zijun Yao", 11 "Juanzi Li", 12 "Jie Tang", 13 "Yuxiao Dong" 14 ], 15 "year": 2025, 16 "venue": "Proceedings of the 42nd International Conference on Machine Learning (ICML 2025), PMLR 267", 17 "arxiv_id": "2501.11651" 18 }, 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper states: 'The model weights and the data for SFT and RL training are publicly available at https://github.com/THUDM/T1.' A working GitHub URL is provided." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "Training data sourced from publicly available datasets (MATH-train, NuminaMath) and the paper states that the SFT and RL training data are publicly available at the GitHub repository." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper mentions using SGLANG as the inference engine (with a reference) and provides learning rates and sampling parameters, but does not provide a requirements.txt, Dockerfile, or equivalent environment specification with library versions." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper provides training hyperparameters in the appendix (Section A.1) but does not include step-by-step reproduction instructions or README-level commands that a researcher could follow to replicate the main experiments." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "Results in Table 1 are reported as single accuracy values (e.g., '92.4') with no confidence intervals, error bars, or other uncertainty quantification. 
AIME is evaluated 32 times and averaged, but no variance measure is reported." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper makes comparative claims (e.g., T1 outperforms QwQ-32B-Preview on MATH500, AIME2024, and Omni-MATH-500) without any statistical significance tests. All comparisons are based on point estimates only." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "The paper reports absolute percentage improvements with baseline context (e.g., 'over a 10% improvement on MATH-500 and over a 20% improvement on AIME over its T1-SFT version'), providing enough context to assess magnitude of effects." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "AIME uses only 30 problems (evaluated 32 times for stability). No power analysis or justification for why these sample sizes are sufficient for the claims made is provided. Omni-MATH-500 uses 500 examples described as 'efficient yet comprehensive' without statistical justification." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "AIME is evaluated 32 times and averaged for stability, but no standard deviation or confidence interval is reported for these repeated evaluations. All other benchmarks appear to report single-run results only." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 1 includes multiple baselines: GPT-4o, Claude-3.5-sonnet, Llama-3.3-70B-Instruct, Qwen2.5-Math-7B-Instruct, o1-preview, QwQ-32B-preview, and the base models (GLM-4-9B-chat, Qwen2.5-14B-Instruct, Qwen2.5-32B-Instruct) plus T1-SFT ablations." 
74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Baselines include contemporary systems such as QwQ-32B-Preview, o1-preview, Claude-3.5-Sonnet, and GPT-4o, all of which represent the state of the art at the time of writing in late 2024/early 2025." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper includes ablation studies on: number of sampled responses K (Table 2, Figure 3), sampling temperature (Table 2), penalty rewards (Table 3), and comparison of T1-SFT vs T1 (full RL). Individual components are isolated and their contribution measured." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Results are reported across four benchmarks: MATH500, AIME2024, Omni-MATH-500, and GPQA. The paper also tracks response length, KL divergence, and reward as secondary metrics during training." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "This is a benchmark evaluation paper for mathematical reasoning. Human evaluation of model outputs is not applicable; correctness is determined objectively by matching against ground-truth answers." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "Evaluation is performed on standard test sets (AIME2024, MATH500 test split, Omni-MATH-500, GPQA). The training data uses MATH-train and NuminaMath; test splits are separate. Appendix A.1 states: 'We split around 12k for the SFT stage and the others for RL training.'" 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down across four different benchmark types (competition math AIME, olympiad math Omni-MATH, school math MATH500, and science GPQA), and Figure 7 shows inference scaling behavior separately for each dataset." 
104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": false, 108 "justification": "The paper presents a case study of a successfully solved AIME problem but does not discuss failure cases or error analysis of where T1 performs poorly. No examples of incorrect reasoning or failure modes are shown." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper reports that training with min-p sampling 'can make the training process more prone to collapse' (Table 2, temperature=1.2 with min-p=0.05 shows degradation from 86.4 to 78.8 on MATH500), and that low temperatures (<=1.0) often cause training collapse." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims that T1 'achieves superior performance on challenging math reasoning benchmarks' and 'exhibits inference scaling behavior.' Table 1 shows T1 outperforming all baselines including QwQ-32B-Preview, and Figures 6-7 demonstrate inference scaling behavior. Claims are supported." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper makes causal claims via ablation studies (e.g., 'sampling more responses encourages exploration') and uses controlled single-variable manipulation (varying only K in Table 2 / Figure 3, varying only temperature in Table 2, varying only penalty in Table 3). This design is adequate for causal inference within the system." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper tests only on mathematical reasoning benchmarks (MATH, AIME, Omni-MATH) and GPQA science problems but the abstract and conclusion make broader claims about 'enhancing large language models' reasoning capabilities' without adequately bounding the scope. 
The paper acknowledges OOD generalization to GPQA but does not bound the broader generalization claims." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper does not discuss alternative explanations for the observed performance improvements. For example, it does not consider whether the improvement could be due to training on overlapping data, or whether the reward signal design alone (vs. the exploration strategies) accounts for the gains." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper specifies exact model versions: GLM-4-9B, Qwen2.5-14B, Qwen2.5-32B, Qwen2.5-32B-Instruct, QwQ-32B-Preview, Llama-3.3-70B-Instruct. These are specific enough to identify the models used." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper describes in natural language how the LLM is prompted to 'examine each attempt' and 'incorporate refined attempts into a single output' but does not provide the actual prompt text used. Section 2.2.1 describes the procedure without giving the actual prompts sent to the model." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix A.1 reports: SFT learning rate 1e-5, RL learning rate 1.5e-6, KL coefficient 2e-4, K=64 responses, temperature=1.2, top-p=0.95, max generation length 10,240 or 16,384, EMA decay, entropy bonus coefficient. This is sufficient to understand the training setup." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "T1 is a training methodology (SFT + RL), not an agentic scaffolding system. There is no external scaffolding; the model generates reasoning chains end-to-end within a single forward pass (with truncation for analysis)." 
158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix A.1 describes data preprocessing: answer extraction from NuminaMath using LLM, data filtering to retain only instances with pass rate in (0, 0.3), generating 16 responses per question, resulting in 30k RL training examples. The filtering criteria and resulting dataset size are stated." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper has an 'Impact Statement' section but no dedicated Limitations or Threats to Validity section. The Impact Statement is generic and does not discuss methodological limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No specific threats to validity are discussed. The Impact Statement is generic ('ethical aspects and societal implications of our work align with those commonly associated with advancing the field of machine learning') and the conclusion does not discuss threats or limitations." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. While it notes that T1 is 'primarily optimized for math-related tasks,' it does not explicitly bound the scope or enumerate what claims are not being made." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "Model weights and training data are released at https://github.com/THUDM/T1. The evaluation benchmarks (AIME2024, MATH, GPQA, Omni-MATH) are publicly available, enabling independent re-evaluation." 
187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Appendix A.1 describes data collection: training data from MATH-train and NuminaMath public datasets, with the answer extraction and filtering pipeline described (16 responses generated per question, retaining pass rate in (0, 0.3), yielding 30k examples)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants are involved. Training and evaluation data come from publicly available mathematical datasets." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The data pipeline is documented in Appendix A.1: raw data (NuminaMath + MATH-train) → answer extraction via LLM → data filtering (pass rate criterion) → 12k SFT split + 30k RL split. Transformation steps and resulting sizes are stated." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Acknowledgements section discloses funding from NSFC 62495063, Tsinghua University-Siemens Joint Research Center (JCIIOT), and the New Cornerstone Science Foundation (XPLORER PRIZE)." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are listed: Tsinghua University (Hou, Lu, Zhang, Li, Yao, Li, Tang, Dong) and ZhipuAI (Lv). Affiliations are disclosed in the paper header." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "Funders (NSFC, Tsinghua-Siemens center, New Cornerstone Science Foundation) do not have a direct financial stake in the LLM reasoning performance results. One author is from ZhipuAI (maker of GLM models), which is a minor conflict since GLM-4-9B is one of three base models used." 
219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present in the paper. One co-author (Xin Lv) is affiliated with ZhipuAI, whose GLM-4-9B model is evaluated in the paper, but this potential conflict is not explicitly declared." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper does not state the training data cutoffs for the comparison models (GPT-4o, Claude-3.5-Sonnet, QwQ-32B-Preview, o1-preview). The T1 model itself trains on MATH-train and NuminaMath, but the pre-training cutoffs of the base models (Qwen2.5, GLM-4) are not specified in the paper." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The paper does not discuss potential overlap between training data (MATH-train, NuminaMath) and evaluation benchmarks (MATH500 is a subset of MATH-test, AIME2024 is from 2024). No analysis of whether test examples could have been in training data is provided." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "MATH benchmark (Hendrycks et al., 2021) was published in 2021; the base models (Qwen2.5, GLM-4) could have seen it during pre-training. AIME2024 problems are newer but NuminaMath may contain similar problems. No contamination analysis is performed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. Pre-registration is not applicable." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. IRB approval is not applicable." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study. 
Demographics are not applicable." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study. Inclusion/exclusion criteria are not applicable." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study. Randomization of participants is not applicable." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study. Blinding is not applicable." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study. Attrition is not applicable." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "The paper reports token counts for inference scaling analysis (up to ~6000 thinking tokens) but does not report API costs, wall-clock time, or cost per example for the T1 system." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "The paper does not report GPU hours, hardware used, or total compute budget for training T1. Training details (learning rates, steps) are given but not the compute required." 290 } 291 } 292 }, 293 "claims": [ 294 { 295 "claim": "T1 (Qwen2.5-32B) achieves superior performance on MATH500 (92.4%), AIME2024 (50.6%), and Omni-MATH-500 (49.6%), outperforming QwQ-32B-Preview (90.6%, 50.0%, 46.6%) and o1-preview (85.5%, 44.6%, /) across benchmarks.", 296 "evidence": "Table 1 provides accuracy numbers for all compared models across MATH500, AIME2024, Omni-MATH-500, and GPQA benchmarks. 
T1 (Qwen2.5-32B) achieves the highest scores on MATH500 and Omni-MATH-500.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "T1 exhibits inference scaling behavior: longer generation (more thinking tokens) directly leads to better performance without requiring external verification.", 301 "evidence": "Figure 6 shows accuracy increasing monotonically from 24% to 50% on AIME as average thinking tokens increase from ~500 to ~6000. Figure 7 shows similar trends across training steps and max generation lengths.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "Sampling more responses (higher K) during RL training improves performance: K=64 yields substantially better results than K=4, with a >6% improvement.", 306 "evidence": "Figure 3 and Figure 4 show that K=64 outperforms K=16 and K=4 on MATH500 and other benchmarks. Table text states 'sampling only 4 responses shows little to no benefit (around 3%)'.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "High sampling temperature (1.2) during RL training is beneficial, and training with temperature ≤1.0 often collapses.", 311 "evidence": "Table 2 shows performance peaks at temperature=1.2 (86.4 MATH500, 29.3 AIME, 38.6 Omni-MATH-500) compared to 0.9 (78.2, 19.1, 32.0) and 1.3 (84.6, 24.3, 36.4). 
Training instability at lower temperatures is reported in text.", 312 "supported": "strong" 313 }, 314 { 315 "claim": "Penalty for unexpected patterns (repetition, overlong text, garbage text) prevents training collapse and is necessary for stable training.", 316 "evidence": "Table 3 shows that without penalty, overlong ratio reaches 16.3% by step 120 and training diverges (accuracy drops from 79.2 to 76.4 and training stops), while penalized training maintains stability.", 317 "supported": "strong" 318 }, 319 { 320 "claim": "RL training (not just SFT) is necessary to unlock inference scaling: SFT-only models show minimal gains with increased inference budgets.", 321 "evidence": "Figure 7 shows that T1-SFT and 30% RL step models show flat accuracy curves as max_gen_length increases, while 100% RL trained models show steep improvement from ~1024 to ~16384 tokens.", 322 "supported": "strong" 323 } 324 ], 325 "methodology_tags": [ 326 "benchmark-eval" 327 ], 328 "key_findings": "T1 is a reinforcement learning training pipeline for LLMs that promotes exploration through oversampling (K=64), high-temperature sampling, entropy bonuses, and on-policy KL normalization, combined with penalties for degenerate outputs. Applied to Qwen2.5-32B, T1 achieves state-of-the-art results on math reasoning benchmarks including MATH500 (92.4%) and AIME2024 (50.6%), outperforming QwQ-32B-Preview. A key finding is that T1-trained models exhibit inference scaling behavior — performance improves monotonically as more thinking tokens are generated — and that this scaling property requires sufficient RL training (SFT alone does not enable it). The paper introduces a truncation-based method to measure inference scaling without requiring external verifiers.", 329 "red_flags": [ 330 { 331 "flag": "No statistical uncertainty quantification", 332 "detail": "All results in Table 1 are point estimates with no confidence intervals, error bars, or standard deviations. 
While AIME is evaluated 32 times for stability, the spread is not reported. Given AIME has only 30 problems, even the average of 32 runs has substantial variance that is not quantified." 333 }, 334 { 335 "flag": "No contamination analysis", 336 "detail": "MATH benchmark (2021) could be in the training data of the base models (Qwen2.5, GLM-4). The paper uses MATH-train for T1 training but evaluates on MATH-test/MATH500, which is methodologically sound for T1 itself, but the comparison baselines (QwQ-32B-Preview, o1-preview) may have seen all MATH problems during pre-training. This is not discussed." 337 }, 338 { 339 "flag": "No limitations section", 340 "detail": "The paper has no dedicated limitations section. The Impact Statement is purely generic ('ethical aspects and societal implications align with those commonly associated with advancing the field of machine learning') and does not discuss methodological limitations, scope boundaries, or threats to validity." 341 }, 342 { 343 "flag": "Generalization claims outrun evidence", 344 "detail": "The paper claims T1 'enhances reasoning capabilities' broadly, but evaluation is limited to mathematical reasoning (MATH, AIME, Omni-MATH) and GPQA. Generalization to code generation, commonsense reasoning, or other reasoning domains is not tested." 345 }, 346 { 347 "flag": "Compute budget not reported", 348 "detail": "Training 32B-parameter models with K=64 sampled responses per prompt using RL is computationally expensive, but the paper reports no GPU hours, hardware used, or total compute cost, making it difficult to assess the practical feasibility of reproducing or extending this work." 349 }, 350 { 351 "flag": "Affiliation conflict not declared", 352 "detail": "Co-author Xin Lv is affiliated with ZhipuAI, whose GLM-4-9B model is one of three models used as a base in T1 experiments. While the affiliation is disclosed, no competing interests statement is made regarding this connection." 
353 } 354 ], 355 "cited_papers": [ 356 { 357 "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective than Scaling Model Parameters", 358 "authors": [ 359 "Charlie Snell", 360 "Jaehoon Lee", 361 "Kelvin Xu", 362 "Aviral Kumar" 363 ], 364 "year": 2024, 365 "arxiv_id": "2408.03314", 366 "relevance": "Key reference for test-time scaling of LLMs; directly relevant to inference scaling claims made in T1." 367 }, 368 { 369 "title": "Large Language Monkeys: Scaling Inference Compute with Repeated Sampling", 370 "authors": [ 371 "Bradley Brown", 372 "Jordan Juravsky", 373 "Ryan Ehrlich", 374 "Ronald Clark", 375 "Quoc V. Le", 376 "Christopher Ré", 377 "Azalia Mirhoseini" 378 ], 379 "year": 2024, 380 "arxiv_id": "2407.21787", 381 "relevance": "Baseline approach for inference scaling via repeated sampling; contrasted with T1's single long-response approach." 382 }, 383 { 384 "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", 385 "authors": [ 386 "Zhihong Shao", 387 "Peiyi Wang", 388 "Qihao Zhu", 389 "Runxin Xu", 390 "Junxiao Song", 391 "Mingchuan Zhang", 392 "Y. K. Li", 393 "Y. Wu", 394 "Daya Guo" 395 ], 396 "year": 2024, 397 "arxiv_id": "2402.03300", 398 "relevance": "Important prior work on RL for math reasoning in LLMs; baseline for group relative policy optimization (GRPO) that T1 builds upon." 399 }, 400 { 401 "title": "Does RLHF Scale? Exploring the Impacts from Data, Model, and Method", 402 "authors": [ 403 "Zhenyu Hou", 404 "P. Du", 405 "Y. Niu", 406 "Z. Du", 407 "A. Zeng", 408 "X. Liu", 409 "M. Huang", 410 "H. Wang", 411 "J. Tang", 412 "Y. Dong" 413 ], 414 "year": 2024, 415 "arxiv_id": "2412.06000", 416 "relevance": "Prior work by the same group on RLHF scaling, directly motivating T1's exploration of RL scaling for reasoning." 417 }, 418 { 419 "title": "STaR: Self-Taught Reasoner", 420 "authors": [ 421 "Eric Zelikman", 422 "Yuhuai Wu", 423 "Noah D. 
Goodman" 424 ], 425 "year": 2022, 426 "arxiv_id": "2203.14465", 427 "relevance": "Foundational work on self-taught reasoning that T1's CoT initialization approach extends." 428 }, 429 { 430 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 431 "authors": [ 432 "Jason Wei", 433 "Xuezhi Wang", 434 "Dale Schuurmans", 435 "Maarten Bosma", 436 "Brian Ichter", 437 "Fei Xia", 438 "Ed H. Chi", 439 "Quoc V. Le", 440 "Denny Zhou" 441 ], 442 "year": 2022, 443 "relevance": "Foundational CoT paper; central to T1's reasoning paradigm." 444 }, 445 { 446 "title": "Training Language Models to Follow Instructions with Human Feedback", 447 "authors": [ 448 "Long Ouyang", 449 "Jeff Wu", 450 "Xu Jiang", 451 "Diogo Almeida", 452 "Carroll L. Wainwright", 453 "Pamela Mishkin", 454 "Chong Zhang", 455 "Sandhini Agarwal", 456 "Katarina Slama", 457 "Alex Ray", 458 "et al." 459 ], 460 "year": 2022, 461 "relevance": "Original RLHF paper; T1's RL training framework is built on RLHF foundations." 462 }, 463 { 464 "title": "Qwen2.5 Technical Report", 465 "authors": [ 466 "A. Yang", 467 "B. Yang", 468 "B. Zhang", 469 "et al." 470 ], 471 "year": 2024, 472 "arxiv_id": "2412.15115", 473 "relevance": "Base model used in T1 experiments; Qwen2.5-14B and Qwen2.5-32B are the primary models evaluated." 474 }, 475 { 476 "title": "Omni-MATH: A Universal Olympiad Level Mathematic Benchmark for Large Language Models", 477 "authors": [ 478 "B. Gao", 479 "F. Song", 480 "Z. Yang", 481 "Z. Cai", 482 "Y. Miao", 483 "Q. Dong", 484 "L. Li", 485 "C. Ma", 486 "L. Chen", 487 "R. Xu", 488 "et al." 489 ], 490 "year": 2024, 491 "arxiv_id": "2410.07985", 492 "relevance": "Primary evaluation benchmark used in T1; Olympiad-level math problems for assessing LLM reasoning." 
493 }, 494 { 495 "title": "Measuring Mathematical Problem Solving With the MATH Dataset", 496 "authors": [ 497 "Dan Hendrycks", 498 "Collin Burns", 499 "Saurav Kadavath", 500 "Akul Arora", 501 "Steven Basart", 502 "Eric Tang", 503 "Dawn Song", 504 "Jacob Steinhardt" 505 ], 506 "year": 2021, 507 "relevance": "Core evaluation benchmark (MATH500) and source of training data; critical for assessing benchmark contamination risks." 508 }, 509 { 510 "title": "GPQA: A Graduate-Level Google-Proof Q&A Benchmark", 511 "authors": [ 512 "David Rein", 513 "Betty Li Hou", 514 "Asa Cooper Stickland", 515 "Jackson Petty", 516 "Richard Yuanzhe Pang", 517 "Julien Dirani", 518 "Julian Michael", 519 "Samuel R. Bowman" 520 ], 521 "year": 2023, 522 "arxiv_id": "2311.12022", 523 "relevance": "Out-of-domain evaluation benchmark testing generalization of T1's math-trained reasoning to graduate-level science." 524 }, 525 { 526 "title": "Math-Shepherd: Verify and Reinforce LLMs Step-by-Step without Human Annotations", 527 "authors": [ 528 "Peiyi Wang", 529 "Lei Li", 530 "Zhihong Shao", 531 "Runxin Xu", 532 "Damai Dai", 533 "Yifei Li", 534 "Deli Chen", 535 "Yu Wu", 536 "Zhifang Sui" 537 ], 538 "year": 2024, 539 "relevance": "Related work on process reward models for RL training of math reasoning LLMs." 540 }, 541 { 542 "title": "VinePPO: Unlocking RL Potential for LLM Reasoning through Refined Credit Assignment", 543 "authors": [ 544 "Amirhossein Kazemnejad", 545 "Milad Aghajohari", 546 "Eva Portelance", 547 "Alessandro Sordoni", 548 "Siva Reddy", 549 "Aaron Courville", 550 "Nicolas Le Roux" 551 ], 552 "year": 2024, 553 "arxiv_id": "2410.01679", 554 "relevance": "Related RL approach for LLM reasoning with credit assignment; comparison baseline context for T1." 555 }, 556 { 557 "title": "Training Language Models to Self-Correct via Reinforcement Learning", 558 "authors": [ 559 "Aviral Kumar", 560 "Vincent Zhuang", 561 "Rishabh Agarwal", 562 "Yi Su", 563 "John D. 
Co-Reyes", 564 "Avi Singh", 565 "Kate Baumli", 566 "Shariq Iqbal", 567 "Colton Bishop", 568 "Rebecca Roelofs", 569 "et al." 570 ], 571 "year": 2024, 572 "arxiv_id": "2409.12917", 573 "relevance": "Related work on RL for self-correction in LLMs; closely related to T1's verification and trial-and-error approach." 574 } 575 ] 576 }