scan.json (23449B)
1 { 2 "paper": { 3 "title": "OpenR: An Open Source Framework for Advanced Reasoning with Large Language Models", 4 "authors": [ 5 "Jun Wang", 6 "Meng Fang", 7 "Ziyu Wan", 8 "Muning Wen", 9 "Jiachen Zhu", 10 "Anjie Liu", 11 "Ziqin Gong", 12 "Yan Song", 13 "Lei Chen", 14 "Lionel M. Ni", 15 "Linyi Yang", 16 "Ying Wen", 17 "Weinan Zhang" 18 ], 19 "year": 2024, 20 "venue": "arXiv", 21 "arxiv_id": "2410.09671" 22 }, 23 "scan_version": 2, 24 "active_modules": ["experimental_rigor", "data_leakage"], 25 "methodology_tags": ["benchmark-eval"], 26 "key_findings": "OpenR is an open-source framework integrating process reward models (PRMs), reinforcement learning, and test-time guided search to enhance LLM reasoning. On the MATH500 benchmark, their PRM (Math-psa) combined with best-of-N or beam search achieves ~82% accuracy, approximately 10% relative improvement over majority voting. Online RL training with PPO and PRM feedback shows steady reward improvement on individual problems but exhibits instability on the full dataset.", 27 "checklist": { 28 "artifacts": { 29 "code_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper states 'The OpenR framework, including code, models, and datasets, is accessible at https://openreasoner.github.io' in the abstract." 33 }, 34 "data_released": { 35 "applies": true, 36 "answer": true, 37 "justification": "The paper releases the MATH-APS dataset and states code, models, and datasets are publicly available. They also use public datasets (PRM800K, Math-Shepherd, MATH)." 38 }, 39 "environment_specified": { 40 "applies": true, 41 "answer": false, 42 "justification": "No environment specifications, requirements.txt, or dependency versions are mentioned in the paper." 43 }, 44 "reproduction_instructions": { 45 "applies": true, 46 "answer": false, 47 "justification": "No step-by-step reproduction instructions are provided in the paper. 
The paper points to the website but does not include specific commands or procedures to replicate experiments." 48 } 49 }, 50 "statistical_methodology": { 51 "confidence_intervals_or_error_bars": { 52 "applies": true, 53 "answer": false, 54 "justification": "Results in Figure 4 show point estimates of accuracy with no confidence intervals or error bars." 55 }, 56 "significance_tests": { 57 "applies": true, 58 "answer": false, 59 "justification": "The paper claims their PRM outperforms Math-Shepherd and that search methods outperform majority vote, but no significance tests are reported." 60 }, 61 "effect_sizes_reported": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper reports 'approximately 10%' relative improvement and shows accuracy comparisons (e.g., ~75% for majority vote vs ~82% for best methods) providing baseline context for effect sizes." 65 }, 66 "sample_size_justified": { 67 "applies": true, 68 "answer": false, 69 "justification": "They use MATH500 (500 problems) following Lightman et al. but do not justify why 500 is sufficient for the claims being made." 70 }, 71 "variance_reported": { 72 "applies": true, 73 "answer": false, 74 "justification": "No variance, standard deviation, or spread measures are reported. The PPO training curves in Figure 5 show single runs with no indication of variance across seeds or runs." 75 } 76 }, 77 "evaluation_design": { 78 "baselines_included": { 79 "applies": true, 80 "answer": true, 81 "justification": "Majority voting is used as a baseline, and Math-Shepherd PRM is compared against their Math-psa PRM in Figure 4b." 82 }, 83 "baselines_contemporary": { 84 "applies": true, 85 "answer": true, 86 "justification": "Math-Shepherd (Wang et al., 2024a) and Qwen2.5-Math models (Yang et al., 2024) are contemporary baselines from 2024." 
87 }, 88 "ablation_study": { 89 "applies": true, 90 "answer": true, 91 "justification": "The paper compares multiple search methods (best-of-N vs beam search), voting strategies (majority vote, PRM-last-vote, PRM-last-max), and different PRMs, which functions as component ablation." 92 }, 93 "multiple_metrics": { 94 "applies": true, 95 "answer": false, 96 "justification": "Only accuracy on MATH500 is reported as the evaluation metric. No secondary metrics are used." 97 }, 98 "human_evaluation": { 99 "applies": false, 100 "answer": false, 101 "justification": "Human evaluation is not relevant for automated math benchmark evaluation." 102 }, 103 "held_out_test_set": { 104 "applies": true, 105 "answer": true, 106 "justification": "MATH500 is a held-out subset of 500 randomly sampled problems from MATH, following Lightman et al. (2023), used specifically for evaluation." 107 }, 108 "per_category_breakdown": { 109 "applies": true, 110 "answer": false, 111 "justification": "The MATH dataset has difficulty levels and problem categories, but no per-category or per-difficulty breakdown is provided." 112 }, 113 "failure_cases_discussed": { 114 "applies": true, 115 "answer": true, 116 "justification": "Figures 6-12 provide case studies including incorrect answers (Cases 2, 3, 5) and show where reasoning goes wrong, with PRM score comparisons." 
117 }, 118 "negative_results_reported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Figure 5 shows PPO training on MATH500 exhibits 'more fluctuations in rewards' and the paper acknowledges 'the need for further improvements on the algorithm to enhance its adaptability for diverse problem sets.'" 122 } 123 }, 124 "claims_and_evidence": { 125 "abstract_claims_supported": { 126 "applies": true, 127 "answer": true, 128 "justification": "The abstract claims 'substantial gains' with test-time computation and RL through PRMs, which is supported by Figure 4 showing ~10% relative improvement over majority voting and Figure 5 showing RL training progress." 129 }, 130 "causal_claims_justified": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper makes causal claims (PRM improves reasoning, RL training improves accuracy) supported by controlled comparisons: same model with different search strategies (Figure 4a), same strategy with different PRMs (Figure 4b), before/after RL training (Figures 8-9)." 134 }, 135 "generalization_bounded": { 136 "applies": true, 137 "answer": false, 138 "justification": "The title claims 'Advanced Reasoning with Large Language Models' broadly, but experiments are limited to the MATH dataset with specific Qwen2.5 models. The abstract frames results as general LLM reasoning improvements." 139 }, 140 "alternative_explanations_discussed": { 141 "applies": true, 142 "answer": false, 143 "justification": "No alternative explanations are discussed for the observed improvements. For example, the PRM advantage could stem from dataset differences rather than training methodology." 144 }, 145 "proxy_outcome_distinction": { 146 "applies": true, 147 "answer": false, 148 "justification": "The paper measures accuracy on MATH500 but frames contributions as 'enhancing reasoning capabilities' broadly. No discussion of whether MATH500 accuracy is an adequate proxy for general reasoning ability."
149 } 150 }, 151 "setup_transparency": { 152 "model_versions_specified": { 153 "applies": true, 154 "answer": true, 155 "justification": "Specific model versions are stated: 'Qwen2.5-Math-7B-Instruct' for PRM training and 'Qwen2.5-1.5B-Math-Instruct' for policy learning (Sections 4.1 and 4.2)." 156 }, 157 "prompts_provided": { 158 "applies": false, 159 "answer": false, 160 "justification": "The paper does not use prompting as a primary methodology — it trains models via RL and supervised fine-tuning, and uses the models for generation and scoring." 161 }, 162 "hyperparameters_reported": { 163 "applies": true, 164 "answer": false, 165 "justification": "Key hyperparameters for RL training (learning rate, batch size, number of epochs, PPO clipping parameter, temperature for generation) are not reported. The data augmentation section mentions constants α, β, L, cpuct without specifying their values." 166 }, 167 "scaffolding_described": { 168 "applies": false, 169 "answer": false, 170 "justification": "No agentic scaffolding is used. The system is a framework for RL training and guided decoding, not an agent-based system." 171 }, 172 "data_preprocessing_documented": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 3.2 describes the MATH-APS data augmentation pipeline using OmegaPRM, including tree construction, rollout selection via PUCT, and binary search for error identification. Section 4.1 notes the dataset was 'reduced to approximately 150k pairs after cleaning and preprocessing.'" 176 } 177 }, 178 "limitations_and_scope": { 179 "limitations_section_present": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 6 'Limitations' explicitly discusses limited scale of experiments, model size constraints, and limited process supervision data." 
183 }, 184 "threats_to_validity_specific": { 185 "applies": true, 186 "answer": true, 187 "justification": "The limitations section identifies specific issues: 'restricted access to large-scale computing infrastructure,' evaluations on 'relatively smaller models,' and 'the scale and diversity of process supervision data remain limited.'" 188 }, 189 "scope_boundaries_stated": { 190 "applies": true, 191 "answer": false, 192 "justification": "While limitations are listed, the paper does not explicitly state what the results do NOT show or what settings/populations are excluded. No explicit boundary on what claims are not being made." 193 } 194 }, 195 "data_integrity": { 196 "raw_data_available": { 197 "applies": true, 198 "answer": true, 199 "justification": "The paper states datasets are publicly available at the project website, and they use publicly available datasets (PRM800K, Math-Shepherd, MATH)." 200 }, 201 "data_collection_described": { 202 "applies": true, 203 "answer": true, 204 "justification": "Section 3.2 describes the MATH-APS data collection procedure in detail, including the tree construction method, PUCT selection, Monte Carlo estimation, and binary search for error identification." 205 }, 206 "recruitment_methods_described": { 207 "applies": false, 208 "answer": false, 209 "justification": "No human participants; data comes from standard benchmarks and automated generation." 210 }, 211 "data_pipeline_documented": { 212 "applies": true, 213 "answer": true, 214 "justification": "The paper documents the pipeline from PRM800K + Math-Shepherd + MATH-APS generation through to PRM training (Section 3.3) and policy learning (Section 3.4). The data reduction from 500k to 150k pairs is noted." 215 } 216 }, 217 "conflicts_of_interest": { 218 "funding_disclosed": { 219 "applies": true, 220 "answer": false, 221 "justification": "No funding sources or acknowledgments section is present in the paper." 
222 }, 223 "affiliations_disclosed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Author affiliations are listed: UCL, University of Liverpool, Shanghai Jiao Tong University, HKUST (Guangzhou), and Westlake University." 227 }, 228 "funder_independent_of_outcome": { 229 "applies": true, 230 "answer": false, 231 "justification": "No funding information is disclosed, so independence cannot be assessed." 232 }, 233 "financial_interests_declared": { 234 "applies": true, 235 "answer": false, 236 "justification": "No competing interests or financial disclosure statement is present in the paper." 237 } 238 }, 239 "contamination": { 240 "training_cutoff_stated": { 241 "applies": true, 242 "answer": false, 243 "justification": "The paper uses Qwen2.5-Math models but does not state the training data cutoff date for these models." 244 }, 245 "train_test_overlap_discussed": { 246 "applies": true, 247 "answer": false, 248 "justification": "MATH is a well-known benchmark published in 2021. Qwen2.5 models trained in 2024 could have seen MATH problems. No discussion of potential overlap." 249 }, 250 "benchmark_contamination_addressed": { 251 "applies": true, 252 "answer": false, 253 "justification": "MATH was published in 2021, well before Qwen2.5's training. No contamination analysis or discussion is provided." 254 } 255 }, 256 "human_studies": { 257 "pre_registered": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "irb_or_ethics_approval": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "demographics_reported": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "inclusion_exclusion_criteria": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 
276 }, 277 "randomization_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "blinding_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 }, 287 "attrition_reported": { 288 "applies": false, 289 "answer": false, 290 "justification": "No human participants in this study." 291 } 292 }, 293 "cost_and_practicality": { 294 "inference_cost_reported": { 295 "applies": true, 296 "answer": false, 297 "justification": "The paper reports generation budget in terms of average tokens per question (Figure 4) but does not report wall-clock time, API costs, or actual compute cost for inference." 298 }, 299 "compute_budget_stated": { 300 "applies": true, 301 "answer": false, 302 "justification": "Figure 5 shows wall-clock hours for PPO training (~12 hours) but no GPU type, total GPU hours, or compute budget for PRM training or data generation is stated." 303 } 304 }, 305 "experimental_rigor": { 306 "seed_sensitivity_reported": { 307 "applies": true, 308 "answer": false, 309 "justification": "No mention of multiple random seeds. Results appear to be from single runs." 310 }, 311 "number_of_runs_stated": { 312 "applies": true, 313 "answer": false, 314 "justification": "The number of experimental runs is not stated for any experiment." 315 }, 316 "hyperparameter_search_budget": { 317 "applies": true, 318 "answer": false, 319 "justification": "No hyperparameter search budget is reported for PRM training or RL training." 320 }, 321 "best_config_selection_justified": { 322 "applies": true, 323 "answer": false, 324 "justification": "The paper compares search methods and PRMs but does not explain how the best configuration was selected or whether selection was done on validation data." 
325 }, 326 "multiple_comparison_correction": { 327 "applies": false, 328 "answer": false, 329 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 330 }, 331 "self_comparison_bias_addressed": { 332 "applies": true, 333 "answer": false, 334 "justification": "The authors compare their Math-psa PRM against Math-Shepherd without acknowledging potential bias from evaluating their own system." 335 }, 336 "compute_budget_vs_performance": { 337 "applies": true, 338 "answer": true, 339 "justification": "Figure 4 plots accuracy vs generation budget (tokens per question), directly showing performance as a function of compute." 340 }, 341 "benchmark_construct_validity": { 342 "applies": true, 343 "answer": false, 344 "justification": "MATH is used without discussing whether competition-level math problems are an adequate measure of 'advanced reasoning capabilities' as claimed." 345 }, 346 "scaffold_confound_addressed": { 347 "applies": false, 348 "answer": false, 349 "justification": "No scaffolding is involved; the comparisons are between search/voting strategies applied to the same models." 350 } 351 }, 352 "data_leakage": { 353 "temporal_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "MATH (2021) predates Qwen2.5 (2024) training. No discussion of whether MATH solutions appeared in Qwen2.5's training data." 357 }, 358 "feature_leakage_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether the evaluation setup leaks information through the PRM or search process." 362 }, 363 "non_independence_addressed": { 364 "applies": true, 365 "answer": false, 366 "justification": "The PRM is trained on PRM800K and Math-Shepherd data derived from MATH. No discussion of whether this creates non-independence with the MATH500 test set." 
367 }, 368 "leakage_detection_method": { 369 "applies": true, 370 "answer": false, 371 "justification": "No leakage detection or prevention methods are described." 372 } 373 } 374 }, 375 "claims": [ 376 { 377 "claim": "Process reward models combined with guided search improve test-time reasoning performance by approximately 10% (relative) on MATH500.", 378 "evidence": "Figure 4a shows accuracy increasing from ~75% (majority vote) to ~82% (best PRM-guided methods) on MATH500, i.e. about 7 percentage points (~9% relative).", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "Their Math-psa PRM achieves higher test accuracy than Math-Shepherd PRM across all computation budgets.", 383 "evidence": "Figure 4b shows Math-psa consistently outperforming Math-Shepherd in best-of-N comparisons.", 384 "supported": "weak" 385 }, 386 { 387 "claim": "PPO with PRM feedback steadily improves reasoning accuracy on individual math problems.", 388 "evidence": "Figure 5 (left) shows reward increasing and stabilizing after ~6 hours of training on a single problem.", 389 "supported": "weak" 390 }, 391 { 392 "claim": "OpenR is the first open-source framework to explore core methods of OpenAI's o1 model with reinforcement learning.", 393 "evidence": "Stated in abstract and Section 1. No systematic comparison with other open-source RL-for-reasoning frameworks.", 394 "supported": "unsupported" 395 } 396 ], 397 "red_flags": [ 398 { 399 "flag": "Single-run results without variance", 400 "detail": "All experiments appear to be single runs with no error bars, standard deviations, or confidence intervals. Results could vary substantially across seeds." 401 }, 402 { 403 "flag": "No contamination analysis", 404 "detail": "MATH (2021) is well within Qwen2.5's training window. The base model may already know many MATH solutions, inflating reported accuracy for all methods equally and potentially biasing PRM comparisons."
405 }, 406 { 407 "flag": "PRM trained on MATH-derived data, tested on MATH", 408 "detail": "The PRM is trained on PRM800K, Math-Shepherd, and MATH-APS — all derived from the MATH dataset. Testing on MATH500 (a subset of MATH) creates potential data leakage. No analysis of overlap between PRM training data and MATH500 test problems." 409 }, 410 { 411 "flag": "Overclaiming scope", 412 "detail": "The paper claims to provide a framework for 'Advanced Reasoning with Large Language Models' but only evaluates on a single math benchmark (MATH500) with specific Qwen2.5 models." 413 }, 414 { 415 "flag": "Labeled 'Technical Report: Work in progress'", 416 "detail": "The paper self-describes as a work-in-progress technical report, yet makes broad claims about being 'the first open-source framework' for o1-style reasoning." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "Scaling LLM Test-Time Compute Optimally Can Be More Effective Than Scaling Model Parameters", 422 "authors": ["Charlie Snell", "Jaehoon Lee", "Kelvin Xu", "Aviral Kumar"], 423 "year": 2024, 424 "arxiv_id": "2408.03314", 425 "relevance": "Core reference for test-time compute scaling, directly replicated in OpenR's experiments." 426 }, 427 { 428 "title": "Let's Verify Step by Step", 429 "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yura Burda"], 430 "year": 2023, 431 "arxiv_id": "2305.20050", 432 "relevance": "Foundational work on process reward models and PRM800K dataset used in this framework." 433 }, 434 { 435 "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models", 436 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 437 "year": 2022, 438 "arxiv_id": "2201.11903", 439 "relevance": "Foundational chain-of-thought prompting work that motivated o1-style reasoning approaches." 
440 }, 441 { 442 "title": "Improve Mathematical Reasoning in Language Models by Automated Process Supervision", 443 "authors": ["Liangchen Luo", "Yinxiao Liu", "Rosanne Liu"], 444 "year": 2024, 445 "arxiv_id": "2406.06592", 446 "relevance": "OmegaPRM method used for automated process supervision data collection in MATH-APS." 447 }, 448 { 449 "title": "Math-Shepherd: Verify and Reinforce LLMs Step-by-Step without Human Annotations", 450 "authors": ["Peiyi Wang", "Lei Li", "Zhihong Shao"], 451 "year": 2024, 452 "relevance": "Baseline PRM compared against in experiments; its training data is also used for Math-psa." 453 }, 454 { 455 "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models", 456 "authors": ["Zhihong Shao", "Peiyi Wang", "Qihao Zhu"], 457 "year": 2024, 458 "arxiv_id": "2402.03300", 459 "relevance": "GRPO algorithm used in OpenR's RL training pipeline." 460 }, 461 { 462 "title": "Training Language Models to Follow Instructions with Human Feedback", 463 "authors": ["Long Ouyang", "Jeff Wu", "Xu Jiang"], 464 "year": 2022, 465 "relevance": "Foundational RLHF work that motivates RL-based training of reasoning models." 466 }, 467 { 468 "title": "Generative Verifiers: Reward Modeling as Next-Token Prediction", 469 "authors": ["Lunjun Zhang", "Arian Hosseini", "Hritik Bansal"], 470 "year": 2024, 471 "arxiv_id": "2408.15240", 472 "relevance": "Generative reward model approach discussed as alternative to scalar PRMs." 473 }, 474 { 475 "title": "AlphaZero-like Tree-Search Can Guide Large Language Model Decoding and Training", 476 "authors": ["Xidong Feng", "Ziyu Wan", "Muning Wen"], 477 "year": 2024, 478 "relevance": "MCTS-based decoding for LLMs, directly related to OpenR's search methods." 
479 }, 480 { 481 "title": "Scaling Laws for Neural Language Models", 482 "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"], 483 "year": 2020, 484 "arxiv_id": "2001.08361", 485 "relevance": "Foundational scaling laws work that OpenR extends from training-time to inference-time compute." 486 } 487 ] 488 }