scan.json (27151B)
1 { 2 "paper": { 3 "title": "Boosting LLM Reasoning via Spontaneous Self-Correction", 4 "authors": [ 5 "Xutong Zhao", 6 "Tengyu Xu", 7 "Xuewei Wang", 8 "Zhengxing Chen", 9 "Di Jin", 10 "Liang Tan", 11 "Yen-Ting", 12 "Zishun Yu", 13 "Zhuokai Zhao", 14 "Yun He", 15 "Sinong Wang", 16 "Han Fang", 17 "Sarath Chandar", 18 "Chen Zhu" 19 ], 20 "year": 2025, 21 "venue": "arXiv", 22 "arxiv_id": "2506.06923" 23 }, 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. There is no mention of code release." 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "The paper uses publicly available benchmarks: MATH500 (Lightman et al., 2023), AMC23, and AIME24, all with HuggingFace dataset links provided. The training data is NuminaMath (LI et al., 2024), also publicly available." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper mentions '32 NVIDIA H100 GPUs' (Appendix B) but provides no requirements.txt, Dockerfile, library versions, or framework versions (e.g., PyTorch version). Not enough detail to recreate the environment." 40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No README, reproduction scripts, or step-by-step instructions for reproducing the experiments are provided. The paper describes the method conceptually but does not provide runnable reproduction artifacts." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "All results are reported as point estimates (e.g., '61.0' on MATH500). No confidence intervals, error bars, or uncertainty measures are provided for any result in any table." 
52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper claims 'SPOC significantly improves performance' and 'SPOC consistently outperforms the base models' but provides no statistical significance tests (no p-values, t-tests, bootstrap tests, etc.) to support these comparative claims." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "The paper reports absolute percentage improvements with baseline context, e.g., 'gains of 8.8% and 11.6% on MATH500' with base model accuracies clearly listed in Table 1 (52.2% -> 61.0% for 8B). This provides enough context to understand the magnitude of improvement." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "No justification is given for using the specific benchmark sizes (MATH500=500, AMC23=40, AIME24=30 problems). The AMC23 and AIME24 benchmarks are particularly small (40 and 30 problems), where a single correct answer changes the score by 2.5% and 3.3%, yet this is not discussed." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "Results appear to be from single runs using greedy decoding. No standard deviations, variance across seeds, or repeated experiment results are reported. The only exception is the 'avg@4' sampling results for DeepSeek-R1 reference numbers, which are from the original paper, not the authors' experiments." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 1 includes multiple baselines: base models, SFT, RAFT, PairSFT, Self-Refine (with and without oracle), S2R variants (Ma et al., 2025), SCoRe (Kumar et al., 2024), and Self-rewarding IFT (Xiong et al., 2025)." 
79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": true, 83 "justification": "Baselines include concurrent/recent works: S2R (Ma et al., 2025), SCoRe (Kumar et al., 2024), Self-rewarding IFT (Xiong et al., 2025). These are contemporary and competitive approaches to the same problem of LLM self-correction." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 4 presents an ablation study on reward configurations (Corr, Last, All) using Llama-3.1-8B-Instruct. The paper also examines iterative training (Table 3) and per-turn analysis (Tables 2, 6, 7, 8, 9). The RAFT vs RLOO comparison also serves as an ablation on the policy optimizer." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper reports both final answer accuracy (pass@1) and verification accuracy (Verif.Acc.) as complementary metrics. Tables 2 and 5 also report per-turn correction rates and verifier confusion matrices (TP, TN, FP, FN rates)." 94 }, 95 "human_evaluation": { 96 "applies": false, 97 "answer": false, 98 "justification": "Human evaluation is not relevant here. The tasks are mathematical reasoning benchmarks with definitive correct answers that can be verified by rule-based checkers. Automated evaluation is the appropriate method." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": true, 103 "justification": "Training uses NuminaMath dataset, while evaluation is on separate benchmarks: MATH500, AMC23, and AIME24. These are distinct from the training data. The paper also notes excluding Orca-Math and synthetic data subsets from training." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Results are broken down per benchmark (MATH500, AMC23, AIME24) which represent different difficulty levels. Per-turn breakdowns are provided in Tables 2, 6-9. Verifier confusion matrices in Table 5 provide per-model, per-task diagnostics." 
109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": true, 113 "justification": "Appendix F provides a detailed example response showing an initial incorrect solution (concluding a=1) followed by self-correction to the correct answer (a=41). Table 5 reports false positive and false negative rates of the verifier. The paper discusses cases where correction degrades performance (delta_c->i in Tables 2, 6, 7)." 114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper reports several negative results: Self-Refine without oracle degrades performance significantly (e.g., 52.2% -> 39.4% for 8B on MATH500). SPOC shows marginal improvement on Llama3.3-70B. Table 9 shows performance decreasing at turn 3 (61.2% -> 61.0%). RAFT underperforms PairSFT for DeepSeek-R1-Distill models." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": true, 125 "justification": "The abstract claims SPOC 'boosts the accuracy of Llama-3.1-8B and 70B Instruct models, achieving gains of 8.8% and 11.6% on MATH500, 10.0% and 20.0% on AMC23, and 3.3% and 6.7% on AIME24.' These numbers match Table 1 exactly." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper makes causal claims (e.g., SPOC 'improves' performance, reward settings 'yield' better results). The ablation study (Table 4) with controlled single-variable manipulation of reward configurations, and the comparisons holding the base model constant while varying only the training method, provide adequate support for these causal claims." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "The title 'Boosting LLM Reasoning' is broader than what is tested. The paper evaluates only mathematical reasoning on three benchmarks. 
The conclusion states 'future work could explore extending SPOC to broader reasoning domains beyond mathematics,' but the title and claims about 'LLM reasoning' in the abstract do not adequately bound the generalization to math-only evaluation." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper does not discuss alternative explanations for the observed improvements. For example, it does not consider whether improvements stem from simply having more tokens/compute at inference time rather than the specific SPOC architecture, or whether the SFT training data quality drives gains rather than the RL stage. No threats-to-validity section or discussion of confounds is present." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Specific model versions are provided: 'Llama-3.1-8B-Instruct', 'Llama-3.1-70B-Instruct', 'Llama-3.3-70B-Instruct', 'DeepSeek-R1-Distill-Llama-8B', 'DeepSeek-R1-Distill-Llama-70B'. These are identifiable models with specific HuggingFace model cards. References to specific papers (Dubey et al., 2024; Guo et al., 2025) further identify versions." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix E provides full prompt templates for: COT query (Figures 4, 5), instance reflection query (Figure 6), SPOC evaluation query (Figure 7), and Self-Refine baselines (Figures 8, 9). Templates include placeholders with clear variable names ({{ Question }})." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Appendix B reports: AdamW optimizer, beta1=0.9, beta2=0.95, weight_decay=0.1, learning_rate=1.0e-6, global_batch_size=2048, 256 training steps. Evaluation uses greedy decoding (temperature=0). Max generation lengths specified (6,144 and 32,768 tokens)." 
158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "The multi-turn generation mechanism is described in detail in Section 3.1 (multi-turn formalism, EFG model), Section 3.2 (PairSFT data construction), and Section 3.3 (online RL). The use of special tokens (<|eom_id|>, <|eot_id|>) for turn management is explained. Figure 1 illustrates the open-loop vs closed-loop paradigms." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 4.1 describes data preprocessing: training on NuminaMath dataset, excluding Orca-Math and synthetic data subsets 'since their correctness are not human-validated.' Section 3.2 and Algorithm 2 detail the PairSFT data construction procedure, including how correct/incorrect solutions are paired and verification messages generated." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": false, 174 "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion briefly mentions future work directions ('extending SPOC to partial solutions' and 'broader reasoning domains') but does not substantively discuss limitations of the current work." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "No specific threats to validity are discussed. Generic future work suggestions are mentioned but no concrete threats such as small benchmark sizes, single-run results, or domain-specific limitations are addressed." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "No explicit scope boundaries are stated. The paper does not clarify that results apply only to mathematical reasoning, or that the benchmarks are relatively small (especially AMC23 with 40 and AIME24 with 30 problems). 
The title suggests broad 'LLM Reasoning' applicability without stating what the results do NOT show." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "No raw experimental data (model outputs, per-problem results, training logs) is made available. Only aggregate accuracy numbers are reported in tables." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3.2 and Algorithm 2 describe in detail how the PairSFT training data is constructed from model rollouts. Section 4.1 describes the training data source (NuminaMath) and the evaluation benchmarks with references and links." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants are involved. The study uses standard public benchmarks (MATH500, AMC23, AIME24) and publicly available training data (NuminaMath)." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The data pipeline is documented: base model rollout -> K solutions per question -> binary correctness labeling -> pairing correct/incorrect solutions -> generating verification messages -> filtering valid verifications -> SFT training -> RL training. Algorithm 2 formalizes this pipeline." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding sources or acknowledgments section is present in the paper. The work is noted as 'Work done at Meta' but no specific funding disclosure is made." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: MetaAI, Mila - Quebec AI Institute, Polytechnique Montreal. 
The footnote states 'Work done at Meta.'" 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "The work was done at Meta, and the primary base models used are Meta's Llama models. Meta has a financial interest in demonstrating that their Llama models can be improved through training techniques like SPOC. The funder is not independent of the outcome." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present in the paper. There is no disclosure regarding patents, equity, or other financial interests related to the findings." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "The paper does not state the training data cutoff dates for the Llama-3.1, Llama-3.3, or DeepSeek-R1-Distill models used. This is relevant because the benchmarks (MATH, AMC23, AIME24) could have been in the pre-training data." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of potential overlap between the pre-training data of the base models and the evaluation benchmarks. MATH problems have been publicly available since 2021, and the models trained after that could have seen them." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "MATH500 problems are from the MATH dataset published in 2021. AMC and AIME competition problems are widely available online. The paper does not discuss whether these problems appeared in the pre-training data of the Llama or DeepSeek models. This is a significant omission given that contamination could inflate results." 
246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in this study." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in this study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants are involved in this study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants are involved in this study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants are involved in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No inference cost, latency, or tokens consumed per example are reported. The method involves multiple solution attempts and verifications per problem, which multiplies inference cost, but this cost is not quantified." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": true, 294 "justification": "Appendix B states: 'We conduct all training runs on 32 NVIDIA H100 GPUs' with 256 training steps and batch size 2048. While not a complete cost statement (no GPU-hours or wall-clock time), the hardware and training scale are specified." 
295 } 296 } 297 }, 298 "claims": [ 299 { 300 "claim": "SPOC boosts the accuracy of Llama-3.1-8B Instruct by 8.8% on MATH500 (52.2% to 61.0%), 10.0% on AMC23 (22.5% to 32.5%), and 3.3% on AIME24 (3.3% to 6.7%).", 301 "evidence": "Table 1 shows exact numbers matching these claims for Llama-3.1-8B-Instruct base model vs SPOC.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "SPOC boosts the accuracy of Llama-3.1-70B Instruct by 11.6% on MATH500 (65.8% to 77.4%), 20.0% on AMC23 (32.5% to 52.5%), and 6.7% on AIME24 (16.7% to 23.3%).", 306 "evidence": "Table 1 shows exact numbers matching these claims for Llama-3.1-70B-Instruct base model vs SPOC.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "SPOC-RLOO achieves 94.6%/92.5%/76.7% on MATH500/AMC23/AIME24 with DeepSeek-R1-Distill-Llama-70B, substantially outperforming the base model (82.8%/72.5%/60.0%).", 311 "evidence": "Table 1 shows these exact numbers for DeepSeek-R1-Distill-Llama-70B base model vs SPOC-RLOO.", 312 "supported": "strong" 313 }, 314 { 315 "claim": "The Corr reward setting yields the best overall performance compared to Last and All reward variants.", 316 "evidence": "Table 4 shows Corr achieves 61.0/32.5/6.7 vs Last's 59.8/27.5/10.0 and All's 58.4/35.0/6.7 on MATH500/AMC23/AIME24. Corr is best on MATH500 (the largest benchmark), but All scores higher on AMC23 (35.0 vs 32.5) and Last scores higher on AIME24 (10.0 vs 6.7), so Corr is not the top configuration on every benchmark.", 317 "supported": "moderate" 318 }, 319 { 320 "claim": "Data balancing in PairSFT leads to higher verification accuracy and more stable RL training.", 321 "evidence": "Section 3.2 states this as an observation but provides no quantitative comparison with vs without data balancing. No ablation on this specific design choice is presented.", 322 "supported": "weak" 323 }, 324 { 325 "claim": "Iterative training (iter2) provides consistent improvement, with larger gains on challenging benchmarks.", 326 "evidence": "Table 3 shows iter2 results. For Llama-3.1-70B, iter2 improves over iter1 by 10% on AMC23 and 6.7% on AIME24. 
However, for the 8B model, AMC23 stays the same or decreases from iter1 to iter2 SPOC.", 327 "supported": "moderate" 328 } 329 ], 330 "methodology_tags": [ 331 "benchmark-eval" 332 ], 333 "key_findings": "SPOC (Spontaneous Self-Correction) enables LLMs to perform interleaved solution generation and verification in a single inference pass without external prompts. Using PairSFT initialization followed by online RL (RAFT or RLOO), SPOC achieves consistent improvements across Llama-3.1 8B/70B, Llama-3.3 70B, and DeepSeek-R1-Distill models on MATH500, AMC23, and AIME24 benchmarks. The strongest results come from SPOC-RLOO on DeepSeek-R1-Distill-Llama-70B, reaching 94.6% on MATH500 and 76.7% on AIME24. The Corr reward setting, which jointly rewards correct solutions and correct verifications, achieves the best MATH500 accuracy among the reward configurations in Table 4, although Last scores higher on AIME24 and All scores higher on AMC23.", 334 "red_flags": [ 335 { 336 "flag": "No statistical significance testing", 337 "detail": "All comparisons between SPOC and baselines rely on point estimate comparisons with no significance tests. On small benchmarks like AMC23 (40 problems) and AIME24 (30 problems), a single additional correct answer changes the score by 2.5% and 3.3% respectively, making reported improvements potentially indistinguishable from noise." 338 }, 339 { 340 "flag": "No variance or multi-run reporting", 341 "detail": "Results appear to be from single greedy-decoding runs. No standard deviations across random seeds or multiple training runs are reported, making it impossible to assess result stability." 342 }, 343 { 344 "flag": "Benchmark contamination risk unaddressed", 345 "detail": "MATH problems (published 2021) and AMC/AIME competition problems are widely available online. The pre-training data of Llama-3.1/3.3 and DeepSeek-R1-Distill likely includes these. The paper does not discuss this contamination risk at all." 346 }, 347 { 348 "flag": "Very small evaluation benchmarks", 349 "detail": "AMC23 has only 40 problems and AIME24 has only 30 problems. 
Reported improvements of 10-20% on AMC23 correspond to 4-8 additional correct answers, and improvements of 3.3-6.7% on AIME24 correspond to 1-2 additional correct answers. These are too few to draw reliable conclusions." 350 }, 351 { 352 "flag": "No limitations section", 353 "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries. This is a significant omission for a paper making broad claims about 'LLM Reasoning.'" 354 }, 355 { 356 "flag": "Meta evaluating Meta's Llama models", 357 "detail": "Most authors are from MetaAI, and the primary models evaluated are Meta's Llama models. This conflict of interest is not acknowledged. While DeepSeek models are also tested, the focus and framing center on Llama improvements." 358 }, 359 { 360 "flag": "Inference cost not reported", 361 "detail": "SPOC generates multiple solution-verification turns per problem, substantially increasing inference cost. The paper does not report this cost, making it impossible to assess the cost-accuracy tradeoff relative to simpler approaches like best-of-N sampling." 362 } 363 ], 364 "cited_papers": [ 365 { 366 "title": "Training language models to self-correct via reinforcement learning", 367 "authors": ["Aviral Kumar", "Vincent Zhuang", "Rishabh Agarwal"], 368 "year": 2024, 369 "arxiv_id": "2409.12917", 370 "relevance": "Directly comparable prior work on RL-based self-correction for LLMs (SCoRe), a key baseline in this paper." 371 }, 372 { 373 "title": "Self-refine: Iterative refinement with self-feedback", 374 "authors": ["Aman Madaan", "Niket Tandon", "Prakhar Gupta"], 375 "year": 2023, 376 "relevance": "Foundational work on LLM self-refinement through iterative critique, used as a baseline in this paper." 
377 }, 378 { 379 "title": "Large language models cannot self-correct reasoning yet", 380 "authors": ["Jie Huang", "Xinyun Chen", "Swaroop Mishra"], 381 "year": 2023, 382 "arxiv_id": "2310.01798", 383 "relevance": "Key prior work arguing intrinsic self-correction is ineffective without external feedback, directly motivating SPOC's approach." 384 }, 385 { 386 "title": "S2R: Teaching LLMs to self-verify and self-correct via reinforcement learning", 387 "authors": ["Ruotian Ma", "Peisong Wang", "Cheng Liu"], 388 "year": 2025, 389 "arxiv_id": "2502.12853", 390 "relevance": "Concurrent work on self-verification and self-correction via RL, directly compared as a baseline in Table 1." 391 }, 392 { 393 "title": "Self-rewarding correction for mathematical reasoning", 394 "authors": ["Wei Xiong", "Hanning Zhang", "Chenlu Ye"], 395 "year": 2025, 396 "arxiv_id": "2502.19613", 397 "relevance": "Concurrent work on self-rewarding correction for math reasoning, compared as a baseline." 398 }, 399 { 400 "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning", 401 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 402 "year": 2025, 403 "arxiv_id": "2501.12948", 404 "relevance": "Major work on RL-based reasoning improvement in LLMs, provides base models and reference results used in this paper." 405 }, 406 { 407 "title": "MALT: Improving reasoning with multi-agent LLM training", 408 "authors": ["Sumeet Ramesh Motwani", "Chandler Smith", "Rocktim Jyoti Das"], 409 "year": 2024, 410 "arxiv_id": "2412.01928", 411 "relevance": "Multi-agent LLM training approach for reasoning improvement, related work on training separate models for correction roles." 
412 }, 413 { 414 "title": "Reflexion: Language agents with verbal reinforcement learning", 415 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"], 416 "year": 2023, 417 "relevance": "Foundational agentic self-correction framework using verbal reinforcement, a key prior work in the self-correction space." 418 }, 419 { 420 "title": "RAFT: Reward ranked finetuning for generative foundation model alignment", 421 "authors": ["Hanze Dong", "Wei Xiong", "Deepanshu Goyal"], 422 "year": 2023, 423 "arxiv_id": "2304.06767", 424 "relevance": "Core RL algorithm used in SPOC's policy optimization stage." 425 }, 426 { 427 "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation", 428 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 429 "year": 2023, 430 "arxiv_id": "2308.08155", 431 "relevance": "Multi-agent framework for LLM applications, discussed as related work on multi-agent reasoning approaches." 432 }, 433 { 434 "title": "Let's verify step by step", 435 "authors": ["Hunter Lightman", "Vineet Kosaraju", "Yura Burda"], 436 "year": 2023, 437 "arxiv_id": "2305.20050", 438 "relevance": "Process reward model for step-by-step verification in math reasoning, provides the MATH500 benchmark used in evaluation." 439 }, 440 { 441 "title": "The perfect blend: Redefining RLHF with mixture of judges", 442 "authors": ["Tengyu Xu", "Eryk Helenowski", "Karthik Abinav Sankararaman"], 443 "year": 2024, 444 "arxiv_id": "2409.20370", 445 "relevance": "CGPO framework used for implementing RAFT in SPOC's training pipeline." 446 } 447 ] 448 }