scan.json (29528B)
1 { 2 "paper": { 3 "title": "RelayGen: Intra-Generation Model Switching for Efficient Reasoning", 4 "authors": ["Jiwon Song", "Yoongon Kim", "Jae-Joon Kim"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2602.06454" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "RelayGen is a training-free, segment-level runtime model switching framework that exploits difficulty variation within long reasoning trajectories. By identifying model-specific switch cues through offline probability margin analysis, it dynamically delegates low-difficulty segments to a smaller model. Combined with speculative decoding (Eagle-3), RelayGen achieves up to 2.2× end-to-end speedup with less than 2% accuracy degradation on AIME 2025, while retaining ~70% large-model utilization. Answer-stage delegation shows 99.86% consistency, and ablation confirms that empirical cue selection outperforms using all candidate cues.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "GitHub link provided in the paper header: https://github.com/jiwonsong-dev/RelayGen." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "All evaluation datasets (AIME 2025, MATH500, GPQA-Diamond) and the calibration set (AMC 2023) are publicly available benchmarks. No proprietary data was collected." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "The paper mentions vLLM version 0.13.0 and NVIDIA A100 80GB SXM GPUs, but does not provide a requirements.txt, Dockerfile, or detailed dependency list sufficient to recreate the environment." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper describes the method and implementation details (Section 5.1) but does not provide step-by-step reproduction instructions. 
The GitHub repo may contain these, but the paper itself does not." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "Table 3 reports standard deviation for latency (5 problems × 5 runs), but the main accuracy results in Table 2 report only point estimates of pass@1 with no confidence intervals or error bars, despite the stochastic evaluation (4 outputs per problem with sampling)." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "No statistical significance tests are used. All comparisons ('RelayGen outperforms Speculative Thinking', 'RelayGen achieves comparable accuracy to R2R') are based on comparing raw numbers without any tests." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Speedup ratios are reported (e.g., 2.2×), and accuracy results in Table 2 show both baseline (large model, small model) and method scores, providing full context to assess the magnitude of differences. Large-model utilization percentages are also reported." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification is given for the number of problems or runs used. The latency evaluation uses only 5 randomly sampled problems from AIME 2025. The calibration set size (160 samples from 40 problems) is not justified, though ablation in Table 5 shows robustness to smaller sizes." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "Table 3 reports standard deviation for latency measurements. However, the main accuracy results in Table 2 show single pass@1 numbers without any variance or spread across the 4 generated outputs per problem." 
61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Comparisons include small-model-only, large-model-only, Speculative Thinking (Yang et al., 2025b), and R2R (Fu et al., 2025). Eagle-3 speculative decoding is also compared for latency." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "R2R (2025), Speculative Thinking (2025b), and Eagle-3 (2025) are all very recent, contemporary baselines for inference efficiency with reasoning models." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Section 5.4 presents three ablation studies: (1) effect of switch cue selection vs. all candidates (Table 4), (2) effect of heterogeneous model pairs (Figure 6), and (3) sensitivity to calibration set size (Table 5)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper reports pass@1 accuracy (Table 2), inference speedup (Table 3), and large-model utilization percentage (Table 3), capturing both accuracy and efficiency dimensions." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation is performed. All evaluations are automated (pass@1 on benchmarks with ground truth answers, wall-clock latency). Human evaluation of reasoning trace quality could have been informative for understanding how model switching affects intermediate reasoning." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "The calibration set (AMC 2023, 40 problems) is separate from all evaluation benchmarks (AIME 2025, MATH500, GPQA-Diamond). No tuning or selection is performed on the test data." 
93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down per benchmark (MATH500, AIME 2025, GPQA-Diamond) and per model pair (Qwen3 and R1-Distill families) in Table 2." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "The Limitations section describes scenarios where the method may be less effective (non-reasoning tasks, weak small models, English-only). However, no specific failure examples or error analysis of individual problems where model switching degraded answers is provided." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Table 4 shows that using all candidate cues instead of selected cues degrades accuracy (60.00 vs 68.33 on AIME, 57.32 vs 63.64 on GPQA). Figure 6 shows meaningful accuracy drops with certain model pair configurations." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims 'up to 2.2× end-to-end speedup with less than 2% accuracy degradation.' Table 3 shows 2.20× speedup for RelayGen + Eagle-3. Table 2 shows absolute accuracy drops of 0.47pp (MATH500), 1.67pp (AIME), and 0.94pp (GPQA) for the Qwen3 pair, all under 2 percentage points." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "Causal claims (e.g., 'switch cue selection improves accuracy') are supported by controlled ablations. Table 4 isolates the effect of cue selection by comparing selected vs. all cues with other variables held constant. Figure 6 isolates the effect of model pair choice." 
120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "The Limitations section explicitly bounds generalization: 'RelayGen targets settings where long-form reasoning is explicitly externalized,' acknowledges inapplicability to non-reasoning tasks, notes the moderate capability gap assumption, and acknowledges English-only evaluation." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper does not substantively discuss alternative explanations for its main results. Section 5.4 (Figure 6) identifies that accuracy drops stem from small model capacity rather than model family incompatibility, but broader alternative explanations (e.g., cue selection overfitting to math domain, or the improvements being due to prefix caching effects rather than difficulty-aware switching) are not considered." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper measures pass@1 accuracy and inference latency/speedup, and its claims are stated at the same granularity ('preserving most of the accuracy' and 'reducing inference latency'). No proxy gap exists between measured quantities and claimed outcomes." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Specific model names with sizes are provided: Qwen3-32B, Qwen3-1.7B, R1-Distill-Qwen-32B, R1-Distill-Qwen-1.5B. These are specific open-source model checkpoints with unambiguous identities, not marketing names." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper does not show the exact prompt format used when feeding benchmark problems to the models (e.g., system prompt, chat template, or formatting of math problems). Only the method's switching mechanics are described." 
147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Section 5.1 specifies: temperature 0.6, top-p 0.95, top-k 20 (Qwen3 only), maximum generation length 32,768 tokens. States these follow recommended settings of each model family." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "RelayGen is an inference-time model switching framework, not an agentic scaffolding system. It does not use tools, retry logic, or agent-style workflows." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "The calibration pipeline is well-documented: AMC 2023 (40 problems) → 4 traces per problem via large model → probability margin extraction → discourse-level aggregation → cue selection via standard-error threshold (Section 4.2, Appendix B). Calibration overhead is quantified in Table 7." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "A dedicated 'Limitations' section is present between Section 6 (Conclusion) and Section 7 (Ethical Considerations), with three substantive paragraphs." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The limitations are specific to this study: (1) applicability limited to tasks with externalized long-form reasoning, (2) requires moderate capability gap between large and small models, (3) experiments are English-only. These are concrete, not boilerplate." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "The Limitations section explicitly states what the results do NOT show: non-reasoning/short-output tasks, scenarios where the small model lacks capacity for even low-difficulty segments, and multilingual settings." 
179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "Raw generation traces, probability margin data, and per-problem results are not released. Only aggregated results (pass@1, speedup) are reported in tables." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "The calibration data collection is described: AMC 2023, 40 problems, 4 independent traces per problem using the large model (Section 5.1, Appendix B). Evaluation uses standard public benchmarks with clear setup." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. All data sources are standard public benchmarks (AIME 2025, MATH500, GPQA-Diamond, AMC 2023)." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The offline calibration pipeline is documented in detail: large model generates traces → token-level probability margins computed → post-sentence margins aggregated per discourse-level cue → cues filtered by standard-error threshold (Section 4.2, Appendix B). Table 7 reports timing for each stage." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding disclosure, acknowledgments section, or grant information is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "All three authors are identified as affiliated with Seoul National University, with email addresses on the snu.ac.kr domain." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed, so independence cannot be assessed. 
The paper uses open-source models (Qwen3, DeepSeek R1-Distill) and the authors are academic researchers, suggesting no obvious conflict, but no explicit statement is made." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial disclosure statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff dates are stated for any of the models used (Qwen3-32B, Qwen3-1.7B, R1-Distill-Qwen-32B, R1-Distill-Qwen-1.5B)." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether benchmark problems (especially MATH500 from 2023 and GPQA-Diamond from 2024) appeared in the training data of models released in 2025." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "MATH500 (2023) was published well before the training of Qwen3 and R1-Distill models and could easily be in their training data. No contamination analysis or discussion is present." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 
267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "Inference latency is reported in detail: Table 3 shows speedup factors with standard deviations (e.g., 2.20±0.21× for RelayGen + Eagle-3). Large-model utilization percentages are also reported. Table 7 reports offline calibration wall-clock time (~100 minutes)." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Hardware is specified (two NVIDIA A100 80GB SXM GPUs). Calibration overhead is quantified in Table 7 (80 min for trace generation + 20 min for margin extraction = 100 min total, one-time cost)." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No seed sensitivity analysis is reported for the main accuracy results. The paper generates 4 outputs per problem using sampling (temperature 0.6) but does not report variability across seeds or runs for Table 2 accuracy numbers." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "Section 5.1 states 'we generate four outputs per problem and report pass@1.' Section 5.3 states '5 problems, 5 runs per problem' for latency evaluation." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "The switch cue selection uses a threshold criterion (post-sentence margin > global average + 1 standard error), but the choice of this threshold method and the standard-error criterion is not justified against alternatives. No hyperparameter search budget is reported." 
306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": true, 310 "justification": "The switch cue selection criterion is principled and clearly described: cues are selected if their post-sentence probability margin exceeds the global average by at least one standard error (Section 4.2). Selection is based on the calibration set, separate from test data." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "No discussion of potential bias from the authors evaluating their own system. It is unclear whether baseline implementations (R2R, Speculative Thinking) use official code or author re-implementations, and no acknowledgment of author-evaluation bias is present." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": true, 325 "justification": "Table 3 reports both speedup and large-model utilization for each method, directly showing the performance-compute trade-off. The paper explicitly discusses how R2R achieves low utilization but modest speedup due to routing overhead, while RelayGen achieves comparable speedup with higher utilization." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether AIME 2025, MATH500, or GPQA-Diamond actually measure the reasoning capabilities claimed. The benchmarks are used without questioning their construct validity." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": true, 335 "justification": "All methods are compared using the same base model pairs (Qwen3-32B/1.7B or R1-Distill-32B/1.5B). 
The switching mechanism IS the variable under study, and comparisons are properly controlled — different switching strategies applied to the same models." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "No discussion of whether Qwen3 or R1-Distill models were trained on data that includes MATH500 (2023) or GPQA-Diamond (2024) solutions. AIME 2025 is likely post-training, but this is not verified." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup (e.g., prompt formatting, benchmark presentation) leaks information not available in real usage." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of potential overlap between calibration data (AMC 2023) and evaluation benchmarks, or between benchmark problems and training data. AMC and AIME are from the same organization (MAA) and math competition domain." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No concrete leakage detection or prevention methods are used (no canary strings, membership inference, n-gram analysis, or decontamination)." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "RelayGen combined with speculative decoding achieves up to 2.2× end-to-end speedup with less than 2% accuracy degradation", 364 "evidence": "Table 3 shows RelayGen + Eagle-3 achieves 2.20±0.21× speedup. Table 2 shows absolute accuracy drops of 0.47pp (MATH500), 1.67pp (AIME 2025), and 0.94pp (GPQA-Diamond) for the Qwen3 pair. 
However, the accuracy is measured for RelayGen alone, not for RelayGen + Eagle-3 combined (speculative decoding is theoretically lossless).", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Segment-level switching preserves most of the large model's accuracy while outperforming Speculative Thinking on all benchmarks", 369 "evidence": "Table 2: RelayGen achieves 94.80 vs 95.27 (large) on MATH500, 68.33 vs 70.00 on AIME, 63.64 vs 64.58 on GPQA for Qwen3 pair. Outperforms Speculative Thinking across all six benchmark-model pair combinations.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Answer-stage delegation from large to small model is nearly lossless, with 99.86% answer consistency", 374 "evidence": "Table 1: 727/728 answers match when Qwen3-0.6B generates the answer stage conditioned on Qwen3-32B reasoning traces across MATH500, AIME 2025, and GPQA-Diamond.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Empirical switch cue selection is critical — using all candidate cues degrades accuracy significantly", 379 "evidence": "Table 4: Selected cues achieve 68.33/63.64 (AIME/GPQA) vs 60.00/57.32 for all candidates, showing 8.33pp and 6.32pp drops when cue selection is removed.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "RelayGen is robust to calibration set size, maintaining performance even with 10 samples", 384 "evidence": "Table 5: With 10 calibration samples, pass@1 is 70.00 (AIME) and 61.87 (GPQA), comparable to or exceeding the 160-sample configuration (68.33/63.64). However, this non-monotonic behavior is unexplained.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Token-level routing (R2R) achieves limited speedup despite low large-model utilization due to per-token routing overhead", 389 "evidence": "Table 3: R2R uses only 19.27% large-model tokens but achieves only 1.30× speedup, while RelayGen uses 69.80% large-model tokens and achieves comparable 1.29× speedup. 
The per-token routing overhead explanation is discussed in Section 5.3 and Appendix C.", 390 "supported": "moderate" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "No error bars on main accuracy results", 396 "detail": "Table 2 reports pass@1 accuracy as point estimates without any uncertainty quantification, despite using stochastic sampling (temperature 0.6, 4 outputs per problem). On AIME 2025 with only 30 problems, small absolute differences (e.g., 68.33 vs 70.00) could be within noise. Table 3 (latency) does include standard deviations." 397 }, 398 { 399 "flag": "Tiny latency evaluation sample", 400 "detail": "Inference latency (Table 3) is evaluated on only 5 randomly sampled AIME 2025 problems with 5 runs each. This is a very small sample for generalizing speedup claims, and problem selection could significantly affect results." 401 }, 402 { 403 "flag": "No contamination discussion despite using pre-2025 benchmarks", 404 "detail": "MATH500 (2023) and GPQA-Diamond (2024) were published before the likely training cutoffs of the 2025 models (Qwen3, R1-Distill). The paper never discusses whether benchmark problems could be in training data. While contamination would affect all methods equally and not invalidate relative comparisons, the absolute pass@1 numbers may be inflated." 405 }, 406 { 407 "flag": "Calibration domain matches evaluation domain", 408 "detail": "Offline calibration uses AMC 2023 (math competition), and 2 of 3 evaluation benchmarks are also math (AIME 2025, MATH500). Switch cues may be overfit to math reasoning discourse. The one non-math benchmark (GPQA-Diamond) shows somewhat larger accuracy drops for the R1-Distill pair (56.82 vs 60.61), though this could also be due to model capacity." 
409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 414 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 415 "year": 2023, 416 "arxiv_id": "2305.05176", 417 "relevance": "Foundational work on LLM cost reduction via model routing and cascading strategies." 418 }, 419 { 420 "title": "RouteLLM: Learning to Route LLMs with Preference Data", 421 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"], 422 "year": 2024, 423 "arxiv_id": "2406.18665", 424 "relevance": "Input-level LLM routing using preference data, a key baseline category for model selection approaches." 425 }, 426 { 427 "title": "R2R: Efficiently Navigating Divergent Reasoning Paths with Small-Large Model Token Routing", 428 "authors": ["Tianyu Fu", "Yi Ge", "Yichen You"], 429 "year": 2025, 430 "arxiv_id": "2505.21600", 431 "relevance": "Token-level routing between small and large models during reasoning, directly compared as a baseline." 432 }, 433 { 434 "title": "Speculative Thinking: Enhancing Small-Model Reasoning with Large Model Guidance at Inference Time", 435 "authors": ["Wang Yang", "Xiang Yue", "Vipin Chaudhary", "Xiaotian Han"], 436 "year": 2025, 437 "arxiv_id": "2504.12329", 438 "relevance": "Step-level model switching for reasoning using predefined lexical cues, directly compared as a baseline." 439 }, 440 { 441 "title": "SplitReason: Learning to Offload Reasoning", 442 "authors": ["Yash Akhauri", "Anthony Fei", "Chi-Chih Chang"], 443 "year": 2025, 444 "arxiv_id": "2504.16379", 445 "relevance": "Training-based approach for learning when to hand off reasoning generation between models via control tokens." 
446 }, 447 { 448 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 449 "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"], 450 "year": 2025, 451 "arxiv_id": "2501.12948", 452 "relevance": "Key large reasoning model demonstrating RL-based training for extended reasoning, source of R1-Distill models used in evaluation." 453 }, 454 { 455 "title": "OpenAI o1 System Card", 456 "authors": ["Aaron Jaech", "Adam Kalai"], 457 "year": 2024, 458 "arxiv_id": "2412.16720", 459 "relevance": "System card for OpenAI's reasoning model that pioneered inference-time scaling for reasoning tasks." 460 }, 461 { 462 "title": "Stop Overthinking: A Survey on Efficient Reasoning for Large Language Models", 463 "authors": ["Yang Sui", "Yu-Neng Chuang", "Guanchu Wang"], 464 "year": 2025, 465 "arxiv_id": "2503.16419", 466 "relevance": "Survey of efficient reasoning methods for LLMs, covering the broader landscape of inference-time efficiency approaches." 467 }, 468 { 469 "title": "Efficient Inference for Large Reasoning Models: A Survey", 470 "authors": ["Yue Liu", "Jiaying Wu", "Yufei He"], 471 "year": 2025, 472 "arxiv_id": "2503.23077", 473 "relevance": "Survey of inference efficiency methods specifically for reasoning models." 474 }, 475 { 476 "title": "Eagle-3: Scaling Up Inference Acceleration of Large Language Models via Training-Time Test", 477 "authors": ["Yuhui Li", "Fangyun Wei", "Chao Zhang", "Hongyang Zhang"], 478 "year": 2025, 479 "arxiv_id": "2503.01840", 480 "relevance": "Speculative decoding acceleration method combined with RelayGen for composable speedup." 481 }, 482 { 483 "title": "The Avengers: A Simple Recipe for Uniting Smaller Language Models to Challenge Proprietary Giants", 484 "authors": ["Yiqun Zhang", "Hao Li", "Chenxu Wang"], 485 "year": 2025, 486 "arxiv_id": "2505.19797", 487 "relevance": "Ensemble approach for combining smaller LLMs, related to model collaboration for efficiency." 
488 }, 489 { 490 "title": "R-Stitch: Dynamic Trajectory Stitching for Efficient Reasoning", 491 "authors": ["Zhuokun Chen", "Zeren Chen", "Jiahao He"], 492 "year": 2025, 493 "arxiv_id": "2507.17307", 494 "relevance": "Token-level trajectory stitching for efficient reasoning, a concurrent approach to intra-generation model switching." 495 } 496 ] 497 }