scan.json (19412B)
1 { 2 "paper": { 3 "title": "Confidence-Guided Stepwise Model Routing for Cost-Efficient Reasoning", 4 "authors": ["Sangmook Lee", "Dohyung Kim", "Hyukhun Koh", "Nakyeong Yang", "Kyomin Jung"], 5 "year": 2025, 6 "venue": "AAAI 2026", 7 "arxiv_id": "2511.06190" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL or code archive link found in the paper." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "All benchmarks used (MATH500, OmniMath, AIME, MuSiQue, ACPBench, KOR-Bench) are publicly available datasets." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions using vLLM but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. Algorithm 1 describes the method but not how to run experiments." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Tables 1 and 2 report only point estimates for accuracy and FLOPs with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims STEER outperforms baselines but provides no statistical significance tests. Comparisons rest solely on the reported point estimates." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports percentage improvements with baseline context, e.g., '+20% accuracy with 48% less FLOPs compared to solely using the larger model on AIME', and provides full baseline and method numbers in tables." 
47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for benchmark sizes or number of runs. The paper does not discuss whether the sample sizes are sufficient for the claims made." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. All results appear to be single-run numbers." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares against RSD, Damani et al. (2025), and SpecReason, plus small-only and large-only baselines (Tables 1 and 2)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "All baselines are from 2024-2025 (RSD, SpecReason, Damani et al. 2025), representing recent work in the cost-efficient reasoning space." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 4.3 includes ablations on confidence measure selection (Figure 3), group size variations (Table 3), and larger model usage pattern analysis (Figure 5)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper reports Accuracy, FLOPs, and Accuracy-per-FLOPs (A/F) across all experiments." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "The paper evaluates automated reasoning benchmarks with ground-truth answers; human evaluation is not relevant to the claims." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "All benchmarks used are standard test sets (MATH500, AIME, OmniMath, MuSiQue, ACPBench, KOR-Bench). The GMM is fit on the same benchmark questions at inference time, but no training/tuning is done on these sets." 
89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per benchmark (Tables 1, 2), by difficulty level (Figure 5 on OmniMath), and by step position." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses where STEER has lower accuracy than RSD on Qwen2.5-Math-Instruct (Section 4.2), and analyzes larger model usage on incorrect solutions (Figure 5)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports cases where STEER does not outperform RSD (e.g., Qwen2.5-Math-Instruct on math benchmarks) and where accuracy drops relative to the large model only." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims '+20% accuracy with 48% less FLOPs on AIME' which is supported by Table 1 (Gemma3 AIME: 15.8 vs 17.5 accuracy, 30.6 vs 33.8 FLOPs — though the 20% figure seems to be relative to a different comparison point). The claim of outperforming trained external module baselines is supported in most but not all settings." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims through ablation studies (confidence measure selection in Figure 3, group size in Table 3) which use controlled single-variable manipulation. The claim that confidence-based routing causes cost reduction is supported by the framework design." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The abstract claims 'domain-agnostic' routing, but results are on 6 benchmarks across only math, QA, and planning. The paper does not bound claims to these specific domains and generalizes to 'cost-efficient reasoning' broadly." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for why STEER works, such as whether the performance gains could be attributed to the specific threshold tuning rather than the confidence signal itself." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper uses 'Gemma3-Instruct 4B/12B', 'Qwen2.5-Instruct 1.5B/7B', and 'Qwen2.5-Math-Instruct 1.5B/7B' without specifying exact version snapshots or release dates." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes using prompts for reasoning but does not provide the actual prompt text used in experiments. The appendix references (A, B) are not included in the available text." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Temperature=0.7 is specified. The threshold γ is selected via grid search with gap 0.1. The GMM fitting procedure is described. vLLM is specified as the inference engine." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The step-level routing mechanism is described in detail in Section 3, with formal algorithm (Algorithm 1), confidence estimation (Section 3.1), GMM-based routing (Section 3.2), and the full STEER procedure (Section 3.3)." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "The paper documents benchmark selection, exclusion of Operation and Puzzles subsets from KOR-Bench with justification, and exclusion of model families from certain benchmarks with reasons stated." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "No dedicated limitations or threats-to-validity section found in the paper." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show or which settings are excluded from the claims." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw experimental data (per-question results, confidence scores, routing decisions) is made available." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The benchmarks are standard and well-referenced. AIME questions are described as 'collected from 2022 through 2025'. Benchmark details are in Section 4.1." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; all data comes from standard benchmarks." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The paper describes the full pipeline: benchmark selection, model selection with exclusion criteria, threshold grid search procedure, and FLOPs computation method." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Section 7 (Acknowledgments) lists IITP grants funded by the Korea government (MSIT) and thanks LYWAY for experimental support." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "All authors are from Seoul National University, clearly stated in the paper header." 
204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funding is from Korean government research grants (IITP/MSIT) which have no financial stake in the specific routing method's performance. LYWAY provided API support but the paper does not evaluate LYWAY products." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper does not state the training data cutoff dates for the models used (Gemma3, Qwen2.5)." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether MATH500 or other benchmarks may have been in the training data of the models evaluated." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "MATH500 (2021) and other benchmarks predate the models used, raising contamination risk that is not addressed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 
258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "FLOPs per query are reported for all methods across all benchmarks (Tables 1, 2). Routing latency comparison is in Figure 4." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No mention of total GPU hours, hardware used, or total computational budget for the experiments." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "STEER achieves +20% accuracy with 48% less FLOPs compared to solely using the larger model on AIME", 286 "evidence": "Table 1 shows Gemma3-Instruct STEER on AIME: 15.8% accuracy, 30.6 FLOPs vs 12B only: 17.5% accuracy, 33.8 FLOPs. The numbers don't directly show +20% accuracy; the claim likely refers to comparison with 4B only (10.8% accuracy).", 287 "supported": "weak" 288 }, 289 { 290 "claim": "STEER outperforms baselines that rely on trained external modules across diverse benchmarks", 291 "evidence": "Tables 1 and 2 show STEER achieves highest average A/F in most settings. 
However, RSD outperforms STEER on Qwen2.5-Math-Instruct math benchmarks in accuracy.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "STEER is robust to changes in backbone LLM family", 296 "evidence": "Experiments on both Gemma3-Instruct and Qwen2.5 families (Tables 1, 2) show STEER works across model families, unlike RSD which degrades on Gemma3.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "GMM routing latency is over an order of magnitude lower than baseline approaches", 301 "evidence": "Figure 4 shows routing latency comparison on MATH500 benchmark.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "Maximum logit is the most effective confidence signal for routing", 306 "evidence": "Figure 3 compares logit, entropy, and maximum probability on MATH500, showing logit yields highest accuracy especially at higher routing percentiles.", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "STEER proposes using logit-based confidence scores with Gaussian Mixture Models for step-level routing between small and large LLMs during reasoning, without requiring trained external models. Across 6 benchmarks spanning math, QA, and planning tasks, STEER achieves competitive accuracy while reducing inference FLOPs by 10-48% compared to using only the larger model. The method shows greater cross-model robustness than PRM-based baselines like RSD, which degrade when applied to model families different from their training data.", 312 "red_flags": [ 313 { 314 "flag": "No variance or error bars", 315 "detail": "All results appear to be single-run numbers with no standard deviation, confidence intervals, or multiple-seed averaging. Given the stochastic nature of LLM generation with temperature=0.7, results could vary substantially across runs." 
316 }, 317 { 318 "flag": "Questionable abstract claim", 319 "detail": "The abstract claims '+20% accuracy with 48% less FLOPs on AIME' but Table 1 shows STEER at 15.8% vs 12B-only at 17.5% accuracy on AIME with Gemma3. The +20% claim is unclear in its reference point and may be cherry-picked." 320 }, 321 { 322 "flag": "No limitations section", 323 "detail": "The paper has no dedicated limitations or threats-to-validity discussion, which is a notable omission for an empirical methods paper." 324 }, 325 { 326 "flag": "Benchmark contamination risk unaddressed", 327 "detail": "MATH500 (2021) and other benchmarks predate the models used. No discussion of whether contamination affects the absolute accuracy numbers, though the relative comparisons between methods may be less affected." 328 }, 329 { 330 "flag": "Threshold tuning on test set", 331 "detail": "Grid search over threshold values is performed selecting the 'best-performing value' — it is unclear if this is done on the test set itself, which would inflate results for all methods including STEER." 332 } 333 ], 334 "cited_papers": [ 335 { 336 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 337 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 338 "year": 2023, 339 "relevance": "Foundational work on cost-efficient LLM inference through model cascading and routing." 340 }, 341 { 342 "title": "Reward-Guided Speculative Decoding for Efficient LLM Reasoning", 343 "authors": ["Baohao Liao"], 344 "year": 2025, 345 "relevance": "Key baseline (RSD) for step-level model routing using process reward models." 346 }, 347 { 348 "title": "Learning How Hard to Think: Input-Adaptive Allocation of LM Computation", 349 "authors": ["Mehul Damani"], 350 "year": 2025, 351 "relevance": "Key baseline for query-level model allocation using trained difficulty estimators." 
352 }, 353 { 354 "title": "RouteLLM: Learning to Route LLMs from Preference Data", 355 "authors": ["Isaac Ong"], 356 "year": 2024, 357 "relevance": "Trained router for LLM model selection, relevant to cost-efficient LLM deployment." 358 }, 359 { 360 "title": "Beyond Chinchilla-Optimal: Accounting for Inference in Language Model Scaling Laws", 361 "authors": ["Nikhil Sardana"], 362 "year": 2024, 363 "relevance": "Scaling laws accounting for inference costs, foundational to the cost-efficiency motivation." 364 }, 365 { 366 "title": "Stop overthinking: A survey on efficient reasoning for large language models", 367 "authors": ["Yuan Sui"], 368 "year": 2025, 369 "relevance": "Survey of efficient reasoning methods for LLMs, directly relevant to the survey scope." 370 }, 371 { 372 "title": "CoT-Valve: Length-Compressible Chain-of-Thought Tuning", 373 "authors": ["Xinyin Ma"], 374 "year": 2025, 375 "relevance": "Method for compressing chain-of-thought reasoning to reduce inference cost." 376 }, 377 { 378 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 379 "authors": ["Dujian Ding"], 380 "year": 2024, 381 "relevance": "Query routing between different-sized LLMs for cost efficiency." 382 }, 383 { 384 "title": "Detecting hallucinations in large language models using semantic entropy", 385 "authors": ["Sebastian Farquhar"], 386 "year": 2024, 387 "relevance": "Semantic entropy for LLM uncertainty quantification, relevant to confidence estimation approaches." 388 }, 389 { 390 "title": "Deepseek-r1: Incentivizing reasoning capability in llms via reinforcement learning", 391 "authors": ["Daya Guo"], 392 "year": 2025, 393 "relevance": "Major reasoning LLM using RL-based test-time computation, motivating the need for cost-efficient reasoning." 394 } 395 ] 396 }