scan.json (26780B)
1 { 2 "paper": { 3 "title": "Dynamic Mix Precision Routing for Efficient Multi-step LLM Interaction", 4 "authors": [ 5 "Yuanzhe Li", 6 "Jianing Deng", 7 "Jingtong Hu", 8 "Tianlong Chen", 9 "Song Wang", 10 "Huanrui Yang" 11 ], 12 "year": 2026, 13 "venue": "arXiv", 14 "arxiv_id": "2602.02711" 15 }, 16 "scan_version": 2, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "Sensitivity to model quantization in multi-step agentic tasks is concentrated at a small number of critical decision steps. A lightweight 2-layer Transformer router trained via KL-divergence supervision and GRPO can dynamically select between high- and low-precision LLMs at each step, achieving near full-precision performance (88.8% vs 89.6% on ALFWorld for Qwen3-8B) while using only ~27% high-precision calls. GRPO improves routing efficiency on larger models but provides no gains on the smallest model (Qwen3-1.7B), suggesting routing effectiveness is bounded by the high-precision model's capability.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper uses ALFWorld, a publicly available benchmark environment (Shridhar et al., 2021). No proprietary data was collected." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper specifies model names and quantization methods (GPTQ 3-bit/4-bit) but provides no requirements.txt, Dockerfile, or detailed environment setup with library versions." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided." 
41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "Table 1 and Table 2 report only point estimates for success rate, high-precision ratio, and GHC. No confidence intervals, error bars, or ± notation appear anywhere." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims routing 'achieves a great improvement' and 'consistently achieves superior performance–cost trade-offs' but provides no statistical significance tests — all comparisons are based on raw number differences." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Table 1 provides success rates for all methods alongside their high-precision ratios, and the GHC metric quantifies improvement magnitude per unit of high-precision cost. For example, the router achieves 88.8% (vs 82.8% quantized-only baseline) with 26.7% HP ratio, giving sufficient context to understand effect magnitude." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "No justification for the number of evaluation episodes used. The number of ALFWorld test tasks is not stated explicitly, nor is any power analysis provided." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No standard deviations, variance across seeds, or spread measures are reported. All results appear to be single-run point estimates." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Table 1 compares against full-precision BF16, quantized-only (GPTQ), and random routing at 20/40/60/80% high-precision ratios across four model configurations." 
75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": false, 79 "justification": "The baselines are only random routing at various ratios and single-precision inference. No comparison with existing routing methods from the related work (FrugalGPT, RouteLLM, HybridLLM, BEST-Route, Router-R1) despite extensive discussion of them in Section 2." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Table 2 ablates KL-ST vs GRPO-only vs KL-ST+GRPO. Table 3 ablates KL-ST training data scale (100/200/300/400 episodes). Both are systematic single-variable manipulations." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Three metrics are reported: success rate, high-precision usage ratio, and Gain per High-Precision Call (GHC). These capture effectiveness, cost, and their trade-off." 90 }, 91 "human_evaluation": { 92 "applies": false, 93 "answer": false, 94 "justification": "The paper evaluates routing in an automated simulator (ALFWorld) with binary task success. Human evaluation is not relevant to the claims." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 5.1 states 'we evaluate on the unseen test task following Yao et al. (2022b),' using ALFWorld's standard unseen test split." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": false, 104 "justification": "ALFWorld has 6 task types (pick, clean, heat, cool, examine, pick-two) but no per-task-type breakdown is provided. Results are only broken down by model configuration, not by task category." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Figure 1 and Appendix A.2 provide detailed case studies of trajectories where the low-precision model fails at critical steps while the router succeeds, with step-by-step traces." 
110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Table 2 shows GRPO provides no improvement on Qwen3-1.7B (identical results to KL-ST). Section 5.3 discusses this: 'the effectiveness of routing-based optimization is bounded by the expressiveness and capability of the high-precision model itself.'" 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims 'great improvement on accuracy–cost trade-off over single-precision baselines and heuristic routing methods.' Table 1 supports this: the router achieves the highest GHC across all model configurations." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The ablation study (Table 2) uses controlled single-variable manipulation: KL-ST only, GRPO only, and KL-ST+GRPO. This is an adequate design for the causal claims about component contributions." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title claims 'Multi-step LLM Interaction' generally, and Section 1 claims the framework is for 'long-horizon agentic tasks.' But experiments are only on ALFWorld — no evaluation on other agentic benchmarks (WebArena, ScienceWorld, SWE-bench) despite citing them in related work." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "No discussion of alternative explanations for the results. For example, whether the router merely learns to predict task difficulty rather than precision sensitivity, or whether simple heuristics (e.g., step position in trajectory) could explain the routing decisions." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper directly measures what it claims: task success rate and high-precision usage ratio. 
The GHC metric is explicitly defined as combining these two. No proxy gap exists between measurement and claim." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": false, 148 "justification": "Models are specified as 'Qwen3-8B', 'Qwen3-4B', 'Qwen3-1.7B', and 'DeepSeek-R1-Distill-Llama3-8B' without specific checkpoint versions, HuggingFace model IDs, or snapshot dates. 'Post-training form released by the authors' is vague." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "The paper uses ReAct-style prompting for ALFWorld but does not provide the actual prompt text. Only interaction traces (action/observation pairs) are shown in examples." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Appendix B reports training hyperparameters: learning rate 1e-4, batch size 64, 5 epochs for KL-ST; learning rate 1e-6, βKL=0.02, STRONG_COST=0.02 for GRPO. Router threshold between 78th-85th percentile of KL distribution." 159 }, 160 "scaffolding_described": { 161 "applies": true, 162 "answer": true, 163 "justification": "The routing framework is described in detail: Section 4.1 covers the 2-layer Transformer encoder architecture, step-level state representation, positional embeddings, and masked pooling. Section 3 formalizes the routing decision process. Figures 2 and 3 show the architecture." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 4.2 describes the trajectory sampling protocol: high-precision rollouts, retention of successful trajectories only, step-wise KL computation, CDF-based thresholding to binary labels, and class-specific weighting for imbalance." 
169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": false, 175 "justification": "No dedicated limitations, threats to validity, or discussion section exists. The conclusion (Section 6) is 4 sentences with no discussion of limitations." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": false, 180 "justification": "No threats to validity are discussed anywhere in the paper." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "No explicit statements about what the results do NOT show. The paper does not acknowledge limitations of testing on only one benchmark or one task domain." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw data (trajectories, KL divergence values, routing decisions) is released for independent verification." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 4.2 describes the trajectory sampling protocol in detail: 200 episodes from high-precision rollouts, filtering for successful trajectories, computing action distributions from both precision levels at each step." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. The data source is ALFWorld, a standard public benchmark." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline is documented: high-precision rollouts → filter successful trajectories → compute step-wise KL → apply CDF threshold for binary labels → class-weighted training. GRPO pipeline (120 episodes, K rollouts per instance) is also described in Appendix B." 
208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding information or acknowledgments section is present in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: University of Arizona, University of Pittsburgh, University of North Carolina at Chapel Hill, University of Central Florida. All academic institutions." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "No funding is disclosed, so independence cannot be assessed. The paper is a multi-university collaboration where funding would typically exist." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial disclosure statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for Qwen3 or DeepSeek-R1-Distill models." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether the models' training data could include ALFWorld task descriptions, solutions, or related content." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "ALFWorld was published in 2021. All models used (Qwen3, DeepSeek-R1-Distill) were trained after 2021. No contamination analysis is provided." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 
259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "High-precision usage ratio is reported for all methods (Table 1), which directly measures the fraction of expensive inference calls. The GHC metric explicitly captures cost-effectiveness. However, no actual wall-clock time or dollar costs are provided." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Training uses 200 episodes for KL-ST and 120 for GRPO (Appendix B), but no GPU hours, wall-clock time, or hardware specifications are reported." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No mention of multiple random seeds. All results appear to be single-seed runs." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of evaluation runs per configuration is never stated. Results are presented as point estimates without clarifying how many runs produced them." 
308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "The KL threshold τ is 'manually selected based on the empirical distribution' (Appendix B) between 78th-85th percentiles. No systematic search budget is reported." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Appendix B states: 'Model selection is performed by choosing the checkpoint that achieves the highest validation accuracy across all training epochs.' Selection criterion and split are specified." 318 }, 319 "multiple_comparison_correction": { 320 "applies": false, 321 "answer": false, 322 "justification": "No statistical significance tests are performed, so multiple comparison correction does not apply." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The baselines (random routing) are all implemented by the authors. No discussion of self-evaluation bias or comparison with independently implemented baselines." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": true, 332 "justification": "Figure 5 and Table 1 plot success rate against high-precision usage ratio, which directly measures compute allocation. The GHC metric normalizes performance gain by compute cost." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "Section 5.1 asserts ALFWorld is 'a representative benchmark for evaluating agentic ability' but does not critically assess whether ALFWorld's text-based household tasks actually measure the 'long-horizon decision making' capability claimed." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "All comparisons use the same ReAct-based interaction framework with ALFWorld. The only variable is precision routing — the scaffold is held constant across all conditions." 
343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "No discussion of whether the models' training data included ALFWorld-related content or solutions from after the benchmark's release." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup leaks information through context that would not be available in genuine deployment." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether training episodes (used for router training) and test episodes share structural similarities or task templates." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No leakage detection or prevention method is applied." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "The router achieves near full-precision performance with less than 30% high-precision calls on Qwen3-8B", 371 "evidence": "Table 1: Router achieves 88.8% success rate with 26.7% high-precision ratio vs BF16's 89.6% at 100%. GHC of 19.85 vs best random routing GHC of 8.5.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "Sensitivity to quantization is concentrated at a small number of critical decision steps", 376 "evidence": "Figure 4 shows the step-wise KL divergence distribution between 3-bit and BF16 Qwen3-8B is highly skewed, with most steps near zero and a heavy tail of high-divergence steps.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "GRPO further improves routing efficiency over KL-ST alone", 381 "evidence": "Table 2: On Qwen3-4B, GHC improves from 26.43 (KL-ST) to 43.02 (KL-ST+GRPO). On Qwen3-8B, from 18.79 to 19.85. 
On DeepSeek, from 30.61 to 38.77. However, there is no improvement on Qwen3-1.7B (both 21.95).", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "A lightweight 2-layer Transformer encoder is sufficient for step-level precision routing", 386 "evidence": "Section 4.1 and results in Table 1 demonstrate the architecture works. However, no comparison with larger routers, simpler routers, or other architectures is provided to establish that 2 layers is the right size.", 387 "supported": "weak" 388 }, 389 { 390 "claim": "The effectiveness of routing is bounded by the high-precision model's capability", 391 "evidence": "Table 2: GRPO provides the largest GHC improvement on Qwen3-4B (highest base success rate at 93.3% BF16) and zero improvement on Qwen3-1.7B (lowest at 69.4% BF16). Section 5.3 discusses this pattern.", 392 "supported": "moderate" 393 } 394 ], 395 "red_flags": [ 396 { 397 "flag": "Single benchmark evaluation", 398 "detail": "All experiments are on ALFWorld only, despite the paper citing multiple agentic benchmarks (WebArena, ScienceWorld, SWE-bench) in related work. Claims about 'long-horizon agentic tasks' generally are not supported by a single text-based household environment." 399 }, 400 { 401 "flag": "No error bars or variance reporting", 402 "detail": "All results in Tables 1-3 are point estimates with no standard deviations, confidence intervals, or indication of how many runs produced each number. Given that ALFWorld episodes involve stochastic sampling, results could vary significantly across runs." 403 }, 404 { 405 "flag": "Missing comparison with existing routing methods", 406 "detail": "The related work section discusses FrugalGPT, RouteLLM, HybridLLM, BEST-Route, and Router-R1 extensively, but none are included as baselines. Only random routing (a trivially weak baseline) is compared." 407 }, 408 { 409 "flag": "No limitations section", 410 "detail": "The paper contains no limitations, threats to validity, or future work discussion. 
The 4-sentence conclusion acknowledges no weaknesses." 411 }, 412 { 413 "flag": "GHC metric can mask absolute performance gaps", 414 "detail": "For Qwen3-4B, the router achieves 81.3% success (with GHC 43.02) while Random@60% achieves 88.8% success (GHC 18.67). The high GHC rewards extreme cost reduction but the router's success rate is 7.5 percentage points lower than an achievable configuration." 415 } 416 ], 417 "cited_papers": [ 418 { 419 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 420 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R. Narasimhan", "Yuan Cao"], 421 "year": 2022, 422 "relevance": "Foundational LLM reasoning-and-acting framework used as the agent backbone in this study." 423 }, 424 { 425 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 426 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"], 427 "year": 2023, 428 "relevance": "Key work on LLMs learning to use external tools, relevant to agentic AI capability evaluation." 429 }, 430 { 431 "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?", 432 "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig"], 433 "year": 2024, 434 "arxiv_id": "2310.06770", 435 "relevance": "Major benchmark for evaluating LLM agents on real-world software engineering tasks." 436 }, 437 { 438 "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering", 439 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"], 440 "year": 2024, 441 "relevance": "LLM agent system for software engineering, representative of agentic AI capabilities." 442 }, 443 { 444 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 445 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 446 "year": 2024, 447 "relevance": "Pioneering cascading framework for cost-efficient LLM inference, directly comparable to routing approaches." 
448 }, 449 { 450 "title": "RouteLLM: Learning to Route LLMs with Preference Data", 451 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"], 452 "year": 2025, 453 "arxiv_id": "2406.18665", 454 "relevance": "Learned routing policies between LLMs using preference data, a direct precursor to this work." 455 }, 456 { 457 "title": "Can Compressed LLMs Truly Act? An Empirical Evaluation of Agentic Capabilities in LLM Compression", 458 "authors": ["Peiyan Dong", "Zhangchi Tang", "Xinyu Liu"], 459 "year": 2025, 460 "relevance": "Empirical study showing significant performance degradation when quantized LLMs are used in agentic settings — the core motivation for this paper." 461 }, 462 { 463 "title": "Quantization Meets Reasoning: Exploring LLM Low-Bit Quantization Degradation for Mathematical Reasoning", 464 "authors": ["Zhen Li", "Yupeng Su", "Runming Yang"], 465 "year": 2025, 466 "arxiv_id": "2501.03035", 467 "relevance": "Studies quantization degradation in LLM reasoning, complementary evidence for precision sensitivity." 468 }, 469 { 470 "title": "BEST-Route: Adaptive LLM Routing with Test-Time Optimal Compute", 471 "authors": ["Dujian Ding", "Ankur Mallick", "Shuai Zhang"], 472 "year": 2025, 473 "relevance": "LLM routing with test-time compute allocation, directly comparable routing approach." 474 }, 475 { 476 "title": "Router-R1: Teaching LLMs Multi-Round Routing and Aggregation via Reinforcement Learning", 477 "authors": ["Hao Zhang", "Tao Feng", "Jieyu You"], 478 "year": 2025, 479 "arxiv_id": "2506.09033", 480 "relevance": "Formulates routing as sequential decision process with RL, the closest prior work to this paper's approach." 481 }, 482 { 483 "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models", 484 "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"], 485 "year": 2024, 486 "relevance": "Open-ended LLM agent for embodied tasks, representative of the agentic AI paradigm this paper targets." 
487 }, 488 { 489 "title": "Efficient Agents: Building Effective Agents While Reducing Cost", 490 "authors": ["Nan Wang", "Xuemei Hu", "Pengfei Liu"], 491 "year": 2025, 492 "arxiv_id": "2508.02694", 493 "relevance": "Directly addresses cost reduction for LLM agents, the same problem space as this paper." 494 }, 495 { 496 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 497 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"], 498 "year": 2023, 499 "relevance": "LLM agent self-improvement through verbal feedback, relevant to agentic AI methodology." 500 } 501 ] 502 }