scan.json (19888B)
1 { 2 "scan_version": 2, 3 "active_modules": [], 4 "paper": { 5 "title": "Routing, Cascades, and User Choice for LLMs", 6 "authors": ["Rafid Mahmood"], 7 "year": 2026, 8 "venue": "ICLR 2026", 9 "arxiv_id": "2602.09902" 10 }, 11 "methodology_tags": ["theoretical"], 12 "key_findings": "In a Stackelberg game between an LLM provider (two models) and a user, optimal routing policies almost always collapse to simple threshold-based static routing with no cascading. Cascading is rarely optimal except in a narrow regime where models are sufficiently differentiated in net value. Provider-user misalignment arises when their model rankings by utility versus cost-of-pass disagree. When churn penalties are low, providers are incentivized to throttle latency to reduce costs, harming user utility.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No code repository or plotting scripts are provided. The paper has computational figures (heatmaps in Figures 3-5) but no code is released." 19 }, 20 "data_released": { 21 "applies": false, 22 "answer": false, 23 "justification": "Purely theoretical paper. There is no dataset — all results are derived from closed-form mathematical expressions." 24 }, 25 "environment_specified": { 26 "applies": false, 27 "answer": false, 28 "justification": "Purely theoretical paper. Numerical examples are deterministic computations from closed-form equations, requiring no special computational environment." 29 }, 30 "reproduction_instructions": { 31 "applies": false, 32 "answer": false, 33 "justification": "Purely theoretical paper. The proofs are self-contained in the paper and appendices. Figures can be reproduced from the stated equations." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": false, 39 "answer": false, 40 "justification": "Purely theoretical paper with no experiments. 
All results are mathematical proofs, not empirical measurements." 41 }, 42 "significance_tests": { 43 "applies": false, 44 "answer": false, 45 "justification": "No experiments or comparative empirical claims. Results are proven theorems." 46 }, 47 "effect_sizes_reported": { 48 "applies": false, 49 "answer": false, 50 "justification": "No experiments. Theoretical characterizations are exact, not estimated." 51 }, 52 "sample_size_justified": { 53 "applies": false, 54 "answer": false, 55 "justification": "No experiments or samples. All results are analytical." 56 }, 57 "variance_reported": { 58 "applies": false, 59 "answer": false, 60 "justification": "No experimental runs. Results are deterministic mathematical derivations." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": false, 66 "answer": false, 67 "justification": "Purely theoretical paper. No system is evaluated against baselines. The paper characterizes equilibria of a game-theoretic model." 68 }, 69 "baselines_contemporary": { 70 "applies": false, 71 "answer": false, 72 "justification": "No empirical evaluation with baselines." 73 }, 74 "ablation_study": { 75 "applies": false, 76 "answer": false, 77 "justification": "No system with components to ablate. The paper is a mathematical analysis of a game." 78 }, 79 "multiple_metrics": { 80 "applies": false, 81 "answer": false, 82 "justification": "No empirical evaluation. The paper characterizes user utility and provider cost as optimization objectives within the model." 83 }, 84 "human_evaluation": { 85 "applies": false, 86 "answer": false, 87 "justification": "Purely theoretical paper. Human evaluation is irrelevant to the mathematical claims." 88 }, 89 "held_out_test_set": { 90 "applies": false, 91 "answer": false, 92 "justification": "No data or test sets. All results are analytical." 93 }, 94 "per_category_breakdown": { 95 "applies": false, 96 "answer": false, 97 "justification": "No empirical evaluation to break down by category. 
The paper does characterize results across parameter regimes (Theorems 2-5), but this is part of the theoretical analysis, not evaluation design." 98 }, 99 "failure_cases_discussed": { 100 "applies": false, 101 "answer": false, 102 "justification": "No system is being evaluated. The paper does characterize misalignment and throttling as adverse outcomes within the model, but this is part of the theoretical contribution, not failure analysis of an evaluated system." 103 }, 104 "negative_results_reported": { 105 "applies": false, 106 "answer": false, 107 "justification": "No experiments. The paper's finding that cascading is rarely optimal is part of the theoretical characterization, not a negative experimental result." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "All abstract claims are supported by formal proofs: static routing optimality (Theorems 3-5), misalignment gap (Proposition 1), and throttling incentives (Proposition 2). Each claim maps directly to a proven result." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper makes causal claims about incentives and optimal strategies (e.g., 'providers are incentivized to throttle latency'). These are derived rigorously from the formal game-theoretic model via mathematical proof, which is adequate for causal inference within the model's assumptions." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract states 'single-provider, single-user interactions.' Section 7 explicitly bounds the scope: two models only, subscription pricing (not pay-per-call), observable provider strategy, stationary abandonment policy, i.i.d. success probabilities, and known parameters." 
125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 4.1 discusses what happens under imperfect information ('beliefs drive the same threshold logic'). Section 4.2 discusses extension to heterogeneous tasks with task-dependent parameters. Section 7 discusses how different modeling choices (hidden strategies, non-stationary policies, non-i.i.d. success) might affect results." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": false, 133 "answer": false, 134 "justification": "Purely theoretical paper with no empirical measurements. The paper defines its quantities (utility, cost, net value) precisely within the model." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": false, 140 "answer": false, 141 "justification": "No LLM models are used in experiments. The paper is a mathematical analysis. LLM usage for writing assistance (Appendix A) is not part of the experimental setup." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "The paper does not use prompting as part of its methodology. It is a theoretical analysis." 147 }, 148 "hyperparameters_reported": { 149 "applies": false, 150 "answer": false, 151 "justification": "No experiments with hyperparameters. The model parameters (pi, ci, ti, V, P) are symbolic and their ranges are analyzed theoretically." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding used. Purely theoretical paper." 157 }, 158 "data_preprocessing_documented": { 159 "applies": false, 160 "answer": false, 161 "justification": "No data collected or preprocessed. All results are derived analytically." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 7 contains a substantive limitations paragraph: 'We note several limitations and next steps' followed by four specific limitations — two-model restriction, subscription-only pricing, observable strategy assumption, and i.i.d. success assumption." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The limitations are specific to this study: 'our analysis is of two models,' 'our subscription framing abstracts away per-query monetary prices,' 'we assume that a user can observe the provider cascade strategy,' 'we treat the per-pass success as i.i.d.' These are not generic boilerplate." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 7 explicitly states what the results do NOT show: does not cover more than two models, pay-per-call pricing, hidden routing strategies, adaptive abandonment, non-i.i.d. success, or unknown parameters requiring online learning." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": false, 184 "answer": false, 185 "justification": "Purely theoretical paper. No empirical data to verify." 186 }, 187 "data_collection_described": { 188 "applies": false, 189 "answer": false, 190 "justification": "No data collection. All results are mathematical derivations." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No participants and no data source beyond the mathematical model." 196 }, 197 "data_pipeline_documented": { 198 "applies": false, 199 "answer": false, 200 "justification": "No data pipeline. Results are analytical." 
201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Acknowledgments section states: 'This work was funded partially by the Natural Sciences and Engineering Research Council of Canada.'" 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliation is listed as 'University of Ottawa & NVIDIA.' NVIDIA is a major LLM infrastructure provider, and this is transparently disclosed." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": true, 217 "justification": "NSERC is a public Canadian funding agency with no commercial interest in LLM routing outcomes. While the author is affiliated with NVIDIA, NVIDIA is not listed as a funder." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present. The author is affiliated with NVIDIA, which has commercial interests in LLM inference infrastructure, but this potential conflict is not explicitly declared beyond the affiliation line." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "The paper does not evaluate any pre-trained model's capability on a benchmark. It is a purely theoretical game-theoretic analysis." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "No model evaluation on any benchmark. Purely theoretical." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "No benchmark evaluation. Purely theoretical." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants. Purely theoretical paper." 
247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "Purely theoretical paper. No system with inference costs to report." 284 }, 285 "compute_budget_stated": { 286 "applies": false, 287 "answer": false, 288 "justification": "Purely theoretical paper. No significant computation beyond plotting." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "User patience is governed by the sign of net value ξi = Vpi − ti: when both models are value-dominated or both latency-dominated, user behavior is static regardless of routing policy.", 295 "evidence": "Theorems 1 and 2 (Section 4.1) provide formal proofs. Theorem 1 shows q*(2,s) = 1{ξ2 < 0}. Theorem 2 shows q*(1,s) = 0 when ξ1, ξ2 ≥ 0 and q*(1,s) = 1 when ξ1, ξ2 ≤ 0.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "Optimal routing almost always collapses to simple threshold-based static policies with no cascading.", 300 "evidence": "Theorems 3-5 (Section 4.2) characterize the provider-optimal policy. Theorem 3 shows that not cascading is optimal when ξ1 and ξ2 share the same sign. Theorem 4 shows cascading is optimal only in specific regimes for ξ1 < 0 < ξ2. 
Theorem 5 provides sufficient conditions for static policies when ξ1 > 0 > ξ2. Figure 4 visualizes that cascading is only optimal in narrow parameter regions.", 301 "supported": "strong" 302 }, 303 { 304 "claim": "Misalignment between provider and user arises when the user's model utility ranking and the provider's cost-of-pass ranking disagree.", 305 "evidence": "Proposition 1 (Section 5) formally characterizes when ΔU = 0 for each regime. Condition 1 shows alignment requires sign(c1/p1 − c2/p2) = sign(ξ2/p2 − ξ1/p1). Figure 5 visualizes misalignment gaps.", 306 "supported": "strong" 307 }, 308 { 309 "claim": "When churn penalties are sufficiently low (P ≤ min{c1/p1, c2/p2}), providers benefit from throttling latency, reducing user utility.", 310 "evidence": "Proposition 2 (Section 6) proves that setting t̂_i > Vpi yields J*_post ≤ J*_pre. Full proof in Appendix C.4. Figure 5 (Right) visualizes the gain from throttling as a function of P and cost-of-pass difference.", 311 "supported": "strong" 312 }, 313 { 314 "claim": "Cascading is valuable only when the two models are sufficiently differentiated in user net value.", 315 "evidence": "Theorem 4 shows cascading is optimal only in the regime ξ1 < 0 < ξ2 with c1/p1 < c2/p2 and P > P2. Theorem 5 shows that, when ξ1 > 0 > ξ2, cascading is optimal only for low P. When models are undifferentiated (both value- or latency-dominated), Theorem 3 shows cascading is never optimal.", 316 "supported": "strong" 317 } 318 ], 319 "red_flags": [], 320 "cited_papers": [ 321 { 322 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 323 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 324 "year": 2023, 325 "arxiv_id": "2305.05176", 326 "relevance": "Foundational work on LLM routing to reduce cost while maintaining performance, directly extended by this paper's game-theoretic analysis." 
327 }, 328 { 329 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 330 "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang"], 331 "year": 2024, 332 "relevance": "Proposes routing by predicting task difficulty and allocating to cheaper models; this paper adds the user-behavior dimension missing from such algorithms." 333 }, 334 { 335 "title": "RouteLLM: Learning to Route LLMs from Preference Data", 336 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"], 337 "year": 2025, 338 "relevance": "Learns routing policies from preference data; relevant to the survey's coverage of LLM inference optimization and routing." 339 }, 340 { 341 "title": "A Unified Approach to Routing and Cascading for LLMs", 342 "authors": ["Jasper Dekoninck", "Maximilian Baader", "Martin Vechev"], 343 "year": 2025, 344 "relevance": "Proposes unified optimization for routing and cascading; this paper's finding that cascading is rarely optimal contrasts with that approach." 345 }, 346 { 347 "title": "RouterBench: A Benchmark for Multi-LLM Routing System", 348 "authors": ["Qitian Jason Hu", "Jacob Bieker", "Xiuyu Li"], 349 "year": 2024, 350 "arxiv_id": "2403.12031", 351 "relevance": "Benchmark for evaluating LLM routing systems, relevant to the survey's coverage of evaluation methodology for routing." 352 }, 353 { 354 "title": "Cost-of-Pass: An Economic Framework for Evaluating Language Models", 355 "authors": ["Mehmet Hamza Erol", "Batu El", "Mirac Suzgun"], 356 "year": 2025, 357 "arxiv_id": "2504.13359", 358 "relevance": "Formalizes cost-of-pass metric for LLM evaluation, directly used and extended in this paper's provider cost model." 359 }, 360 { 361 "title": "Pricing and Competition for Generative AI", 362 "authors": ["Rafid Mahmood"], 363 "year": 2024, 364 "relevance": "Prior work by the same author on LLM pricing economics; this paper extends the single-model framework to routing between two models." 
365 }, 366 { 367 "title": "The Economics of Large Language Models: Token Allocation, Fine-tuning, and Optimal Pricing", 368 "authors": ["Dirk Bergemann", "Alessandro Bonatti", "Alex Smolin"], 369 "year": 2025, 370 "relevance": "Economic analysis of LLM deployment including user engagement and pricing, relevant to the survey's coverage of LLM operational economics." 371 }, 372 { 373 "title": "Human-AI Interactions and Societal Pitfalls", 374 "authors": ["Francisco Castro", "Jian Gao", "Sébastien Martin"], 375 "year": 2023, 376 "arxiv_id": "2309.10448", 377 "relevance": "Models multi-round human-AI interaction dynamics; foundational for this paper's user abandonment model." 378 }, 379 { 380 "title": "The Burden of Interactive Alignment with Inconsistent Preferences", 381 "authors": ["Ali Shirali"], 382 "year": 2025, 383 "relevance": "Studies interactive alignment as a Stackelberg game with costly user signals; closely related game-theoretic approach to human-AI interaction." 384 }, 385 { 386 "title": "Large Language Model Cascades with Mixture of Thoughts Representations for Cost-efficient Reasoning", 387 "authors": ["Murong Yue", "Jie Zhao", "Min Zhang"], 388 "year": 2023, 389 "arxiv_id": "2310.03094", 390 "relevance": "Proposes LLM cascade architectures for cost-efficient reasoning, relevant to the cascading mechanisms analyzed in this paper." 391 }, 392 { 393 "title": "Game Theory Meets Large Language Models: A Systematic Survey", 394 "authors": ["Haoran Sun", "Yusen Wu", "Yukun Cheng"], 395 "year": 2025, 396 "relevance": "Survey of game theory applied to LLMs; contextualizes this paper within the broader game-theoretic LLM literature." 397 } 398 ] 399 }