scan-v5.json (25586B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Federate the Router: Learning Language Model Routers with Sparse and Decentralized Evaluations", 6 "authors": [ 7 "Baris Askin", 8 "Shivam Patel", 9 "Anupam Nayak", 10 "Andrea Vigano", 11 "Jiin Woo" 12 ], 13 "year": 2026, 14 "venue": "arXiv.org", 15 "arxiv_id": "2601.22318", 16 "doi": "10.48550/arXiv.2601.22318" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "All abstract claims are backed by experimental results: federated improvement over client-local routers (Figures 2–3), accuracy-cost frontier gains (AUC comparisons across all clients), and theoretical suboptimality bounds (Section 5 and Appendix G).", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Causal claims ('federated training improves...') are supported by controlled comparisons holding all conditions constant except federation, evaluated on held-out test sets across two independent benchmarks.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The conclusion states federated learning 'offers a practical foundation for training LLM routers from privacy-sensitive, fragmented data,' but all experiments use a simulated N=10 client setup with artificial Dirichlet partitioning—no discussion of the simulation-to-deployment gap.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper does not discuss whether simple data pooling (rather than the federated algorithm itself) could explain the gains; Appendix D.1 shows federated ≈ centralized but does not analyze whether dataset size alone drives improvements.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper directly measures routing accuracy (model correctness on actual queries) and inference cost, which are precisely what is claimed—no proxy conflation.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions only future work (online routing) but does not systematically address limitations.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No specific threats are discussed—e.g., that Dirichlet partitioning may not reflect real heterogeneity, that the uniform logging assumption required for K-Means theory is violated in experiments, or that N=10 may not scale.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "No explicit scope boundaries state what the results do NOT show—e.g., behavior with hundreds of clients, real privacy attacks, or non-stationary query distributions.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Acknowledgements explicitly list US DOE grant DESC0025652, NSF grants CNS-2409138, CNS-2533813, CCF 2045694, CNS-2112471, CPS-2111751, ONR N00014-23-1-2149, and the AI2C Seed grant.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All authors are listed as Carnegie Mellon University affiliates in the author block.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": true, 87 "justification": "All funding sources are US government agencies (DOE, NSF, ONR) and an academic seed grant, independent of commercial LLM routing outcomes.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial interests statement appears anywhere in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are formally defined: 'routing policy' (Section 3, Eq. 4), 'utility' (acc(x,m) − λ·cost(x,m), Eq. 1), 'suboptimality' (Definition 5.2), and the federated data model (Eq. 2).", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Four explicit bullet-point contributions are listed at the end of Section 1: problem formulation, federated training procedures for both router families, theoretical guarantees, and empirical evaluation.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 2 and Appendix A provide detailed engagement with both FL and LLM routing literature, explicitly positioning this as the first FL-routing combination and distinguishing from cascading, speculative decoding, and bandit-based approaches.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": false, 124 "justification": "No code repository is linked or mentioned anywhere in the paper.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper uses RouterBench-Data (Hu et al., 2024) and ProxRouter-Data (Patel et al., 2025), both publicly available prior benchmarks used unmodified.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "No requirements.txt, Dockerfile, or software environment specification is provided; only the MLP architecture and optimizer hyperparameters are described.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "Appendix C provides experimental details but no step-by-step reproduction instructions; key implementation decisions (e.g., federated simulation code, embedding pipeline) are not specified.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All AUC scores and accuracy-cost curves are reported as single point estimates with no confidence intervals, error bars, or standard deviations across runs.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are used for any comparative claims; improvements are reported as raw AUC differences without p-values or hypothesis tests.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "AUC differences are reported in absolute terms throughout (e.g., federated 0.75 vs. client-local mean 0.69 for MLP on RouterBench), providing effect size context.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The choice of N=10 clients and 75/25 train-test split is not justified with any power analysis or sensitivity study.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "No variance across random seeds or Dirichlet partition realizations is reported; all results appear to be single-run point estimates.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Client-local (no-FL) routers serve as the primary baseline; centralized training (oracle with full data pooled) is compared in Appendix D.1.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Client-local training is the natural competitive alternative, and centralized training is the appropriate oracle upper bound; both are competitive and contemporarily relevant.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "The paper ablates adaptive personalization (Section 6.4), different sentence encoder models (Appendix E), and new-model/new-client extension scenarios (Sections 6.3, D.3).", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Both accuracy and inference cost are measured as primary metrics, reported jointly as accuracy-cost frontier curves with normalized AUC as a scalar summary.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "This is a routing systems paper; human evaluation of model outputs is not relevant to routing quality assessment.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "Each client uses a 75/25 train-test split; global test set is the union of all client test splits (Appendix C).", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Per-client breakdowns are provided for all 10 clients in Figures 10–11 (RouterBench) and Figures 17–18 (ProxRouter).", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "Section 6.4 explicitly shows cases where federated MLP-Router underperforms client-local routers under extreme heterogeneity (α=0.03), e.g., Client 6 (federated 0.71 < local 0.73), motivating adaptive personalization.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper directly reports that federated training can hurt some clients under high heterogeneity, and Figure 5 shows specific cases where client-local outperforms federated.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "RouterBench-Data includes evaluations on specifically versioned models (GPT-3.5 Turbo 1106, GPT-4 1106 Preview, Claude v1/v2/Instant, Llama 2 70B Chat, etc.) shown in Figure 8; sentence encoder all-mpnet-base-v2 is also specified.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": false, 243 "answer": false, 244 "justification": "No new LLM querying occurs during experiments; the paper uses pre-existing evaluation datasets from RouterBench and ProxRouter, so no prompts are generated.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Appendix C reports AdamW lr=10^-3, weight decay=3×10^-4, batch size=128, gradient clip norm=1.0, Klocal=15, Kglobal=20, 3 K-means restarts, participation rate=0.6, and λ sweep grid.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": true, 255 "answer": true, 256 "justification": "FedAvg (Algorithm 1) and federated K-Means (Algorithm 2) are described step-by-step with full pseudocode covering all communication rounds, local update steps, and aggregation.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Dirichlet partitioning parameters (α=0.6 main, α=0.03 extreme, α=0.45 model assignment), 75/25 split, and the federated simulation protocol are documented in Appendices B and C.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "RouterBench-Data and ProxRouter-Data are publicly available benchmarks with references; raw evaluation data can be obtained from the original sources.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "The federated simulation protocol is described in detail in Section 6 and Appendices B–C, including Dirichlet partitioning and model assignment procedures.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; all data comes from existing NLP evaluation benchmarks.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The full pipeline from benchmark data → Dirichlet client partitioning → federated training (Algorithms 1–2) → evaluation is documented across Section 6 and Appendices B–C.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "The paper trains routing models (MLPs and K-Means) over pre-existing evaluation data, not LLM capabilities—contamination of the routing model itself is not a meaningful concern.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": false, 301 "answer": false, 302 "justification": "N/A—the routing model is not an LLM evaluated on downstream benchmarks.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": false, 307 "answer": false, 308 "justification": "N/A—the routing model is not an LLM; contamination of underlying LLM evaluations in RouterBench is out of scope.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": true, 360 "justification": "Inference cost is one of the two primary metrics throughout all experiments; accuracy-cost frontier curves explicitly report average API cost per query in dollars.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "The acknowledgements mention PSC Bridges-2 GPU via ACCESS allocation CIS250087 but do not report total GPU-hours or compute cost for the experiments.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Federated training consistently improves the global accuracy-cost frontier over all client-local routers for both MLP and K-Means families.", 375 "evidence": "Figure 2: federated AUC 0.75 vs. client-local range 0.63–0.72 (MLP) and 0.75 vs. 0.55–0.70 (K-Means) on RouterBench-Data; replicated on ProxRouter-Data (Figure 15).", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Federated learning improves in-distribution local performance via better effective model coverage under sparse, imbalanced evaluations.", 380 "evidence": "Figures 3 and 10–11: all 10 clients show improved local-test AUC; mean AUC improves 0.69→0.74 (MLP) and 0.64→0.75 (K-Means) across clients.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Federated routers match centralized (oracle) training performance despite operating under decentralized data constraints.", 385 "evidence": "Figure 9: MLP-Federated AUC 0.75 = MLP-Centralized 0.75; K-Means-Federated 0.75 = K-Means-Centralized 0.75 on RouterBench-Data.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Adaptive personalization improves robustness under extreme client heterogeneity (α=0.03) where global federated routing underperforms some clients.", 390 "evidence": "Figure 5: personalized MLP improves over federated for Clients 4 (0.72 vs 0.69) and 6 (0.72 vs 0.71); mean personalized AUC 0.75 = federated 0.75 across all clients (Figure 13).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Both router types support lightweight extension to new models and new clients without full retraining.", 395 "evidence": "Figure 4: adding 3 withheld models improves MLP AUC 0.732→0.748 and K-Means 0.732→0.749; Figure 12 shows similar gains after 3 new clients join.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Theoretical analysis proves federated routing reduces suboptimality from O(1/√Di) (local) to O(1/√D) (federated) where D >> Di.", 400 "evidence": "Theorems 5.3 and 5.5 bound suboptimality with formal proofs in Appendix G under realizability, smoothness, and bounded heterogeneity assumptions.", 401 "supported": "strong" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "theoretical" 407 ], 408 "key_findings": "This paper introduces the first federated learning framework for LLM query routing, enabling decentralized clients to collaboratively train routing policies over private, heterogeneous query-model evaluation data without centralizing raw queries. Federated training achieves the accuracy-cost frontier of centralized (oracle) training while consistently outperforming all client-local baselines on RouterBench-Data and ProxRouter-Data across both MLP and K-Means router families. An adaptive personalization mechanism that interpolates between federated and local routers based on calibration error provides robustness when global collaboration misaligns with individual client distributions under extreme heterogeneity. Formal suboptimality bounds (Theorems 5.3, 5.5) confirm that federated aggregation reduces routing error at rate O(1/√D) versus O(1/√Di) for local training, with the gain proportional to the degree of complementary model coverage across clients.", 409 "red_flags": [ 410 { 411 "flag": "Simulated federation only", 412 "detail": "The federated setting is entirely simulated via Dirichlet partitioning of existing benchmark datasets with N=10 clients; no real federated deployment is tested, leaving a material gap between simulation and practice that is never discussed." 413 }, 414 { 415 "flag": "No statistical significance testing", 416 "detail": "All comparative claims are made without confidence intervals, error bars, or p-values. AUC differences of 0.01–0.05 are treated as meaningful improvements without accounting for run-to-run variance." 417 }, 418 { 419 "flag": "Single-run results", 420 "detail": "No variance across random seeds or Dirichlet partition realizations is reported; all results depend on a single experimental run, making stability of findings unclear." 421 }, 422 { 423 "flag": "Theory-experiment assumption mismatch", 424 "detail": "The K-Means suboptimality bound (Theorem 5.5) requires uniform model logging (Assumption G.14), but experiments explicitly use highly non-uniform Dirichlet model assignment (α=0.45). The theoretical guarantee does not cover the actual experimental setting." 425 }, 426 { 427 "flag": "No limitations section", 428 "detail": "No dedicated limitations or threats-to-validity section exists; the paper omits discussion of scalability beyond 10 clients, the simulation-reality gap, non-stationary query distributions, or communication overhead in real deployments." 429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "Communication-Efficient Learning of Deep Networks from Decentralized Data (FedAvg)", 434 "relevance": "Foundational federated learning algorithm (McMahan et al., 2017) that this paper directly adapts as Algorithm 1 for federated MLP-Router training." 435 }, 436 { 437 "title": "RouterBench: A Benchmark for Multi-LLM Routing System", 438 "relevance": "Primary evaluation dataset providing query-model accuracy and cost evaluations for 11 LLMs across 8 datasets; the main experimental benchmark throughout the paper." 439 }, 440 { 441 "title": "RouteLLM: Learning to Route LLMs from Preference Data", 442 "relevance": "Key prior work on learning LLM routers from evaluation data; representative of the centralized routing paradigm this federated framework is designed to complement." 443 }, 444 { 445 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 446 "relevance": "Related parametric routing approach; cited as prior centralized router work that assumes access to centralized evaluation data." 447 }, 448 { 449 "title": "ProxRouter: Proximity-Weighted LLM Query Routing for Improved Robustness to Outliers", 450 "relevance": "Provides the second evaluation benchmark (ProxRouter-Data) with 14 LLMs over 10 datasets used for cross-benchmark validation." 451 }, 452 { 453 "title": "Universal LLM Routing with Correctness-Based Representation", 454 "relevance": "Related nonparametric routing approach; cited as prior K-Means-style routing that this paper extends to the federated setting." 455 }, 456 { 457 "title": "Advances and Open Problems in Federated Learning", 458 "relevance": "Comprehensive survey (Kairouz et al., 2021) cited to frame statistical heterogeneity and open challenges that motivate this work's federated formulation." 459 } 460 ], 461 "engagement_factors": { 462 "practical_relevance": { 463 "score": 3, 464 "justification": "Directly addresses a real enterprise/edge deployment problem: privacy-preserving LLM routing where organizations cannot share sensitive query data centrally." 465 }, 466 "surprise_contrarian": { 467 "score": 1, 468 "justification": "Applying federated learning to LLM routing is novel, but the result that FL helps when data is fragmented is expected; no surprising reversals or counterintuitive findings." 469 }, 470 "fear_safety": { 471 "score": 1, 472 "justification": "Privacy protection is a core motivation (client queries remain local), touching on data safety concerns, but no AI safety or broader risk framing." 473 }, 474 "drama_conflict": { 475 "score": 0, 476 "justification": "Straightforward systems/theory paper with no controversy or conflict with prior work claims." 477 }, 478 "demo_ability": { 479 "score": 1, 480 "justification": "Code is not released, limiting immediate demonstrability; RouterBench data is public and the algorithm is well-specified, but reproduction requires non-trivial implementation effort." 481 }, 482 "brand_recognition": { 483 "score": 1, 484 "justification": "Carnegie Mellon University is well-regarded for ML systems research, but this is not from a major AI lab (Google, OpenAI, Meta, etc.)." 485 } 486 }, 487 "hn_data": { 488 "threads": [ 489 { 490 "hn_id": "46925532", 491 "title": "Convergent Discovery of Critical Phenomena Mathematics Across Disciplines", 492 "points": 4, 493 "comments": 3, 494 "url": "https://news.ycombinator.com/item?id=46925532", 495 "created_at": "2026-02-07T17:16:44Z" 496 } 497 ], 498 "top_points": 4, 499 "total_points": 4, 500 "total_comments": 3 501 } 502 }