scan-v5.json (18299B)
1 { 2 "scan_version": 5, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "Towards Fair and Comprehensive Evaluation of Routers in Collaborative LLM Systems", 6 "authors": [ 7 "Wanxin Wu", 8 "He Zhu", 9 "Yixia Li", 10 "Lei Yang", 11 "Jie Zhao" 12 ], 13 "year": 2026, 14 "venue": "arXiv", 15 "arxiv_id": "2602.11877", 16 "doi": null 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "The 16.68% and 18.86% relative improvement claims are supported by Tables 1 and 2; generalization across model families is supported by Table 5 and Figure 5.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Causal claims like 'diverse training improves robustness' and 'hidden states outperform output-based signals' are supported by controlled ablation studies (Table 3, Table 4, Table 6) that vary one factor at a time.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The title claims 'fair and comprehensive evaluation' but the main experiments use a single small-large model pair (Llama-3.1-8B + GPT-5); the limitations section acknowledges this but the abstract still claims generality across diverse settings.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper attributes hidden-state superiority solely to capturing pre-generation uncertainty, but does not discuss alternative explanations such as the advantage being due to model-specific probing rather than a general principle.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper explicitly addresses the conflation problem — separating intrinsic router ability (AUROC) from end-to-end system performance, which is the core methodological contribution of the framework.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": true, 55 "justification": "A dedicated 'Limitations' section appears before the References, discussing the single model-pair constraint and single-run reporting.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": true, 61 "justification": "Specific threats noted include single small-large model pair, single random seed (seed=42), single-run results due to compute, and the model-convergence failure mode analyzed in Appendix D.2.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": true, 67 "justification": "The limitations section explicitly states conclusions are bounded to a single model pair and warns that 'broader validation across diverse architectures, multiple seeds, and more complex OOD conditions would further strengthen the conclusions.'", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding acknowledgment section appears anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are listed in the header: Southern University of Science and Technology, Institut Polytechnique de Paris, Peking University, Deepexi Technology Co. Ltd., University of Edinburgh, Beihang University, and Chinese University of Hong Kong.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funding source is disclosed, making this criterion not applicable.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement or declaration of financial interests appears in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key terms are defined with mathematical precision: router, AUROC, LPM, MPM, HCR, in-distribution, out-of-distribution, and the edge-cloud collaboration setting are all explicitly defined in Section 3.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper clearly states two contributions: (1) RouterXBench evaluation framework with three-dimensional metrics, and (2) ProbeDirichlet router using internal hidden states with Dirichlet aggregation.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The related work section covers LLM routing, LLM collaboration, and uncertainty estimation, and Section 3.2 specifically analyzes limitations of prior metrics (FrugalGPT, HybridLLM, RouteLLM) as motivation.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "benchmark-creation": { 120 "construct_design": { 121 "construct_validity_argued": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper explicitly argues that AUROC measures routing ability independent of the large model's strength, and that LPM/MPM/HCR measure scenario-specific alignment — addressing the conflation problem in existing metrics with mathematical formalization.", 125 "source": "haiku" 126 }, 127 "difficulty_distribution_characterized": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper mentions a 'difficulty gradient' conceptually ('simpler benchmarks such as Alpaca, Magpie, to more challenging ones like MMLU, Big-Math, and MATH') but provides no quantitative characterization of difficulty distribution within or across benchmarks.", 131 "source": "haiku" 132 }, 133 "ceiling_floor_effects_checked": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper does not explicitly check for ceiling or floor effects; AUROC values ranging from ~47% to ~74% suggest adequate discrimination, but this is never explicitly verified or discussed.", 137 "source": "haiku" 138 }, 139 "human_baseline_included": { 140 "applies": true, 141 "answer": false, 142 "justification": "No human baseline is included or discussed for any of the six benchmarks used in the evaluation framework.", 143 "source": "haiku" 144 }, 145 "scoring_rubric_justified": { 146 "applies": true, 147 "answer": false, 148 "justification": "AUROC is well-justified as threshold-independent, but the specific thresholds for scenario alignment (25-30% call rate for LPM, 85-95% relative performance for HCR) are stated as deployment scenarios without principled empirical or domain-specific justification.", 149 "source": "haiku" 150 } 151 }, 152 "robustness": { 153 "contamination_resistance_designed": { 154 "applies": true, 155 "answer": false, 156 "justification": "RouterXBench builds on existing public benchmarks (MMLU, Alpaca, etc.) without any contamination resistance measures such as temporal splits, canary strings, or dynamic generation.", 157 "source": "haiku" 158 }, 159 "temporal_robustness_discussed": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper does not discuss whether the benchmark will remain useful as models improve or whether existing benchmarks will be saturated, nor is there a plan for updates.", 163 "source": "haiku" 164 }, 165 "failure_modes_discussed": { 166 "applies": true, 167 "answer": true, 168 "justification": "Appendix D.2 provides a case study where routing fails because both small and large models converge on the same wrong answer, and the limitations section discusses this as a fundamental gap in routing frameworks.", 169 "source": "haiku" 170 }, 171 "baseline_implementations_provided": { 172 "applies": true, 173 "answer": true, 174 "justification": "Code is publicly available at https://github.com/zhuchichi56/RouterXBench, and Appendix A and B provide implementation details including fixed random seed, training setup, and data preparation.", 175 "source": "haiku" 176 } 177 }, 178 "documentation": { 179 "dataset_documentation_complete": { 180 "applies": true, 181 "answer": false, 182 "justification": "Table 7 provides basic statistics for the six datasets, and Appendix B describes data preparation, but there is no formal data card; ground-truth construction methodology using xVerify and GPT-5-as-Judge is described but not validated.", 183 "source": "haiku" 184 }, 185 "licensing_and_access_clear": { 186 "applies": true, 187 "answer": false, 188 "justification": "A GitHub link is provided for the code but no license is specified; the underlying datasets are public but their licenses are not discussed in the context of the framework.", 189 "source": "haiku" 190 }, 191 "intended_use_specified": { 192 "applies": true, 193 "answer": true, 194 "justification": "The three-dimensional evaluation structure (router ability vs. scenario alignment vs. cross-domain robustness) makes clear what should and should not be concluded from each metric, and the limitations section specifies what the benchmark does not cover.", 195 "source": "haiku" 196 } 197 } 198 } 199 }, 200 "claims": [ 201 { 202 "claim": "ProbeDirichlet achieves 16.68% relative improvement over the best baseline in router ability (AUROC) on in-domain and OOD scenarios.", 203 "evidence": "Table 1 shows ProbeDirichlet AUROC averages of 68.70 (in-domain) and 65.46 (OOD) vs. EmbeddingMLP at 59.46 and 55.22.", 204 "supported": "strong" 205 }, 206 { 207 "claim": "ProbeDirichlet achieves 18.86% relative improvement in high-accuracy (HCR) scenarios over the best baseline.", 208 "evidence": "Table 2 HCR rows show ProbeDirichlet averages of 18.50 (in-domain) and 15.40 (OOD) vs. SemanticEntropy at 15.17 and 13.35.", 209 "supported": "strong" 210 }, 211 { 212 "claim": "Internal hidden states outperform both output-based (logit/verbose) and external embedding-based routing signals.", 213 "evidence": "Table 4 directly compares Longformer embeddings, LLM embeddings, and LLM hidden states using identical linear classifiers, showing hidden states outperform by large margins particularly on math tasks.", 214 "supported": "strong" 215 }, 216 { 217 "claim": "Diverse multi-domain training yields additive gains without interference between domains.", 218 "evidence": "Table 6 shows that adding BigMath training preserves Alpaca performance (71.85→71.96) while improving BigMath (49.19→66.49) and OOD tasks.", 219 "supported": "strong" 220 }, 221 { 222 "claim": "Linear probe architecture is sufficient; adding hidden layers degrades generalization without improving performance.", 223 "evidence": "Figure 3 shows MLP variants with 16-128 hidden dimensions do not improve AUROC over the linear baseline but exhibit larger train-validation loss gaps.", 224 "supported": "moderate" 225 }, 226 { 227 "claim": "Existing routing metrics (static and curve-based) are inadequate for fair comparison because they conflate router ability with large model strength.", 228 "evidence": "Figure 1 (right) illustrates that router rankings reverse under small threshold shifts, but this illustration uses a stylized example rather than real experimental data.", 229 "supported": "moderate" 230 }, 231 { 232 "claim": "ProbeDirichlet generalizes across Llama and Qwen model families with consistent improvements over EmbeddingMLP baselines.", 233 "evidence": "Table 5 shows ProbeDirichlet outperforms EmbeddingMLP across Llama-3.1-8B and Qwen2.5 (0.5B, 3B, 7B), with average improvements of ~10.5% in-domain and ~9.6% OOD.", 234 "supported": "moderate" 235 } 236 ], 237 "methodology_tags": [ 238 "benchmark-eval", 239 "observational" 240 ], 241 "key_findings": "RouterXBench proposes a three-dimensional evaluation framework (router ability via AUROC, scenario alignment via LPM/MPM/HCR, cross-domain robustness) that disentangles intrinsic routing ability from end-to-end system performance, exposing limitations in prior single-metric evaluations. ProbeDirichlet, a lightweight router using internal hidden-state representations aggregated via Dirichlet distributions, achieves 16.68% and 18.86% relative improvements over the best baselines in router ability and high-accuracy scenarios. The primary driver of generalization is training data diversity rather than architectural complexity: diverse multi-domain training yields additive gains across domains without interference. Single-run experiments on one small-large model pair (Llama-3.1-8B + GPT-5) limit the strength of these conclusions.", 242 "red_flags": [ 243 { 244 "flag": "single model pair", 245 "detail": "All main experiments use only Llama-3.1-8B as the small model and GPT-5 as the large model; the framework's claims about generality rest on a single routing configuration." 246 }, 247 { 248 "flag": "single run, no variance", 249 "detail": "Appendix A explicitly states 'we report single-run results for all experiments' with a fixed seed=42, providing no statistical significance estimates or confidence intervals for any reported improvement." 250 }, 251 { 252 "flag": "circular evaluation: GPT-5 as both large model and judge", 253 "detail": "GPT-5 serves as the large model being routed to AND as the LLM-as-a-Judge evaluator for open-ended tasks (Alpaca, Magpie). Footnote 3 acknowledges this but dismisses it, creating potential circular bias in ground-truth label construction." 254 }, 255 { 256 "flag": "arbitrary scenario thresholds", 257 "detail": "The thresholds defining scenario alignment (25-30% call rate for LPM, 85-95% relative performance for HCR) are presented as deployment scenarios but are chosen without empirical or domain-specific justification." 258 }, 259 { 260 "flag": "no license for released code", 261 "detail": "The GitHub repository is mentioned but no software license is specified, limiting the clarity of reuse rights." 262 } 263 ], 264 "cited_papers": [ 265 { 266 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 267 "relevance": "Foundational paper on cost-aware LLM routing; RouterXBench's framework directly addresses limitations of FrugalGPT's fixed-accuracy metric." 268 }, 269 { 270 "title": "RouteLLM: Learning to route LLMs from preference data", 271 "relevance": "State-of-the-art preference-based router and evaluation baseline; introduces curve-based AUC metric critiqued by this paper." 272 }, 273 { 274 "title": "RouterBench: A benchmark for multi-LLM routing system", 275 "relevance": "Direct predecessor benchmark for LLM routing evaluation; RouterXBench positions itself as more comprehensive." 276 }, 277 { 278 "title": "RouterEval: A comprehensive benchmark for routing LLMs", 279 "relevance": "Contemporary routing benchmark; cited as related benchmarking effort that RouterXBench extends." 280 }, 281 { 282 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 283 "relevance": "Key baseline routing system using fixed-cost metric; represents the static metric paradigm critiqued here." 284 }, 285 { 286 "title": "AutoMix: Automatically Mixing Language Models", 287 "relevance": "Routing baseline using Incremental Benefit per Cost as a single-score metric; represents the static metric approach." 288 }, 289 { 290 "title": "Semantic uncertainty: Linguistic invariances for uncertainty estimation in natural language generation", 291 "relevance": "Provides the SemanticEntropy baseline compared against ProbeDirichlet throughout the experiments." 292 }, 293 { 294 "title": "MMLU: Measuring massive multitask language understanding", 295 "relevance": "Core in-domain benchmark used for both training and evaluation in the RouterXBench framework." 296 } 297 ], 298 "engagement_factors": { 299 "practical_relevance": { 300 "score": 3, 301 "justification": "LLM routing for edge-cloud cost reduction is an immediate deployment concern for practitioners; the framework directly guides router selection in production systems." 302 }, 303 "surprise_contrarian": { 304 "score": 2, 305 "justification": "The finding that data diversity matters more than architecture complexity (linear probe suffices) and that existing evaluation metrics produce misleading rankings challenges common assumptions." 306 }, 307 "fear_safety": { 308 "score": 1, 309 "justification": "Mentions safety-critical applications (healthcare) as a motivating scenario for high-accuracy routing, but does not raise broader AI risk concerns." 310 }, 311 "drama_conflict": { 312 "score": 1, 313 "justification": "Positions against existing evaluation frameworks (FrugalGPT, RouteLLM) but the critique is methodological rather than confrontational." 314 }, 315 "demo_ability": { 316 "score": 2, 317 "justification": "Code is publicly available on GitHub with a fixed seed and documented setup, making reproduction straightforward for practitioners with the required models." 318 }, 319 "brand_recognition": { 320 "score": 1, 321 "justification": "No top-tier lab affiliation; uses GPT-5 (OpenAI) as large model which adds some recognition, but the contributing institutions are not widely recognized in this context." 322 } 323 }, 324 "hn_data": { 325 "threads": [], 326 "top_points": 0, 327 "total_points": 0, 328 "total_comments": 0 329 } 330 }