scan-v4.json (19691B)
1 { 2 "scan_version": 4, 3 "paper_type": "benchmark-creation", 4 "paper": { 5 "title": "Towards Fair and Comprehensive Evaluation of Routers in Collaborative LLM Systems", 6 "authors": [ 7 "Wanxing Wu", 8 "He Zhu", 9 "Yixia Li", 10 "Lei Yang", 11 "Jiehui Zhao", 12 "Hongru Wang", 13 "Jian Yang", 14 "Benyou Wang", 15 "Bingyi Jing", 16 "Guanhua Chen" 17 ], 18 "year": 2026, 19 "venue": "arXiv", 20 "arxiv_id": "2602.11877", 21 "doi": null 22 }, 23 "checklist": { 24 "claims_and_evidence": { 25 "abstract_claims_supported": { 26 "applies": true, 27 "answer": true, 28 "justification": "Abstract claims of '16.68% and 18.86% relative improvements' are supported by Tables 1 and 2. Claims of consistency across model families supported by Table 5. Agent scenario claim supported by Figure 5.", 29 "source": "opus" 30 }, 31 "causal_claims_justified": { 32 "applies": true, 33 "answer": true, 34 "justification": "Causal claims ('data diversity yields additive gains,' 'Dirichlet aggregation prevents overfitting') are supported by controlled ablation studies. Table 6 varies training data composition while holding architecture fixed. Table 3 varies aggregation while holding data fixed. Single-variable manipulation is adequate.", 35 "source": "opus" 36 }, 37 "generalization_bounded": { 38 "applies": true, 39 "answer": false, 40 "justification": "The title claims 'Collaborative LLM Systems' but experiments test only two-model edge-cloud routing with one primary model pair (GPT-5 + Llama-3.1-8B). 
The Limitations section acknowledges 'single small-large model pair' but the title and framing remain broader than the evidence.", 41 "source": "opus" 42 }, 43 "alternative_explanations_discussed": { 44 "applies": true, 45 "answer": true, 46 "justification": "Section 6 discusses why hidden states outperform embeddings (hierarchical vs surface-level information), why linear probes suffice (hidden states encode shared difficulty notion), and alternative data composition effects (interference vs additive gains, Table 6).", 47 "source": "opus" 48 }, 49 "proxy_outcome_distinction": { 50 "applies": true, 51 "answer": true, 52 "justification": "The paper explicitly separates router ability (AUROC, intrinsic discrimination) from end-to-end system accuracy, noting that 'end-to-end accuracy at a given cost reflects both' router skill and model strength (Section 3.2). The framework is designed to disentangle proxy from outcome.", 53 "source": "opus" 54 } 55 }, 56 "limitations_and_scope": { 57 "limitations_section_present": { 58 "applies": true, 59 "answer": true, 60 "justification": "A dedicated 'Limitations' section follows the Conclusion, discussing the assumption that the large model is always better, single model pair constraint, and single-run results.", 61 "source": "opus" 62 }, 63 "threats_to_validity_specific": { 64 "applies": true, 65 "answer": true, 66 "justification": "Specific threats identified: 'both models may perform similarly or converge on the same incorrect answer in certain domains' (with analysis in Appendix D.2), 'single small-large model pair,' and 'single-run results due to computational constraints.' These are specific to this study.", 67 "source": "opus" 68 }, 69 "scope_boundaries_stated": { 70 "applies": true, 71 "answer": true, 72 "justification": "States what was not tested: 'broader validation across diverse architectures, multiple seeds, and more complex OOD conditions would further strengthen the conclusions.' 
Also notes the assumption that large model capability exceeds small model's.", 73 "source": "opus" 74 } 75 }, 76 "conflicts_of_interest": { 77 "funding_disclosed": { 78 "applies": true, 79 "answer": false, 80 "justification": "No funding or acknowledgments section found in the paper. Authors include one from Deepexi Technology Co. Ltd. (industry) and several from universities.", 81 "source": "opus" 82 }, 83 "affiliations_disclosed": { 84 "applies": true, 85 "answer": true, 86 "justification": "Author affiliations are clearly listed: Southern University of Science and Technology, Institut Polytechnique de Paris, Peking University, Deepexi Technology Co. Ltd., University of Edinburgh, Beihang University, Chinese University of Hong Kong (Shenzhen).", 87 "source": "opus" 88 }, 89 "funder_independent_of_outcome": { 90 "applies": true, 91 "answer": false, 92 "justification": "No funding disclosed, so independence cannot be assessed. One co-author is from Deepexi Technology, an industry entity whose interest in the outcome is unknown.", 93 "source": "opus" 94 }, 95 "financial_interests_declared": { 96 "applies": true, 97 "answer": false, 98 "justification": "No competing interests or financial disclosure statement found in the paper.", 99 "source": "opus" 100 } 101 }, 102 "scope_and_framing": { 103 "key_terms_defined": { 104 "applies": true, 105 "answer": true, 106 "justification": "Section 3.1 formally defines the routing problem with equations covering router score, decision threshold, call rate, and system performance. 
AUROC, LPM, HCR, and MPM are all defined with explicit mathematical expressions.", 107 "source": "haiku" 108 }, 109 "intended_contribution_clear": { 110 "applies": true, 111 "answer": true, 112 "justification": "Two contributions are clearly stated throughout: RouterXBench evaluation framework (three-dimensional) and ProbeDirichlet router (hidden-state based with Dirichlet aggregation), each described in dedicated sections.", 113 "source": "haiku" 114 }, 115 "engagement_with_prior_work": { 116 "applies": true, 117 "answer": true, 118 "justification": "Section 2 and Section 3.2 explicitly critique prior work (FrugalGPT, HybridLLM, RouterBench, RouteLLM, AutoMix) by demonstrating specific limitations of their metrics with Figure 1, not merely listing references.", 119 "source": "haiku" 120 } 121 } 122 }, 123 "type_checklist": { 124 "benchmark-creation": { 125 "construct_design": { 126 "construct_validity_argued": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 3.3 explicitly argues that AUROC isolates router discriminative ability from large model capability, and Section 3.2 argues why existing metrics conflate these — a specific, formal construct validity argument with mathematical support.", 130 "source": "haiku" 131 }, 132 "difficulty_distribution_characterized": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper characterizes difficulty only at the dataset level (Alpaca=easy to MATH=hard) but does not formally characterize the difficulty distribution within benchmark items or provide easy/medium/hard tier breakdowns.", 136 "source": "haiku" 137 }, 138 "ceiling_floor_effects_checked": { 139 "applies": true, 140 "answer": false, 141 "justification": "Baseline AUROC scores near 50% (SelfAsk at 49.99 in-domain average) suggest floor effects for weaker methods, but ceiling/floor effects are never explicitly checked or discussed as a benchmark design property.", 142 "source": "haiku" 143 }, 144 
"human_baseline_included": { 145 "applies": true, 146 "answer": false, 147 "justification": "No human baseline is provided anywhere in the paper; the benchmark evaluates automated routing decisions without any human reference point for routing quality.", 148 "source": "haiku" 149 }, 150 "scoring_rubric_justified": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 3.2 provides detailed justification for AUROC over static and curve-based metrics. Section 3.3 defines band parameters for LPM (25-30% call rate) and HCR (85-95% relative performance) with deployment scenario rationale, though specific cutoff values are asserted rather than empirically motivated.", 154 "source": "haiku" 155 } 156 }, 157 "robustness": { 158 "contamination_resistance_designed": { 159 "applies": true, 160 "answer": false, 161 "justification": "The benchmark uses widely known public datasets (MMLU, Alpaca, Big-Math) with no temporal splits, canary strings, or dynamic generation to prevent contamination of router training data from evaluation sets.", 162 "source": "haiku" 163 }, 164 "temporal_robustness_discussed": { 165 "applies": true, 166 "answer": false, 167 "justification": "The paper does not discuss whether RouterXBench will remain discriminative as models improve, whether chosen benchmarks will be saturated, or any plan for updating the evaluation suite over time.", 168 "source": "haiku" 169 }, 170 "failure_modes_discussed": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 3.2 discusses failure modes of the metric design (threshold sensitivity, AUC scenario-blindness). 
Appendix D.2 provides a concrete case study where routing fundamentally fails because both models converge on the same wrong answer.", 174 "source": "haiku" 175 }, 176 "baseline_implementations_provided": { 177 "applies": true, 178 "answer": true, 179 "justification": "Footnote 1 states 'Our code is publicly available at https://github.com/zhuchichi56/RouterXBench,' enabling reproduction of all reported baselines and the ProbeDirichlet router.", 180 "source": "haiku" 181 } 182 }, 183 "documentation": { 184 "dataset_documentation_complete": { 185 "applies": true, 186 "answer": true, 187 "justification": "Appendix B documents all six datasets with Table 7 statistics (domain, train/val/test sizes), ground truth construction methodology (xVerify for reasoning tasks, GPT-5-as-judge for open-ended), and evaluation protocols.", 188 "source": "haiku" 189 }, 190 "licensing_and_access_clear": { 191 "applies": true, 192 "answer": false, 193 "justification": "While code is available on GitHub, the paper states no license for RouterXBench and does not reference the licenses of underlying benchmark datasets (MMLU, Alpaca, etc.) 
or clarify usage terms.", 194 "source": "haiku" 195 }, 196 "intended_use_specified": { 197 "applies": true, 198 "answer": true, 199 "justification": "The paper specifies intended use as evaluating LLM routers in edge-cloud collaborative settings under three deployment scenarios (budget-sensitive, balanced, accuracy-critical), with clear definition of what each metric captures and does not capture.", 200 "source": "haiku" 201 } 202 } 203 } 204 }, 205 "claims": [ 206 { 207 "claim": "ProbeDirichlet achieves 16.68% relative improvement over best baseline in router ability (AUROC) on average across in-domain and OOD benchmarks", 208 "evidence": "Table 1 shows ProbeDirichlet in-domain average AUROC 68.70 vs EmbeddingMLP 59.46 (~15.5% relative), OOD 65.46 vs 55.22 (~18.5% relative); paper reports the average as 16.68%", 209 "supported": "strong" 210 }, 211 { 212 "claim": "ProbeDirichlet achieves 18.86% relative improvement over best baseline in accuracy-critical (HCR) scenarios", 213 "evidence": "Table 2 HCR section: ProbeDirichlet in-domain 18.50 vs SemanticEntropy 15.17 (~21.9%); OOD 15.40 vs 13.35 (~15.4%); reported average improvement 18.86%", 214 "supported": "moderate" 215 }, 216 { 217 "claim": "Internal hidden states significantly outperform output probabilities and external embeddings for routing decisions", 218 "evidence": "Table 4 shows LLM Hidden (71.34 Alpaca, 62.39 BigMath) vs LLM Embedding (62.47, 56.21) vs Longformer (61.95, 43.10) with consistent, substantial margins especially for math tasks", 219 "supported": "strong" 220 }, 221 { 222 "claim": "Data diversity yields additive gains without domain interference in cross-domain generalization", 223 "evidence": "Table 6: adding BigMath to Alpaca training preserves Alpaca AUROC (71.85→71.63) while dramatically improving BigMath (49.19→66.49) and MMLU (49.35→51.06); full mix shows further gains", 224 "supported": "strong" 225 }, 226 { 227 "claim": "Linear probes suffice for routing; MLP layers do not improve 
performance and increase overfitting", 228 "evidence": "Figure 3 shows MLP variants with 16-128 hidden dimensions match or fall below linear probe AUROC (~78%) while widening train-validation loss gaps, with dropout only partially mitigating overfitting", 229 "supported": "strong" 230 }, 231 { 232 "claim": "Existing routing metrics (static thresholds and AUC) are insufficient for fair evaluation due to threshold sensitivity and scenario blindness", 233 "evidence": "Figure 1 illustrative example shows reversed router rankings in call-rate range 20-40%, motivating the triple-perspective framework — but this is a constructed illustration, not an empirical study of evaluation failures", 234 "supported": "moderate" 235 } 236 ], 237 "methodology_tags": [ 238 "benchmark-eval", 239 "empirical" 240 ], 241 "key_findings": "RouterXBench proposes a three-dimensional evaluation framework for LLM routers—router ability (AUROC), scenario alignment (LPM/MPM/HCR), and cross-domain robustness—that disentangles intrinsic routing capability from large model performance, addressing fundamental limitations in existing single-metric evaluations. The ProbeDirichlet router, trained on multi-domain data using internal hidden states with Dirichlet layer aggregation, achieves 16.68% and 18.86% relative improvements over best baselines in router ability and accuracy-critical scenarios. Ablation analyses show that the key driver of performance is using internal hidden states rather than output probabilities or external embeddings, while data diversity is the primary driver of cross-domain generalization rather than architectural complexity. 
The Dirichlet aggregation itself contributes only marginally over simple mean pooling (68.70 vs 68.04 AUROC), with most gains attributable to the hidden-state signal choice rather than the architectural innovation.", 242 "red_flags": [ 243 { 244 "flag": "Single-run results throughout", 245 "detail": "Appendix A states all experiments are conducted with a fixed seed and single run due to computational constraints; no variance estimates or confidence intervals are provided for any comparison, including the marginal Dirichlet vs. MeanPool gap (68.70 vs 68.04 AUROC)." 246 }, 247 { 248 "flag": "GPT-5 serves as both evaluated model and judge", 249 "detail": "GPT-5 is simultaneously the large model being routed to AND the LLM-as-Judge evaluator for ground truth construction on open-ended generation tasks (Appendix B.1), creating circular evaluation where the oracle model defines its own routing ground truth." 250 }, 251 { 252 "flag": "Dirichlet contribution marginal relative to presented novelty", 253 "detail": "Table 3 shows the core architectural innovation (Dirichlet vs. MeanPool aggregation) yields only 0.66 AUROC points (68.70 vs 68.04). The main gains come from using hidden states at all, undercutting the paper's framing of Dirichlet as a key contribution." 254 }, 255 { 256 "flag": "No funding disclosure with commercial co-author", 257 "detail": "No funding acknowledgment despite one co-author from Deepexi Technology Co. Ltd. and no competing interests statement anywhere in the paper." 258 }, 259 { 260 "flag": "No contamination resistance for public benchmark datasets", 261 "detail": "RouterXBench uses MMLU, Alpaca, and Big-Math as both training and evaluation data with no measures to prevent router training data from overlapping with publicly known evaluation sets." 
262 }, 263 { 264 "flag": "No human baseline", 265 "detail": "No human performance baseline is provided for any of the routing tasks, making it impossible to assess whether AUROC differences are practically meaningful or how much headroom exists." 266 } 267 ], 268 "cited_papers": [ 269 { 270 "title": "RouteLLM: Learning to route LLMs from preference data", 271 "relevance": "Direct predecessor benchmark using preference-based routing and curve-based AUC metric explicitly criticized by this paper" 272 }, 273 { 274 "title": "RouterBench: A benchmark for multi-LLM routing system", 275 "relevance": "Prior multi-LLM routing benchmark (ICML 2024 workshop), direct comparison context for RouterXBench" 276 }, 277 { 278 "title": "RouterEval: A comprehensive benchmark for routing LLMs to explore model-level scaling up", 279 "relevance": "Concurrent comprehensive routing benchmark (EMNLP 2025 Findings) cited as prior work that RouterXBench extends" 280 }, 281 { 282 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 283 "relevance": "Key baseline paper for cost-efficient LLM routing using static threshold metrics; paradigmatic example of the limitations RouterXBench addresses" 284 }, 285 { 286 "title": "Hybrid LLM: Cost-efficient and quality-aware query routing", 287 "relevance": "Direct experimental baseline; representative logit-based and cost-quality routing approach" 288 }, 289 { 290 "title": "AutoMix: Automatically mixing language models", 291 "relevance": "Training-free routing baseline using Incremental Benefit per Cost single metric; motivates multi-dimensional evaluation" 292 }, 293 { 294 "title": "Semantic uncertainty: Linguistic invariances for uncertainty estimation in natural language generation", 295 "relevance": "Provides SemanticEntropy baseline method for routing; represents uncertainty-based routing signal category" 296 }, 297 { 298 "title": "INSIDE: LLMs' internal states retain the power of hallucination 
detection", 299 "relevance": "Prior work using LLM internal hidden states for quality assessment, directly motivates the hidden-state routing approach in ProbeDirichlet" 300 } 301 ], 302 "engagement_factors": { 303 "practical_relevance": { 304 "score": 3, 305 "justification": "Cost-accuracy routing is a live operational concern for any organization deploying LLMs; code is publicly available and the framework directly guides deployment decisions." 306 }, 307 "surprise_contrarian": { 308 "score": 1, 309 "justification": "Internal hidden states outperforming output logits aligns with growing literature; the multi-dimensional metric framework is incremental refinement, not a paradigm shift." 310 }, 311 "fear_safety": { 312 "score": 1, 313 "justification": "Healthcare reliability is cited as a motivating scenario but not experimentally studied; routing vulnerability is cited but not addressed." 314 }, 315 "drama_conflict": { 316 "score": 0, 317 "justification": "No controversy or conflict angle; straightforward systems and evaluation paper." 318 }, 319 "demo_ability": { 320 "score": 2, 321 "justification": "Code is publicly available on GitHub with the RouterXBench framework; practitioners could apply ProbeDirichlet to their own Llama or Qwen deployments." 322 }, 323 "brand_recognition": { 324 "score": 1, 325 "justification": "Primarily academic affiliations (SUSTech, Peking University, CUHK-Shenzhen, Beihang); Deepexi Technology is not widely recognized; no major AI lab branding." 326 } 327 }, 328 "hn_data": { 329 "threads": [], 330 "top_points": 0, 331 "total_points": 0, 332 "total_comments": 0 333 } 334 }