scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (18299B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "Towards Fair and Comprehensive Evaluation of Routers in Collaborative LLM Systems",
      6     "authors": [
      7       "Wanxin Wu",
      8       "He Zhu",
      9       "Yixia Li",
     10       "Lei Yang",
     11       "Jie Zhao"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv",
     15     "arxiv_id": "2602.11877",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The 16.68% and 18.86% relative improvement claims are supported by Tables 1 and 2; generalization across model families is supported by Table 5 and Figure 5.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims like 'diverse training improves robustness' and 'hidden states outperform output-based signals' are supported by controlled ablation studies (Table 3, Table 4, Table 6) that vary one factor at a time.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The title claims 'fair and comprehensive evaluation' but the main experiments use a single small-large model pair (Llama-3.1-8B + GPT-5); the limitations section acknowledges this but the abstract still claims generality across diverse settings.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper attributes hidden-state superiority solely to capturing pre-generation uncertainty, but does not discuss alternative explanations such as the advantage being due to model-specific probing rather than a general principle.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper explicitly addresses the conflation problem — separating intrinsic router ability (AUROC) from end-to-end system performance, which is the core methodological contribution of the framework.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section appears before the References, discussing the single model-pair constraint and single-run reporting.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Specific threats noted include single small-large model pair, single random seed (seed=42), single-run results due to compute, and the model-convergence failure mode analyzed in Appendix D.2.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The limitations section explicitly states conclusions are bounded to a single model pair and warns that 'broader validation across diverse architectures, multiple seeds, and more complex OOD conditions would further strengthen the conclusions.'",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment section appears anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed in the header: Southern University of Science and Technology, Institut Polytechnique de Paris, Peking University, Deepexi Technology Co. Ltd., University of Edinburgh, Beihang University, and Chinese University of Hong Kong.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding source is disclosed, making this criterion not applicable.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or declaration of financial interests appears in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined with mathematical precision: router, AUROC, LPM, MPM, HCR, in-distribution, out-of-distribution, and the edge-cloud collaboration setting are all explicitly defined in Section 3.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper clearly states two contributions: (1) RouterXBench evaluation framework with three-dimensional metrics, and (2) ProbeDirichlet router using internal hidden states with Dirichlet aggregation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The related work section covers LLM routing, LLM collaboration, and uncertainty estimation, and Section 3.2 specifically analyzes limitations of prior metrics (FrugalGPT, HybridLLM, RouteLLM) as motivation.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper explicitly argues that AUROC measures routing ability independent of the large model's strength, and that LPM/MPM/HCR measure scenario-specific alignment — addressing the conflation problem in existing metrics with mathematical formalization.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The paper mentions a 'difficulty gradient' conceptually ('simpler benchmarks such as Alpaca, Magpie, to more challenging ones like MMLU, Big-Math, and MATH') but provides no quantitative characterization of difficulty distribution within or across benchmarks.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper does not explicitly check for ceiling or floor effects; AUROC values ranging from ~47% to ~74% suggest adequate discrimination, but this is never explicitly verified or discussed.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human baseline is included or discussed for any of the six benchmarks used in the evaluation framework.",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "AUROC is well-justified as threshold-independent, but the specific thresholds for scenario alignment (25-30% call rate for LPM, 85-95% relative performance for HCR) are stated as deployment scenarios without principled empirical or domain-specific justification.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "RouterXBench builds on existing public benchmarks (MMLU, Alpaca, etc.) without any contamination resistance measures such as temporal splits, canary strings, or dynamic generation.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper does not discuss whether the benchmark will remain useful as models improve or whether existing benchmarks will be saturated, nor is there a plan for updates.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": true,
    168           "justification": "Appendix D.2 provides a case study where routing fails because both small and large models converge on the same wrong answer, and the limitations section discusses this as a fundamental gap in routing frameworks.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "Code is publicly available at https://github.com/zhuchichi56/RouterXBench, and Appendix A and B provide implementation details including fixed random seed, training setup, and data preparation.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "Table 7 provides basic statistics for the six datasets, and Appendix B describes data preparation, but there is no formal data card; ground-truth construction methodology using xVerify and GPT-5-as-Judge is described but not validated.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "A GitHub link is provided for the code but no license is specified; the underlying datasets are public but their licenses are not discussed in the context of the framework.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The three-dimensional evaluation structure (router ability vs. scenario alignment vs. cross-domain robustness) makes clear what should and should not be concluded from each metric, and the limitations section specifies what the benchmark does not cover.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "ProbeDirichlet achieves 16.68% relative improvement over the best baseline in router ability (AUROC) on in-domain and OOD scenarios.",
    203       "evidence": "Table 1 shows ProbeDirichlet AUROC averages of 68.70 (in-domain) and 65.46 (OOD) vs. EmbeddingMLP at 59.46 and 55.22.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "ProbeDirichlet achieves 18.86% relative improvement in high-accuracy (HCR) scenarios over the best baseline.",
    208       "evidence": "Table 2 HCR rows show ProbeDirichlet averages of 18.50 (in-domain) and 15.40 (OOD) vs. SemanticEntropy at 15.17 and 13.35.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "Internal hidden states outperform both output-based (logit/verbose) and external embedding-based routing signals.",
    213       "evidence": "Table 4 directly compares Longformer embeddings, LLM embeddings, and LLM hidden states using identical linear classifiers, showing hidden states outperform by large margins particularly on math tasks.",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "Diverse multi-domain training yields additive gains without interference between domains.",
    218       "evidence": "Table 6 shows that adding BigMath training preserves Alpaca performance (71.85→71.96) while improving BigMath (49.19→66.49) and OOD tasks.",
    219       "supported": "strong"
    220     },
    221     {
    222       "claim": "Linear probe architecture is sufficient; adding hidden layers degrades generalization without improving performance.",
    223       "evidence": "Figure 3 shows MLP variants with 16-128 hidden dimensions do not improve AUROC over the linear baseline but exhibit larger train-validation loss gaps.",
    224       "supported": "moderate"
    225     },
    226     {
    227       "claim": "Existing routing metrics (static and curve-based) are inadequate for fair comparison because they conflate router ability with large model strength.",
    228       "evidence": "Figure 1 (right) illustrates that router rankings reverse under small threshold shifts, but this illustration uses a stylized example rather than real experimental data.",
    229       "supported": "moderate"
    230     },
    231     {
    232       "claim": "ProbeDirichlet generalizes across Llama and Qwen model families with consistent improvements over EmbeddingMLP baselines.",
    233       "evidence": "Table 5 shows ProbeDirichlet outperforms EmbeddingMLP across Llama-3.1-8B and Qwen2.5 (0.5B, 3B, 7B), with average improvements of ~10.5% in-domain and ~9.6% OOD.",
    234       "supported": "moderate"
    235     }
    236   ],
    237   "methodology_tags": [
    238     "benchmark-eval",
    239     "observational"
    240   ],
    241   "key_findings": "RouterXBench proposes a three-dimensional evaluation framework (router ability via AUROC, scenario alignment via LPM/MPM/HCR, cross-domain robustness) that disentangles intrinsic routing ability from end-to-end system performance, exposing limitations in prior single-metric evaluations. ProbeDirichlet, a lightweight router using internal hidden-state representations aggregated via Dirichlet distributions, achieves 16.68% and 18.86% relative improvements over the best baselines in router ability and high-accuracy scenarios. The primary driver of generalization is training data diversity rather than architectural complexity: diverse multi-domain training yields additive gains across domains without interference. Single-run experiments on one small-large model pair (Llama-3.1-8B + GPT-5) limit the strength of these conclusions.",
    242   "red_flags": [
    243     {
    244       "flag": "single model pair",
    245       "detail": "All main experiments use only Llama-3.1-8B as the small model and GPT-5 as the large model; the framework's claims about generality rest on a single routing configuration."
    246     },
    247     {
    248       "flag": "single run, no variance",
    249       "detail": "Appendix A explicitly states 'we report single-run results for all experiments' with a fixed seed=42, providing no statistical significance estimates or confidence intervals for any reported improvement."
    250     },
    251     {
    252       "flag": "circular evaluation: GPT-5 as both large model and judge",
    253       "detail": "GPT-5 serves as the large model being routed to AND as the LLM-as-a-Judge evaluator for open-ended tasks (Alpaca, Magpie). Footnote 3 acknowledges this but dismisses it, creating potential circular bias in ground-truth label construction."
    254     },
    255     {
    256       "flag": "arbitrary scenario thresholds",
    257       "detail": "The thresholds defining scenario alignment (25-30% call rate for LPM, 85-95% relative performance for HCR) are presented as deployment scenarios but are chosen without empirical or domain-specific justification."
    258     },
    259     {
    260       "flag": "no license for released code",
    261       "detail": "The GitHub repository is mentioned but no software license is specified, limiting the clarity of reuse rights."
    262     }
    263   ],
    264   "cited_papers": [
    265     {
    266       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    267       "relevance": "Foundational paper on cost-aware LLM routing; RouterXBench's framework directly addresses limitations of FrugalGPT's fixed-accuracy metric."
    268     },
    269     {
    270       "title": "RouteLLM: Learning to route LLMs from preference data",
    271       "relevance": "State-of-the-art preference-based router and evaluation baseline; introduces curve-based AUC metric critiqued by this paper."
    272     },
    273     {
    274       "title": "RouterBench: A benchmark for multi-LLM routing system",
    275       "relevance": "Direct predecessor benchmark for LLM routing evaluation; RouterXBench positions itself as more comprehensive."
    276     },
    277     {
    278       "title": "RouterEval: A comprehensive benchmark for routing LLMs",
    279       "relevance": "Contemporary routing benchmark; cited as related benchmarking effort that RouterXBench extends."
    280     },
    281     {
    282       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    283       "relevance": "Key baseline routing system using fixed-cost metric; represents the static metric paradigm critiqued here."
    284     },
    285     {
    286       "title": "AutoMix: Automatically Mixing Language Models",
    287       "relevance": "Routing baseline using Incremental Benefit per Cost as a single-score metric; represents the static metric approach."
    288     },
    289     {
    290       "title": "Semantic uncertainty: Linguistic invariances for uncertainty estimation in natural language generation",
    291       "relevance": "Provides the SemanticEntropy baseline compared against ProbeDirichlet throughout the experiments."
    292     },
    293     {
    294       "title": "MMLU: Measuring massive multitask language understanding",
    295       "relevance": "Core in-domain benchmark used for both training and evaluation in the RouterXBench framework."
    296     }
    297   ],
    298   "engagement_factors": {
    299     "practical_relevance": {
    300       "score": 3,
    301       "justification": "LLM routing for edge-cloud cost reduction is an immediate deployment concern for practitioners; the framework directly guides router selection in production systems."
    302     },
    303     "surprise_contrarian": {
    304       "score": 2,
    305       "justification": "The finding that data diversity matters more than architecture complexity (linear probe suffices) and that existing evaluation metrics produce misleading rankings challenges common assumptions."
    306     },
    307     "fear_safety": {
    308       "score": 1,
    309       "justification": "Mentions safety-critical applications (healthcare) as a motivating scenario for high-accuracy routing, but does not raise broader AI risk concerns."
    310     },
    311     "drama_conflict": {
    312       "score": 1,
    313       "justification": "Positions against existing evaluation frameworks (FrugalGPT, RouteLLM) but the critique is methodological rather than confrontational."
    314     },
    315     "demo_ability": {
    316       "score": 2,
    317       "justification": "Code is publicly available on GitHub with a fixed seed and documented setup, making reproduction straightforward for practitioners with the required models."
    318     },
    319     "brand_recognition": {
    320       "score": 1,
    321       "justification": "No top-tier lab affiliation; the use of GPT-5 (OpenAI) as the large model adds some recognition, but the contributing institutions are not widely recognized in this context."
    322     }
    323   },
    324   "hn_data": {
    325     "threads": [],
    326     "top_points": 0,
    327     "total_points": 0,
    328     "total_comments": 0
    329   }
    330 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs