scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (20566B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "benchmark-creation",
      4   "paper": {
      5     "title": "LLMRouterBench: A Massive Benchmark and Unified Framework for LLM Routing",
      6     "authors": [
      7       "Hao Li",
      8       "Yiqun Zhang",
      9       "Zhaoyan Guo",
     10       "Chenxu Wang",
     11       "Shengji Tang"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2601.07206",
     16     "doi": "10.48550/arXiv.2601.07206"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All major abstract claims are substantiated: 400K+ instances from 21 datasets and 33 models are documented in Table 2; routing method convergence is shown in Fig 4; OpenRouter underperformance is shown in Fig 7; the Oracle gap driven by model-recall failures is shown in Fig 5.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about embedding model impact are supported by ablation (Table 10, three embedding alternatives tested); diminishing returns from ensemble size are tested via Oracle-controlled subset experiments (Fig 6) with random vs. top-k comparison as a direct counterfactual.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The Limitations section explicitly states results do not generalize to domain-specific verticals, very long-context tasks, or multimodal benchmarks, and that only routing methods with public open-source implementations are covered.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper offers an alternative explanation for routing method convergence—that gains may be attributable to capturing coarse domain structure (math vs. code) rather than nuanced decision boundaries—evidenced by proximity to the Dataset Oracle in Fig 4.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper clearly distinguishes AvgAcc (what is measured) from Oracle performance (the ceiling), explicitly frames Gain@B and Gap@O as distinct quantities, and defines PerfGain and CostSave in terms of the Best Single baseline to clarify what improvement claims mean.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "A dedicated 'Limitations' section is present listing three specific limitations: coverage of routing methods, dataset domain coverage, and the approximate nature of latency analysis.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Limitations are specific: (1) only routing methods with open-source implementations are covered; (2) domain-specific verticals, long-context, and multimodal settings are excluded by name; (3) latency estimates rely on token-level statistics from OpenRouter and are explicitly noted as 'indicative rather than definitive.'",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "The Limitations section explicitly names excluded settings (domain-specific verticals, very long-context, multimodal) and notes that benchmark formulations 'can be applied to these settings by adding new dataset evaluators,' implicitly distinguishing what is and is not currently supported.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The Acknowledgements section states the work was supported by the Shanghai Municipal Science and Technology Major Project.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "All authors are affiliated with Shanghai Artificial Intelligence Laboratory; footnotes additionally note that Li, Guo, and Wang are affiliated with Northwestern Polytechnical University and that four authors were interns at Shanghai AI Lab.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The funder is a government science/technology grant with no direct interest in any particular routing method or LLM provider evaluated; no authors are employees of the evaluated commercial services (OpenAI, Anthropic, Google, OpenRouter).",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "There is no competing interests statement or declaration of financial interests such as patents, equity, or consulting relationships anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "LLM routing is defined precisely in the opening sentence; performance-oriented vs. performance-cost tradeoff paradigms are explicitly distinguished; all evaluation metrics (AvgAcc, Gain@R, Gain@B, Gap@O, PerfGain, CostSave, ParetoDist) are given formal mathematical definitions in Section 3.4.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 1 ends with a bulleted list of three explicit contributions: large-scale timely data, an open-source unified routing framework with 10 baselines, and a systematic re-examination of the routing landscape.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Table 1 directly compares LLMRouterBench against five prior benchmarks on seven dimensions; the Related Work section covers both routing paradigms with citations to ten prior methods and explains how this work extends or corrects prior limitations.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "benchmark-creation": {
    120       "construct_design": {
    121         "construct_validity_argued": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper argues that routing capability requires diverse task coverage spanning mathematics, code, logic, knowledge, affective, instruction following, and tool use; dataset exclusion criteria (no saturated or uniformly-poor datasets) are explicitly stated to ensure that the benchmark discriminates routing quality.",
    125           "source": "haiku"
    126         },
    127         "difficulty_distribution_characterized": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "Fig 5 explicitly characterizes query hardness by the number of models that answer correctly, showing a distribution from 1 to all 20 correct; the paper explicitly excludes saturated datasets (all models high) and uniformly difficult datasets (all models low) for each model pool.",
    131           "source": "haiku"
    132         },
    133         "ceiling_floor_effects_checked": {
    134           "applies": true,
    135           "answer": true,
    136           "justification": "Section 3.2 explicitly states saturated datasets are excluded for flagship models and excessively challenging datasets where all models perform uniformly poorly are excluded for lightweight models—a direct procedural check for ceiling and floor effects.",
    137           "source": "haiku"
    138         },
    139         "human_baseline_included": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No human baseline performance is reported for any dataset; the only baselines are Random Router, Best Single, and Oracle (all model-based).",
    143           "source": "haiku"
    144         },
    145         "scoring_rubric_justified": {
    146           "applies": true,
    147           "answer": true,
    148           "justification": "Each metric is mathematically defined and justified: Gain@B measures improvement over the best single model, Gap@O measures distance to the theoretical ceiling, and ParetoDist captures multi-objective optimality; the paper explains why these reference points are meaningful rather than arbitrary.",
    149           "source": "haiku"
    150         }
    151       },
    152       "robustness": {
    153         "contamination_resistance_designed": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The benchmark includes several older datasets (HumanEval, MBPP, BBH, MMLU-Pro) that are likely contaminated in modern LLM training sets; while some live benchmarks (LiveCodeBench, LiveMathBench) provide temporal freshness, contamination resistance is not systematically addressed as a design principle.",
    157           "source": "haiku"
    158         },
    159         "temporal_robustness_discussed": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "The paper does not discuss how quickly models will saturate the benchmark, whether the fixed 33-model pool will become outdated, or what the update strategy is; temporal robustness is entirely absent from the Limitations and Conclusions sections.",
    163           "source": "haiku"
    164         },
    165         "failure_modes_discussed": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The Limitations section covers coverage gaps (missing routing methods, missing task types) but does not discuss failure modes of the benchmark itself—such as routing methods overfitting to the fixed 70/30 train/test split, LLM-as-judge bias in three datasets, or the benchmark being gamed by embedding the evaluation datasets in model pretraining.",
    169           "source": "haiku"
    170         },
    171         "baseline_implementations_provided": {
    172           "applies": true,
    173           "answer": true,
    174           "justification": "All code and data are released at GitHub; Appendix B provides detailed implementation specifications for all 10 baselines including hyperparameters, embedding models, hardware, and 5-seed averaging; the framework uses modular adapters for plug-and-play reproduction.",
    175           "source": "haiku"
    176         }
    177       },
    178       "documentation": {
    179         "dataset_documentation_complete": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Table 9 documents all 21 datasets with abbreviations, categories, evaluation metrics, and sizes; Appendix B.1 describes collection methodology including inference hardware (A800-80G, vLLM 0.8.4), API providers, temperature/sampling settings, and retry policies; Tables 7 and 8 document all 33 models with parameters and costs.",
    183           "source": "haiku"
    184         },
    185         "licensing_and_access_clear": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "The paper states 'All datasets and models used in this paper are publicly available and properly cited. Our usage complies with their original licenses and intended research purposes,' and provides a public GitHub repository for code and data access.",
    189           "source": "haiku"
    190         },
    191         "intended_use_specified": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "The paper specifies what the benchmark enables (fair comparison, routing development, systematic re-evaluation) but does not state what conclusions should NOT be drawn—for example, it does not caution against using the benchmark as the sole basis for selecting a production router or against assuming results generalize to uncovered domains.",
    195           "source": "haiku"
    196         }
    197       }
    198     }
    199   },
    200   "claims": [
    201     {
    202       "claim": "Most routing methods exhibit similar performance under unified evaluation despite ongoing methodological innovation.",
    203       "evidence": "Fig 4 shows EmbedLLM (70.29%), GraphRouter (71.88%), MODEL-SAT (71.94%), and Avengers (73.10%) are within ~2.8pp on AvgAcc, with similar convergence across the Gain@R, Gain@B, and Gap@O metrics averaged over 5 seeds.",
    204       "supported": "strong"
    205     },
    206     {
    207       "claim": "Several recent routing approaches, including the commercial router OpenRouter, fail to reliably outperform the Best Single baseline.",
    208       "evidence": "Fig 7 shows OpenRouter achieves -24.7% PerfGain relative to Best Single (GPT-5); HybridLLM and FrugalGPT are marked N/A for CostSave (fail to match Best Single accuracy at lower cost); Table 12 confirms OpenRouter AvgAcc of 49.67 vs. GPT-5's 65.96.",
    209       "supported": "strong"
    210     },
    211     {
    212       "claim": "A substantial performance gap to Oracle remains, driven primarily by model-recall failures on queries where only one model answers correctly.",
    213       "evidence": "Fig 4(d) shows Gap@O of 20.7-33.3% across all routers; Fig 5 shows that on 410 queries (11.9% of test set) where ≤3 models are correct, Avengers and EmbedLLM achieve only 24.6% and 23.2% accuracy respectively.",
    214       "supported": "strong"
    215     },
    216     {
    217       "claim": "Backbone embedding model choice has limited impact on routing performance.",
    218       "evidence": "Table 10 shows that replacing gte-qwen2-7B-instruct with much weaker all-MiniLM-L6-v2 (22.7M params) changes Avengers AvgAcc by only 0.91pp (71.94 vs. 71.03), GraphRouter by 2.24pp, and EmbedLLM by 0.29pp.",
    219       "supported": "moderate"
    220     },
    221     {
    222       "claim": "Expanding the model ensemble yields diminishing returns; a carefully selected small subset can outperform a larger random pool.",
    223       "evidence": "Fig 6 shows Oracle AvgAcc rising steeply from 2 to ~8 models then plateauing through 20; top-2 curated models (Qwen3-8B, NVIDIA-Nemo) outperform random k-model selections at k=6-20.",
    224       "supported": "strong"
    225     },
    226     {
    227       "claim": "Avengers-Pro achieves near-Pareto-optimal performance-cost tradeoff among all evaluated routing methods.",
    228       "evidence": "Fig 8 shows Avengers-Pro ParetoDist of 0.001 (near-zero) while all other methods range from 0.037 (RouteLLM) to 0.394 (OpenRouter); Fig 8 shows Avengers-Pro configurations spanning the empirical Pareto frontier.",
    229       "supported": "strong"
    230     }
    231   ],
    232   "methodology_tags": [
    233     "benchmark-eval",
    234     "observational"
    235   ],
    236   "key_findings": "LLMRouterBench, comprising 400K+ instances across 21 datasets and 33 models, reveals that under unified evaluation most routing methods collapse to similar performance despite methodological diversity, and several recent approaches including the commercial router OpenRouter fail to outperform simply using the best single model. A large Oracle gap persists (20-33%), driven primarily by model-recall failures on hard queries where only one candidate model produces a correct answer. Common assumptions are empirically refuted: embedding model choice has only limited routing impact, and expanding ensemble size yields diminishing returns while careful model curation provides more benefit.",
    237   "red_flags": [
    238     {
    239       "flag": "No human baseline",
    240       "detail": "All comparisons are model-to-model; there is no human performance reported for any of the 21 datasets, making it impossible to contextualize whether the Oracle gap represents a fundamental routing difficulty or a tractable human-level task."
    241     },
    242     {
    243       "flag": "Contamination not addressed for older benchmarks",
    244       "detail": "Many included datasets (HumanEval, MBPP, BBH, MMLU-Pro, GPQA) are widely used and likely present in modern LLM pretraining data, but the paper does not discuss or test for contamination effects on routing outcomes."
    245     },
    246     {
    247       "flag": "No competing interests statement",
    248       "detail": "Despite evaluating commercial APIs (GPT-5, Claude, Gemini, OpenRouter) and spending $2,771 in API costs, no competing interests statement is provided."
    249     },
    250     {
    251       "flag": "LLM-as-judge bias unaddressed",
    252       "detail": "Three datasets (HLE, SimpleQA, ArenaHard) use LLM judges (o3-mini, DeepSeek-V3); the paper does not discuss how judge model bias or judge-model correlation might favor certain frontier models in these datasets."
    253     },
    254     {
    255       "flag": "Temporal robustness absent",
    256       "detail": "The paper presents a static benchmark with a fixed 33-model pool and 21 datasets with no discussion of update strategy or how quickly routing methods will overfit once the benchmark becomes standard."
    257     }
    258   ],
    259   "cited_papers": [
    260     {
    261       "title": "RouterBench: A Benchmark for Multi-LLM Routing System",
    262       "relevance": "Direct predecessor benchmark for multi-LLM routing, used as primary comparison baseline in Table 1."
    263     },
    264     {
    265       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    266       "relevance": "Performance-cost routing baseline trained on preference data, evaluated and compared in experiments."
    267     },
    268     {
    269       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    270       "relevance": "Foundational cascaded inference routing method for performance-cost tradeoff, directly evaluated as baseline."
    271     },
    272     {
    273       "title": "HybridLLM: Cost-Efficient and Quality-Aware Query Routing",
    274       "relevance": "Binary routing method between small and large models, evaluated as performance-cost baseline."
    275     },
    276     {
    277       "title": "EmbedLLM: Learning Compact Representations of Large Language Models",
    278       "relevance": "Embedding-based routing baseline evaluated in performance-oriented setting; used for embedding ablation."
    279     },
    280     {
    281       "title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion",
    282       "relevance": "Early performance-oriented routing method that motivated the field; cited as foundational work."
    283     },
    284     {
    285       "title": "GraphRouter: A Graph-Based Router for LLM Selections",
    286       "relevance": "Graph neural network routing baseline evaluated in both performance and performance-cost settings."
    287     },
    288     {
    289       "title": "RouterEval: A Comprehensive Benchmark for Routing LLMs to Explore Model-Level Scaling Up in LLMs",
    290       "relevance": "Competing routing benchmark compared in Table 1, providing context for LLMRouterBench's improvements."
    291     },
    292     {
    293       "title": "The Avengers: A Simple Recipe for Uniting Smaller Language Models to Challenge Proprietary Giants",
    294       "relevance": "Clustering-based routing method that achieves top performance in performance-oriented setting without neural training."
    295     },
    296     {
    297       "title": "RouterArena: An Open Platform for Comprehensive Comparison of LLM Routers",
    298       "relevance": "Recent routing benchmark that treats routing as a black box, cited as a limitation motivating per-prompt transparency."
    299     }
    300   ],
    301   "engagement_factors": {
    302     "practical_relevance": {
    303       "score": 3,
    304       "justification": "LLM routing directly addresses production cost and performance tradeoffs; the finding that commercial routers underperform simple baselines is immediately actionable for practitioners."
    305     },
    306     "surprise_contrarian": {
    307       "score": 3,
    308       "justification": "The core finding that methodological innovation has not produced differentiated routing methods, and that OpenRouter performs worse than the best single model, directly challenges the field's progress narrative."
    309     },
    310     "fear_safety": {
    311       "score": 0,
    312       "justification": "Paper raises no AI safety or risk concerns; it is purely about performance and cost optimization of LLM routing."
    313     },
    314     "drama_conflict": {
    315       "score": 2,
    316       "justification": "Showing that a widely used commercial router (OpenRouter) performs 24.7% worse than the Best Single model is mildly confrontational and newsworthy for the industry."
    317     },
    318     "demo_ability": {
    319       "score": 3,
    320       "justification": "All code and data are publicly released at GitHub with detailed implementation instructions, 10 plug-and-play baselines, and modular adapters for new routing methods."
    321     },
    322     "brand_recognition": {
    323       "score": 1,
    324       "justification": "Shanghai Artificial Intelligence Laboratory is a well-funded national lab but lacks the global brand recognition of Anthropic, Google DeepMind, or OpenAI in the ML research community."
    325     }
    326   },
    327   "hn_data": {
    328     "threads": [],
    329     "top_points": 0,
    330     "total_points": 0,
    331     "total_comments": 0
    332   }
    333 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs