scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30862B)
      1 {
      2   "paper": {
      3     "title": "LLMRouterBench: A Massive Benchmark and Unified Framework for LLM Routing",
      4     "authors": [
      5       "Hao Li",
      6       "Yiqun Zhang",
      7       "Zhaoyan Guo",
      8       "Chenxu Wang",
      9       "Shengji Tang",
     10       "Qiaosheng Zhang",
     11       "Yang Chen",
     12       "Biqing Qi",
     13       "Peng Ye",
     14       "Lei Bai",
     15       "Zhen Wang",
     16       "Shuyue Hu"
     17     ],
     18     "year": 2026,
     19     "venue": "arXiv",
     20     "arxiv_id": "2601.07206",
     21     "doi": "10.48550/arXiv.2601.07206"
     22   },
     23   "scan_version": 2,
     24   "active_modules": ["experimental_rigor", "data_leakage"],
     25   "methodology_tags": ["benchmark-eval"],
     26   "key_findings": "LLMRouterBench evaluates 10 routing methods across 21 datasets and 33 models (400K+ instances). Despite ongoing methodological innovation, most routing methods achieve nearly indistinguishable performance under unified evaluation, and several recent methods (including the commercial router OpenRouter) fail to outperform a simple Best Single baseline. A substantial gap to the Oracle persists, driven by model-recall failures on hard queries. Embedding model quality has minimal impact on routing, and expanding model pools yields diminishing returns compared to careful model curation.",
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper states in the abstract: 'All code and data are available at https://github.com/ynulihao/LLMRouterBench.' A working URL is provided."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Same URL provides all data. The abstract states 'All code and data are available.' The benchmark comprises 400K+ instances across 21 datasets and 33 models."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper mentions 'vLLM 0.8.4' and 'NVIDIA A800-80G GPUs' (Section B.1) and DeepSpeed for training, but does not provide a requirements.txt, Dockerfile, or comprehensive environment specification with library versions."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The GitHub repository is provided with all code and data. The paper specifies exact random seeds (42, 999, 2024, 2025, 3407), train/test splits (70/30), embedding models, hyperparameters for each baseline (Appendix B.3), and data collection settings (Appendix B.1). A competent researcher could reproduce results."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Results are averaged over 5 random seeds (Appendix Tables 11-13) but no confidence intervals, error bars, or ± notation are reported. Only point estimates appear in all tables and figures."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper makes comparative claims (e.g., 'many routing approaches achieve broadly comparable performance') but provides no statistical significance tests. Differences between methods are compared by inspection of point estimates only."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper defines and reports Gain@R, Gain@B, Gap@O, PerfGain, and CostSave metrics that contextualize improvements relative to baselines (Section 3.4). For example, 'up to a 4% average accuracy gain over Best Single' and '31.7% cost reduction' (Section 4.2.2)."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No justification is provided for the choice of 5 random seeds, 21 datasets, or 33 models. The scale is described but not justified through power analysis or formal reasoning."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "Results are averaged over 5 runs with different seeds, but no standard deviations, interquartile ranges, or other spread measures are reported in any table or figure. The reader cannot assess result stability."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Three reference baselines are defined: Random Router, Best Single, and Oracle (Section 3.4). Additionally, 10 routing methods are compared against each other."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Baselines include very recent methods (2024-2025): GraphRouter (ICLR 2025), Avengers-Pro (2025), MODEL-SAT (2025), RouteLLM (2024), plus the commercial OpenRouter. Models include GPT-5, Claude-4, Gemini-2.5-Pro."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Embedding model ablation (Table 10) tests 3 different embedding backbones across 3 routing methods. Model pool size ablation (Fig. 6) tests top-k vs. random selection across pool sizes 2-20."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Seven metrics are defined and used: AvgAcc, Gain@R, Gain@B, Gap@O for performance-oriented; PerfGain, CostSave, ParetoDist for performance-cost (Section 3.4)."
     97       },
     98       "human_evaluation": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "The paper evaluates routing methods on standard benchmark accuracy metrics. Human evaluation is irrelevant to the core claims about which router achieves higher automated accuracy scores."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Section B.2: 'All experiments are conducted using a 70% training and 30% test split, repeated five times with different random seeds.' Results are reported on the 30% test split."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Extensive per-dataset breakdowns in Tables 4, 5, 11, 12, plus per-domain analysis (mathematics, code, logical, knowledge, affective) in Fig. 3 and throughout."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Model-recall failures are analyzed in Fig. 5: 'on queries where at most three experts answer correctly (410 queries, 11.9% of test set), Avengers and EmbedLLM achieve low accuracy (24.6% and 23.2%).' OpenRouter's failure is discussed (-24.7% PerfGain)."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Multiple negative findings: routing methods don't differentiate much from each other, OpenRouter fails to beat Best Single, embedding models have limited impact, larger ensembles show diminishing returns, binary routers (HybridLLM, FrugalGPT) struggle on cost savings."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Abstract claims about complementarity (Fig. 3), similar method performance (Fig. 4), Oracle gap (Fig. 4d), embedding impact (Table 10), and diminishing returns (Fig. 6) are all supported by corresponding results sections."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Causal-type claims are backed by controlled experiments: 'embedding models have limited impact' via ablation (Table 10, swapping 3 embedding backbones), 'diminishing returns from adding models' via systematic pool size variation (Fig. 6). The ablation designs are adequate for these claims."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Claims are generally bounded to the tested setting. The paper qualifies with 'under unified evaluation' and 'in practice.' The Limitations section explicitly states untested domains: 'domain-specific verticals, very long-context tasks, and multimodal benchmarks are not included.'"
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper discusses that 'a large fraction of routing gains may be attributable to capturing coarse-grained domain structure rather than learning highly nuanced decision boundaries' (Section 4.2.1), supported by comparison to the Dataset Oracle. Model-recall failure is identified as the primary driver of the Oracle gap."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Claims are stated at the level of benchmark accuracy and cost metrics, matching the granularity of measurements. The paper acknowledges latency as an additional practical dimension not fully captured (Section 4.2.2). No proxy gap exists between what is measured and what is claimed."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "Some models include version dates (deepseek-r1-0528, deepseek-v3-0324, Qwen3-235b-a22b-2507, GLM-Z1-9B-0414) but several flagship models use marketing names without snapshot dates: GPT-5-medium, Claude-sonnet-4, Gemini-2.5-flash, Gemini-2.5-pro (Tables 7-8). Model behavior changes across API versions."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "The paper specifies 0-shot or 3-shot evaluation and mentions 'official prompts' for LLM-as-judge tasks (Table 9) but does not include the actual prompt text used for any benchmark or routing evaluation. The reader cannot reconstruct exact prompts."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "Comprehensive hyperparameters in Appendix B: temperature 0.2, top_p 1.0 for all models (B.1); learning rates, batch sizes, epochs, warmup ratios for each routing method (B.3); k=64 for Avengers clustering; 10,000 epochs for GraphRouter."
    166       },
    167       "scaffolding_described": {
    168         "applies": false,
    169         "answer": false,
    170         "justification": "No agentic scaffolding is used. The routing methods are classification/optimization algorithms, not agent-based systems."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section B.1 documents the data collection pipeline: model deployment on A800 GPUs, API collection via OpenRouter, caching and retry logic (up to 10 retries), failure scoring (0), temperature/sampling settings. The modular Collector-Evaluator-Adaptor pipeline is described in Section 3.3."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "A dedicated 'Limitations' section identifies three specific limitations: incomplete routing method coverage, restricted domain/task coverage, and approximate latency analysis."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Limitations are specific to this study: 'we do not cover all existing approaches' with explanation that they focus on those with public implementations; 'domain-specific verticals, very long-context tasks, and multimodal benchmarks are not included'; latency estimates correspond to 'a specific provider configuration and should be interpreted as indicative.'"
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The paper explicitly states what is not covered: 'domain-specific verticals, very long-context tasks, and multimodal benchmarks, are not included' and 'These estimates correspond to a specific provider configuration and should be interpreted as indicative rather than definitive.'"
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The abstract states 'All code and data are available at https://github.com/ynulihao/LLMRouterBench.' This includes the 400K+ instances with per-prompt, per-model outputs."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Appendix B.1 describes data collection: open-source models deployed on A800 GPUs with vLLM 0.8.4, flagship models collected via OpenRouter and official APIs (GLM-4.6, Intern-S1), temperature 0.2, top_p 1.0, up to 10 retries, failures scored as 0."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No human participants. Data sources are standard benchmarks and LLM API outputs."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The data pipeline is documented through the modular framework (Section 3.3, Fig. 2): Collector handles API calls with caching/retries/cost tracking, Evaluator implements dataset-specific metrics, Adaptor converts to algorithm-specific inputs with consistent train/test splits. Table 2 provides summary statistics."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Acknowledgements section states: 'This work was supported by the Shanghai Municipal Science and Technology Major Project.'"
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Author affiliations are listed: Shanghai Artificial Intelligence Laboratory and Northwestern Polytechnical University. Internship affiliations are also noted."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The Shanghai Municipal Science and Technology Major Project is a government funding source with no financial stake in which routing method performs best."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial disclosure statement is present. Notably, several authors (Yiqun Zhang, Hao Li, Chenxu Wang, Peng Ye, Lei Bai, Shuyue Hu) are also authors of Avengers and Avengers-Pro, methods that perform best in their benchmark. This conflict is not acknowledged."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No training data cutoff dates are stated for any of the 33 models used. The paper evaluates these models on benchmarks without discussing when their training data was collected."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of whether benchmark examples (e.g., HumanEval from 2021, BBH, MMLU-Pro) appeared in the training data of the evaluated models. Some benchmarks predate many models by years."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "Multiple benchmarks (HumanEval 2021, BBH 2022, MMLU-Pro, GPQA) were publicly available before the training cutoff of many evaluated models. Some temporal benchmarks are used (LiveCodeBench, LiveMathBench) but contamination is never explicitly discussed."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants in this study."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": true,
    297         "justification": "Detailed cost analysis in Table 6 (per-model per-dataset costs), Table 13 (per-routing-method costs), and Fig. 8 (cost vs accuracy Pareto frontier). Total API costs stated as $2,771.84 (Table 2)."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Table 2 states: 'A800 1000 GPU hours' for lightweight model inference, '$2,771.84' for API costs including '$500 from LLM-based judging.' Individual routing method parameter counts are reported in B.3."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "Section B.2: 'repeated five times with different random seeds (42, 999, 2024, 2025, and 3407).' Results are averaged over these seeds. However, no per-seed variance is shown."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "Explicitly stated in Section B.2: five runs with different random seeds. Appendix Tables 11-13 note results are 'averaged over five random seeds.'"
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Hyperparameters are specified for each method (Appendix B.3) but no search budget is reported. Some methods have modified hyperparameters from originals (e.g., GraphRouter epochs increased to 10,000, EmbedLLM batch size to 32,768) without justifying how these values were selected."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "For most methods, default or slightly modified configurations from original papers are used without formal justification. For Avengers-Pro, 101 configurations are swept (alpha from 0 to 1 in 0.01 increments), but selection criteria for final comparisons are not discussed."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The paper makes numerous pairwise comparisons across 10 routing methods, 33 models, and 21 datasets. No statistical tests are performed, let alone corrections for multiple comparisons."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Several authors (Yiqun Zhang, Hao Li, Chenxu Wang, Peng Ye, Lei Bai, Shuyue Hu) are authors of Avengers and Avengers-Pro — the methods found to be best-performing (performance-oriented) and Pareto-optimal (cost setting) in the benchmark. MODEL-SAT was re-implemented due to incomplete official code. Neither author-evaluation bias nor re-implementation bias is acknowledged."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "Routing methods differ enormously in size and training cost: RouterDC has ~7B parameters, MODEL-SAT ~14B, EmbedLLM ~12M, GraphRouter ~0.1M, and Avengers requires no training. Performance is not compared at matched training compute budgets."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "The paper selects benchmarks by excluding saturated and trivially-hard datasets, but does not discuss whether benchmark accuracy actually measures real-world routing quality. No formal construct validity analysis is provided."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": false,
    348         "answer": false,
    349         "justification": "No scaffolding is involved. All models are queried directly through APIs with uniform settings. Routing methods are classification algorithms, not scaffolded agents."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "Some temporal benchmarks are included (LiveCodeBench, LiveMathBench) which mitigate leakage, but the paper never explicitly discusses temporal leakage. Older benchmarks (HumanEval 2021, BBH 2022) are used alongside 2025-2026 models without addressing temporal overlap."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "Not discussed. The evaluation setup provides prompts to models; whether these prompts or their structure leaks information relevant to routing is not considered."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "Not discussed. The 70/30 train/test split is applied per-dataset, but whether benchmark examples share structural similarities or come from related sources is not analyzed."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference tests, n-gram overlap analysis, or decontamination pipelines are mentioned."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "Models exhibit clear complementarity — no single model dominates all domains, which validates the central premise of LLM routing.",
    378       "evidence": "Fig. 3 shows different models leading different domains: Intern-S1-mini/Qwen3-8B in math, Fin-R1/Qwen-Coder in code, DS-Qwen3/MiniCPM in logic, Gemma-2-it in affective tasks. Tables 4-5 confirm this across all datasets.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Under unified evaluation, leading routing methods achieve broadly comparable performance — despite continued methodological innovation, there is little practical differentiation.",
    383       "evidence": "Fig. 4 shows EmbedLLM (71.24), GraphRouter (70.29), MODEL-SAT (71.88), and Avengers (71.94) all within ~2 points AvgAcc. Gain@B ranges 3.9-8.3% across all methods. Table 11 confirms across per-dataset results averaged over 5 seeds.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Several recent routing methods, including the commercial router OpenRouter, fail to outperform the Best Single model baseline.",
    388       "evidence": "Fig. 7 shows OpenRouter at -24.7% PerfGain, HybridLLM at -12.7%, FrugalGPT at -9.5% relative to GPT-5. OpenRouter achieves 49.67% AvgAcc vs. GPT-5's 65.96% (Table 12).",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "A substantial performance gap persists relative to the Oracle baseline, driven primarily by model-recall failures on hard queries.",
    393       "evidence": "Fig. 4d shows Gap@O of 20.7-23.7% for top methods. Fig. 5 shows that on queries where at most 3 models answer correctly (410 queries, 11.9%), Avengers achieves 24.6% and EmbedLLM 23.2% routing accuracy.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Backbone embedding models have limited impact on routing performance.",
    398       "evidence": "Table 10 shows replacing gte-qwen2-7B-instruct (3584-dim, 7B params) with nli-bert-base or all-MiniLM-L6-v2 (22.7M params) yields comparable results across GraphRouter, EmbedLLM, and Avengers. Differences are within ~1-3 points.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "Expanding the model ensemble yields diminishing returns, but a well-chosen subset can substantially outperform a larger random pool.",
    403       "evidence": "Fig. 6 shows Oracle AvgAcc plateaus around 8-10 models for random selection. Top-2 selection (Qwen3-8B + NVIDIA-Nemo) matches random pools of 6-8 models. Top-4 selection outperforms larger random pools.",
    404       "supported": "moderate"
    405     },
    406     {
    407       "claim": "Avengers-Pro achieves near Pareto-optimal performance-cost tradeoff, dominating the frontier.",
    408       "evidence": "Fig. 8 shows Avengers-Pro with ParetoDist = 0.001, far below other methods (GraphRouter 0.037-0.087, FrugalGPT 0.152, OpenRouter 0.394). Nearly all Avengers-Pro configurations lie on or near the Pareto frontier.",
    409       "supported": "strong"
    410     }
    411   ],
    412   "red_flags": [
    413     {
    414       "flag": "Authors evaluate their own methods in their own benchmark",
    415       "detail": "Several LLMRouterBench authors (Yiqun Zhang, Hao Li, Chenxu Wang, Peng Ye, Lei Bai, Shuyue Hu) are also authors of Avengers (Zhang et al., 2025d) and Avengers-Pro (Zhang et al., 2025c). These methods are found to be top-performing in both the performance-oriented setting (Avengers: highest AvgAcc among routing methods) and the performance-cost setting (Avengers-Pro: Pareto-optimal with ParetoDist near zero). This conflict is not disclosed or acknowledged."
    416     },
    417     {
    418       "flag": "No variance or uncertainty quantification despite multiple runs",
    419       "detail": "Results are averaged over 5 random seeds but no standard deviations, confidence intervals, or error bars are reported in any table or figure. The reader cannot assess whether the observed differences between methods (often 1-3 points) are within or beyond run-to-run variance."
    420     },
    421     {
    422       "flag": "No contamination analysis for benchmarks predating model training",
    423       "detail": "HumanEval (2021), BBH (2022), MMLU-Pro, and other benchmarks were publicly available years before many evaluated models were trained. No contamination analysis is performed. If models have memorized benchmark answers, the routing evaluation is compromised — a router that learned which model memorized which answers would not generalize to novel queries."
    424     },
    425     {
    426       "flag": "No statistical tests for comparative claims",
    427       "detail": "The paper makes many comparative claims ('methods are broadly comparable', 'embedding models have limited impact') based solely on comparing point estimates. Without significance tests, it is unclear whether observed differences reflect genuine routing quality differences or random variation across seeds."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    433       "authors": ["Carlos E Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik Narasimhan"],
    434       "year": 2023,
    435       "arxiv_id": "2310.06770",
    436       "relevance": "Major code generation benchmark used in the performance-cost evaluation setting; central to evaluating LLM coding capability."
    437     },
    438     {
    439       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    440       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu", "Wei-Lin Chiang", "Tianhao Wu", "Joseph E Gonzalez", "M Waleed Kadous", "Ion Stoica"],
    441       "year": 2024,
    442       "arxiv_id": "2406.18665",
    443       "relevance": "Trains routers from preference data to balance LLM performance and cost; key baseline in the performance-cost routing evaluation."
    444     },
    445     {
    446       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    447       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    448       "year": 2024,
    449       "relevance": "Pioneering work on cascaded LLM inference for cost reduction; baseline in the performance-cost setting."
    450     },
    451     {
    452       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    453       "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang", "Robert Sim"],
    454       "year": 2024,
    455       "arxiv_id": "2404.14618",
    456       "relevance": "Routes queries between small and large LLMs based on predicted difficulty; baseline for performance-cost routing."
    457     },
    458     {
    459       "title": "Evaluating Large Language Models Trained on Code",
    460       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    461       "year": 2021,
    462       "arxiv_id": "2107.03374",
    463       "relevance": "Introduces HumanEval benchmark, one of the core evaluation datasets used in the performance-oriented setting."
    464     },
    465     {
    466       "title": "Humanity's Last Exam",
    467       "authors": ["Long Phan", "Alice Gatti", "Ziwen Han"],
    468       "year": 2025,
    469       "arxiv_id": "2501.14249",
    470       "relevance": "Frontier capability assessment benchmark used in the performance-cost setting to test flagship models on extremely hard questions."
    471     },
    472     {
    473       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    474       "authors": ["Naman Jain", "King Han", "Alex Gu"],
    475       "year": 2024,
    476       "arxiv_id": "2403.07974",
    477       "relevance": "Contamination-free code evaluation benchmark using temporal splits; used in both routing settings."
    478     },
    479     {
    480       "title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion",
    481       "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"],
    482       "year": 2023,
    483       "relevance": "Early work on LLM ensemble selection and output fusion; foundational to the routing research field."
    484     },
    485     {
    486       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    487       "authors": ["Daya Guo", "Dejian Yang", "Haowei Zhang"],
    488       "year": 2025,
    489       "arxiv_id": "2501.12948",
    490       "relevance": "Major open-source reasoning model achieving near-GPT-5 performance at fraction of cost; central to cost-performance routing analysis."
    491     },
    492     {
    493       "title": "τ2-Bench: Evaluating Conversational Agents in a Dual-Control Environment",
    494       "authors": ["Victor Barres", "Honghua Dong", "Soham Ray", "Xujie Si", "Karthik Narasimhan"],
    495       "year": 2025,
    496       "arxiv_id": "2506.07982",
    497       "relevance": "Agentic tool-use benchmark used for latency-aware routing analysis; demonstrates latency as a third routing dimension."
    498     },
    499     {
    500       "title": "GraphRouter: A Graph-based Router for LLM Selections",
    501       "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"],
    502       "year": 2025,
    503       "relevance": "Graph-based approach to LLM routing using heterogeneous task-query-LLM interactions; baseline in both routing settings."
    504     }
    505   ]
    506 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs