scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29213B)
      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "PROTEUS: SLA-Aware Routing via Lagrangian RL for Multi-LLM Serving Systems",
      6     "authors": ["Amit Singh Bhatti", "Vishal Vaddina", "Dagnachew Birru"],
      7     "year": 2026,
      8     "venue": "arXiv.org",
      9     "arxiv_id": "2601.19402",
     10     "doi": "10.48550/arXiv.2601.19402"
     11   },
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "PROTEUS accepts accuracy targets τ as runtime input and achieves 100% floor compliance across evaluated τ values on RouterBench (405K queries, 11 models) and SPROUT (45K queries, 14 models), with τ-µ correlation of 0.97–0.98. A single trained model serves the full accuracy spectrum (τ ∈ [0.85, 0.95]) without retraining, achieving 90.1% accuracy on RouterBench (1.3pp below oracle) and 94.0% on SPROUT (4.6pp below oracle) with cost savings up to 89.8% versus the best fixed model. The closest baseline (OmniRouter) achieves only 22% floor compliance despite also using Lagrangian optimization.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No source code repository, GitHub link, or archive is provided anywhere in the paper. The implementation is described in detail (Section 3.1) but no code is released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses two publicly available benchmark datasets: RouterBench (Hu et al., 2024) and SPROUT (Somerstep et al., 2025), both released by their respective authors."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions PyTorch, HuggingFace Transformers, DeBERTa-v3-small, and A100 GPU, but provides no requirements.txt, Dockerfile, or detailed dependency versions sufficient to recreate the environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method description is detailed but there are no explicit commands or procedures to reproduce the main experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables 1 and 2 report only point estimates (e.g., '90.1% accuracy', '0.973 correlation') with no confidence intervals, error bars, or ± notation."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims PROTEUS outperforms all baselines (Table 2) and that ablation components matter (Table 4) based solely on comparing point estimates. No statistical significance tests (p-values, t-tests, bootstrap) are reported."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Effect sizes are reported with baseline context throughout: '1.3 percentage points below oracle' (Section 4.2), 'cost savings of 90% versus GPT-4' (Section 4.2), ablation drops of '−0.27%' and '−0.53%' (Table 4), and cost ranges per tier (Table 1)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why 405K (RouterBench) and 45K (SPROUT) queries are sufficient, no power analysis, and no discussion of whether the sample supports the precision of claims made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The dynamic adaptation experiment uses '5 random seeds' (Section 4.1) but reports only aggregate floor satisfaction percentages without standard deviation or spread. Main results in Tables 1 and 2 show single-run numbers with no variance measures."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Extensive baselines are included: 3 static (Random, Cheapest, Best Fixed), 4 learned routers (KNN, MLP, CARROT at 3 configurations, OmniRouter), and 3 ablated variants (Section 3.1)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include CARROT (Somerstep et al., 2025), OmniRouter (Mei et al., 2025), and RouteLLM (Ong et al., 2024), all from within 1-2 years of this work."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 4 presents a systematic ablation removing three components: Lagrangian constraint (λ), learnable γ, and critic network, with accuracy and cost impact on both datasets."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: accuracy, cost, τ-µ correlation, SLA compliance (floor and tolerance-band), Routing Efficiency (RE), Routing Performance Index (RPI), and latency (Table 3)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All evaluation is automated using benchmark correctness labels. No human evaluation of routing quality, user satisfaction, or real-world quality of selected model responses is included."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper uses '70/15/15 train/val/test splits for RouterBench and official splits for SPROUT' (Section 3.1) and reports results on held-out test data."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 1 provides per-tier breakdowns (Economy/Standard/Premium) with accuracy and cost for each tier on both datasets. Table 2 reports per-dataset results for all methods."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No specific routing failure examples, error analysis, or qualitative discussion of where routing decisions go wrong. The paper reports ±2% tolerance compliance is only 44%/11% (Table 1) but does not analyze why or show individual failure cases."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 4 reports the critic provides negligible benefit (<0.05% accuracy change), effectively a negative result on that component's value. SLA compliance at ±2% tolerance is low (44%/11%), and dynamic floor satisfaction is only 77-86% (Section 4.1)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "All abstract claims are verified: 100% floor compliance (Table 1), τ-µ correlation 0.973/0.981 (Table 1), OmniRouter 22% compliance (Section 4.1/Figure 2a), 90.1%/94.0% accuracy (Table 2), 89.8% cost savings (Table 2 derived: $3.3 → $0.33)."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study (Table 4) makes causal claims ('removing λ causes accuracy drops') through controlled single-variable manipulation, which is adequate for causal inference about component contributions."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The limitations section explicitly bounds generalization: 'RouterBench's model pool includes... all of which predate current frontier models,' 'SPROUT covers narrower task distributions,' 'production systems lack such labels by default,' and 'model costs c_i are fixed at training time' (Section 5)."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not consider alternative explanations for PROTEUS's performance advantages. No discussion of whether results could be explained by benchmark-specific artifacts, encoder choice effects, or training data characteristics rather than the Lagrangian constraint mechanism."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper acknowledges the gap between benchmark accuracy and production quality: 'PROTEUS trains on ground-truth correctness labels... but production systems lack such labels by default, so deployment would require periodic sampling with human annotation or LLM-as-judge evaluation' (Section 5)."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "DeBERTa-v3-small is specified precisely, but the LLM pool models are referred to by marketing names only: 'GPT-3.5, GPT-4, Claude-2' (RouterBench) and 'GPT-4o, Claude-3.5-Sonnet, o3-mini' (SPROUT) without exact version identifiers or snapshot dates."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "The paper does not use prompting. PROTEUS encodes queries via DeBERTa-v3-small embeddings and routes using a learned policy. No prompts are sent to any LLM as part of the method."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Hyperparameters are reported in detail: batch size 32, AdamW lr=3×10⁻⁴, 10K steps, η_λ=0.4, gradient clipping max norm 1.0, γ learnable in [2,8], dual updates every 5 batches, 256-dim embeddings, 2-layer MLP policy (Sections 2 and 3.1)."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. PROTEUS is a routing layer that selects among models, not an agent with tools, memory, or iterative execution."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Data splits are stated ('70/15/15 train/val/test' for RouterBench, 'official splits' for SPROUT) and cost normalization is described (EMA of percentile-based bounds), but no documentation of data filtering, query preprocessing, or transformation steps from raw benchmark data to training input."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A substantive 'Limitations' subsection is present in Section 5 (Conclusion), spanning approximately one full paragraph with multiple specific limitations discussed."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats are discussed: outdated model pools in RouterBench, compressed accuracy ranges making fine-grained τ distinctions difficult, reliance on ground-truth labels unavailable in production, and fixed costs becoming stale after API pricing changes (Section 5)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Specific scope boundaries are stated: 'RouterBench's model pool includes Llama-2, GPT-3.5, GPT-4, and Claude-2, all of which predate current frontier models,' 'SPROUT covers narrower task distributions,' and 'deployment would require periodic sampling with human annotation' (Section 5)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The underlying benchmark data (RouterBench and SPROUT) is publicly released by their respective authors, enabling independent verification of the routing results."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The benchmark data is described: 'RouterBench provides 405K inference outcomes across 11 models' spanning reasoning, mathematics, and coding tasks, and 'SPROUT complements RouterBench with 45K queries across 14 models' with instruction-following queries (Section 3.1)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard public benchmark datasets (RouterBench, SPROUT)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The paper describes train/val/test splits and cost normalization but does not document the full pipeline from benchmark data to training input, including any filtering, transformation, or quality checks applied to the raw benchmark data."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is disclosed. The acknowledgments section thanks open-source communities and dataset authors but does not mention any grants, corporate sponsors, or funding agencies."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All three authors are clearly listed as affiliated with 'Phi Labs, Quantiphi, Boston, USA' with email addresses at quantiphi.com."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed despite all authors being from Quantiphi (a consulting/AI services company), making it impossible to assess funder independence."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present. The authors work at Quantiphi, an AI services company that could commercially benefit from routing technology, but no financial interests are declared."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper evaluates a routing system, not a pre-trained model's capability on benchmarks. The LLM model performances come from pre-existing benchmark datasets (RouterBench, SPROUT), not from the authors querying models."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper tests a routing tool's ability to select among models, not model knowledge or capability on benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The paper evaluates routing policy quality, not the underlying LLM capabilities. Benchmark contamination of the LLM pool is the responsibility of the benchmark authors."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants. The study is a benchmark evaluation of a routing system."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Inference cost and latency are reported extensively: Table 3 provides per-query routing latency (2.6-8.7ms) and throughput (115-385 q/s). Table 1 reports per-tier costs ($0.20-$1.69/1K queries). Cost savings vs baselines are quantified (Table 2)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Training completes in '4 hours per dataset on one A100 GPU' (Section 2.3). Hardware specified as 'a single A100 GPU' (Sections 2.2, 3.1)."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The dynamic adaptation experiment uses '5 random seeds' (Section 4.1) but does not report mean/std across seeds. Main results in Tables 1 and 2 show no evidence of multi-seed evaluation or seed sensitivity analysis."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "For dynamic adaptation: '5 random seeds' is stated. For the main results (Tables 1 and 2), the number of experimental runs is not stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper states η_λ=0.4 was 'determined empirically' and mentions values >1.0 and <0.1 didn't work (Section 2.3), but reports no formal search budget, number of configurations tried, or search method."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper mentions 'dual variable convergence as monitored on validation accuracy' (Section 2.3) but does not describe the selection criterion in detail or show validation performance across configurations."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical significance tests are performed, so there are no multiple comparisons to correct."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The '-Style' suffix on 'OmniRouter-Style' and 'RouteLLM-Style' (Section 3.1) indicates re-implementations rather than original code. No discussion of author-evaluation bias or whether these re-implementations faithfully represent the original methods."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper notes 'all learned routers use similar-sized encoders (22–125M params) with comparable latency' (Table 2 footnote) but does not report performance as a function of compute budget or provide matched-compute comparisons."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper describes what RouterBench and SPROUT contain and notes complementary strengths, but does not discuss whether benchmark correctness labels actually measure real-world quality or whether the evaluation setup has validity gaps."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. PROTEUS is a routing layer, not an agentic scaffold."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether the DeBERTa encoder's pre-training data includes benchmark questions, or whether the temporal ordering of benchmark creation vs model training affects the validity of correctness labels used for routing."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the query encoding or routing features could leak information about model correctness, or whether the benchmark setup provides hints not available in real deployment."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "While train/test splits are used, there is no discussion of whether train and test queries share structural similarities (same topic domains, similar difficulty distributions) that could inflate generalization claims."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are described. No canary strings, membership inference, n-gram overlap analysis, or decontamination steps are mentioned."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "PROTEUS achieves 100% floor compliance where accuracy meets or exceeds τ across all evaluated targets on both benchmarks.",
    365       "evidence": "Table 1 shows 100% floor compliance on both RouterBench and SPROUT. Figure 2(b) shows accuracy consistently exceeding the floor constraint across all τ values.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "The τ-µ correlation reaches 0.973 on RouterBench and 0.981 on SPROUT, demonstrating faithful translation of accuracy targets into routing behavior.",
    370       "evidence": "Table 1 reports these correlation values. Figure 2(a) shows PROTEUS adapting accuracy to match requested τ across both datasets.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "PROTEUS achieves 90.1% accuracy on RouterBench (1.3pp below oracle) and 94.0% on SPROUT (4.6pp below oracle) with significant cost savings.",
    375       "evidence": "Table 2 shows 90.1% (RB) and 94.0% (SP) accuracy versus oracle of 91.4% and 98.6%. Cost is $0.33/1K (RB) vs $3.3/1K for GPT-4, representing 90% savings.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "OmniRouter achieves only 22% floor compliance on RouterBench and 0% on SPROUT despite using Lagrangian optimization.",
    380       "evidence": "Section 4.1 and Figure 2(a) report these numbers. However, the comparison uses 'OmniRouter-Style' (a re-implementation), not the original code.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "A single trained PROTEUS model serves the full accuracy spectrum τ ∈ [0.85, 0.95] without retraining.",
    385       "evidence": "Table 1 shows tiered performance from Economy to Premium tiers, and Figure 2(c) shows dynamic τ adaptation across four scenarios, all from a single model.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "The Lagrangian constraint mechanism (λ) is the critical component, causing 0.27% (RB) and 0.53% (SP) accuracy drops when removed.",
    390       "evidence": "Table 4 ablation study shows these drops alongside 8-25% cost reductions when λ is removed, indicating the unconstrained variant sacrifices accuracy for cost.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "PROTEUS achieves highest routing efficiency with RE of 11.1 pp/ms (RB) and 9.5 pp/ms (SP), and RPI of 88.5 (RB) exceeding oracle RPI of 88.2.",
    395       "evidence": "Table 2 reports these metrics. The RPI exceeding oracle is explained by the cost-quality tradeoff: accepting 1.4% quality reduction saves 15% cost (Section 4.2).",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No code released",
    402       "detail": "The implementation is described in detail but no source code is provided, making independent verification of the results impossible."
    403     },
    404     {
    405       "flag": "Re-implemented baselines",
    406       "detail": "Key baselines are labeled 'OmniRouter-Style' and 'RouteLLM-Style' (Section 3.1), indicating re-implementations rather than original code. The authors' implementations may systematically underperform the originals (Lucic et al., 2018)."
    407     },
    408     {
    409       "flag": "No statistical tests or error bars",
    410       "detail": "All claims of superiority are based on comparing point estimates (Tables 1-2). With no significance tests, error bars, or multi-seed results for main experiments, it is unclear whether observed differences are statistically meaningful."
    411     },
    412     {
    413       "flag": "Novel metrics favor proposed method",
    414       "detail": "RE and RPI are metrics introduced by the authors (Section 3.1). RPI is defined such that PROTEUS exceeds even oracle performance (88.5 vs 88.2 in Table 2), which raises questions about metric design choices."
    415     },
    416     {
    417       "flag": "Missing funding disclosure from corporate authors",
    418       "detail": "All authors are from Quantiphi (an AI consulting company) but no funding source or competing interests are declared. Routing technology could be commercially relevant to the company."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "RouterBench: A Benchmark for Multi-LLM Routing System",
    424       "authors": ["Qitian Jason Hu", "Jacob Bieker", "Xiuyu Li", "Nan Jiang", "Benjamin Keigwin", "Gaurav Ranganath", "Kurt Keutzer", "Shriyash Kaustubh Upadhyay"],
    425       "year": 2024,
    426       "arxiv_id": "2403.12031",
    427       "relevance": "Gold-standard benchmark for multi-LLM routing with 405K inference outcomes across 11 models, used as a primary evaluation dataset."
    428     },
    429     {
    430       "title": "CARROT: A Cost Aware Rate Optimal Router",
    431       "authors": ["Seamus Somerstep", "Felipe Maia Polo", "Allysson Flavio Melo de Oliveira"],
    432       "year": 2025,
    433       "arxiv_id": "2502.03261",
    434       "relevance": "Contemporary learned router baseline using RoBERTa encoder and cost-quality tradeoff scoring, also provides the SPROUT benchmark dataset."
    435     },
    436     {
    437       "title": "RouteLLM: Learning to Route LLMs with Preference Data",
    438       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu", "Wei-Lin Chiang"],
    439       "year": 2024,
    440       "arxiv_id": "2406.18665",
    441       "relevance": "Learned LLM router using preference data and threshold calibration, representative of the indirect parameter-tuning approach PROTEUS aims to replace."
    442     },
    443     {
    444       "title": "OmniRouter: Budget and Performance Controllable Multi-LLM Routing",
    445       "authors": ["Kai Mei", "Wujiang Xu", "Minghao Guo", "Shuhang Lin", "Yongfeng Zhang"],
    446       "year": 2025,
    447       "arxiv_id": "2502.20576",
    448       "relevance": "Closest baseline to PROTEUS, uses batch-level Lagrangian optimization for LLM routing but achieves only 22% floor compliance."
    449     },
    450     {
    451       "title": "Automix: Automatically Mixing Language Models",
    452       "authors": ["Pranjal Aggarwal", "Aman Madaan", "Ankit Anand"],
    453       "year": 2024,
    454       "arxiv_id": "2310.12963",
    455       "relevance": "Cascade-based LLM routing method that routes through models until confidence thresholds are met."
    456     },
    457     {
    458       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    459       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    460       "year": 2023,
    461       "arxiv_id": "2305.05176",
    462       "relevance": "Early work on cost reduction through LLM cascading and selection, foundational to the routing literature."
    463     },
    464     {
    465       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    466       "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang"],
    467       "year": 2024,
    468       "arxiv_id": "2404.14618",
    469       "relevance": "Learned LLM routing approach with quality-aware query routing, published at ICLR 2024."
    470     },
    471     {
    472       "title": "Router-R1: Teaching LLMs Multi-Round Routing and Aggregation via Reinforcement Learning",
    473       "authors": ["Haozhen Zhang", "Tao Feng", "Jiaxuan You"],
    474       "year": 2025,
    475       "arxiv_id": "2506.09033",
    476       "relevance": "RL-based multi-round routing method, contemporary work applying reinforcement learning to LLM routing."
    477     },
    478     {
    479       "title": "LLM-Blender: Ensembling Large Language Models with Pairwise Ranking and Generative Fusion",
    480       "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"],
    481       "year": 2023,
    482       "relevance": "LLM ensembling approach using pairwise ranking, relevant to multi-model selection and routing."
    483     },
    484     {
    485       "title": "Holistic Evaluation of Language Models",
    486       "authors": ["Percy Liang", "Rishi Bommasani", "Tony Lee"],
    487       "year": 2023,
    488       "relevance": "HELM benchmark framework establishing that LLM costs span four orders of magnitude, motivating cost-aware routing."
    489     },
    490     {
    491       "title": "Learning to Route LLMs from Bandit Feedback: One Policy, Many Trade-offs",
    492       "authors": ["Wei Wang", "Tiankai Yang", "Hongjie Chen"],
    493       "year": 2025,
    494       "arxiv_id": "2510.07429",
    495       "relevance": "Bandit feedback approach to LLM routing that reduces annotation cost, cited as future work direction."
    496     },
    497     {
    498       "title": "Efficient Memory Management for Large Language Model Serving with PagedAttention",
    499       "authors": ["Woosuk Kwon", "Zhuohan Li", "Siyuan Zhuang"],
    500       "year": 2023,
    501       "relevance": "vLLM serving system that PROTEUS integrates with for production deployment."
    502     }
    503   ]
    504 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs