scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27759B)
      1 {
      2   "paper": {
      3     "title": "Towards Fair and Comprehensive Evaluation of Routers in Collaborative LLM Systems",
      4     "authors": [
      5       "Wanxing Wu",
      6       "He Zhu",
      7       "Yixia Li",
      8       "Lei Yang",
      9       "Jiehui Zhao",
     10       "Hongru Wang",
     11       "Jian Yang",
     12       "Benyou Wang",
     13       "Bingyi Jing",
     14       "Guanhua Chen"
     15     ],
     16     "year": 2026,
     17     "venue": "arXiv",
     18     "arxiv_id": "2602.11877"
     19   },
     20   "scan_version": 2,
     21   "active_modules": ["experimental_rigor", "data_leakage"],
     22   "methodology_tags": ["benchmark-eval"],
     23   "key_findings": "The paper introduces RouterXBench, a three-dimensional evaluation framework (router ability, scenario alignment, cross-domain robustness) that disentangles intrinsic routing quality from model performance. ProbeDirichlet, a lightweight linear router using internal hidden states with Dirichlet-distributed layer weighting, achieves 16.68% and 18.86% relative improvements over best baselines on AUROC and HCR respectively. Analysis shows that signal provenance (internal hidden states vs. output logits or embeddings) matters more than aggregation architecture, and that training data diversity yields additive cross-domain gains without interference.",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "GitHub repository URL provided: 'Our code is publicly available at https://github.com/zhuchichi56/RouterXBench' (Section 1, footnote 1)."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "All six benchmarks used (Alpaca, MMLU, Big-Math, Magpie, MMLU Pro, MATH) are publicly available datasets. Table 7 lists them with references."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No requirements.txt, Dockerfile, conda environment, or library version listing found in the paper. Only model hidden state dimension (4096) and seed are mentioned."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No step-by-step reproduction instructions in the paper. A GitHub repo is referenced but no README content or reproduction guide is described."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Tables 1, 2, and 3 report only point estimates. No confidence intervals, error bars, or ± notation anywhere in the results."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "Claims of improvement ('16.68% relative improvement') are based solely on comparing point estimates. No statistical tests (p-values, t-tests, bootstrap) are reported."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Relative improvements are reported with baseline context: '16.68% and 18.86% relative improvements over the best baselines' (Abstract, Section 5.2). Tables show absolute values for all methods, enabling readers to compute differences."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No justification for why 12K training examples (4K per domain), 1K test examples for some benchmarks, or the specific benchmark sizes were chosen. No power analysis."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "Appendix A explicitly states: 'we report single-run results for all experiments.' No variance, standard deviation, or spread measure reported."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Six baselines compared across three signal modalities: verbose-based (SelfAsk, SemanticEntropy), logit-based (ConfidenceMargin, Entropy, MaxLogits), and embedding-based (EmbeddingMLP). Section 5.1."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include recent work: RouteLLM (Ong et al., 2025), SemanticEntropy (Kuhn et al., 2023), FusionFactory (Feng et al., 2025). Methods span 2022-2025 and represent state of the art across signal modalities."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Multiple ablation studies: Table 3 compares probe aggregation strategies (Final Layer vs Mean Pool vs Dirichlet), Table 4 compares input representations, Table 6 examines training data composition effects, Figure 3 studies probe architecture complexity."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Four metrics: AUROC for router ability, LPM for cost-sensitive scenarios, MPM for balanced deployment, and HCR for accuracy-critical scenarios (Section 3.3)."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "All evaluation is automated. Open-ended task quality is judged by GPT-5 (LLM-as-a-Judge, Appendix B.1). No human evaluation of routing decisions or system outputs."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Table 7 shows explicit train/val and test splits. Additionally, three benchmarks (Magpie, MMLU Pro, MATH) serve as out-of-domain test sets with no training data."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Tables 1 and 2 provide per-benchmark breakdowns for all methods across in-domain (Alpaca, BigMath, MMLU) and out-of-domain (Magpie, MATH, MMLU Pro subcategories) tasks."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Appendix D.2 'When Routing is Not Enough' shows a concrete failure case where both models converge on the same wrong answer, making routing ineffective. Includes specific example and analysis."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Figure 3 shows MLP probes provide no benefit over linear probes while increasing overfitting. Figure 4 shows diminishing returns with more training data. Appendix D.2 discusses fundamental limits of routing."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Abstract claims of '16.68% and 18.86% relative improvements' are supported by Tables 1 and 2. Claims of consistency across model families supported by Table 5. Agent scenario claim supported by Figure 5."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Causal claims ('data diversity yields additive gains,' 'Dirichlet aggregation prevents overfitting') are supported by controlled ablation studies. Table 6 varies training data composition while holding architecture fixed. Table 3 varies aggregation while holding data fixed. Single-variable manipulation is adequate."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title claims 'Collaborative LLM Systems' but experiments test only two-model edge-cloud routing with one primary model pair (GPT-5 + Llama-3.1-8B). The Limitations section acknowledges 'single small-large model pair' but the title and framing remain broader than the evidence."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Section 6 discusses why hidden states outperform embeddings (hierarchical vs surface-level information), why linear probes suffice (hidden states encode shared difficulty notion), and alternative data composition effects (interference vs additive gains, Table 6)."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The paper explicitly separates router ability (AUROC, intrinsic discrimination) from end-to-end system accuracy, noting that 'end-to-end accuracy at a given cost reflects both' router skill and model strength (Section 3.2). The framework is designed to disentangle proxy from outcome."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "'GPT-5' is stated without a version, snapshot date, or API version. 'Llama-3.1-8B-Instruct' and 'Qwen2.5-{0.5B,3B,7B}-Instruct' are more specific but GPT-5 lacks versioning. Per schema rules, marketing names without snapshot dates do not count."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Figure 6 provides the full LLM-as-a-Judge prompt template used for open-ended task evaluation. Exact reasoning tasks use xVerify (referenced with URL). Standard benchmark prompts are used for model evaluation."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "Only partially reported. Probe training hyperparameters are given: learning rate 1e-4, 50 epochs, seed=42, 12K training examples (Section 5.1). However, LLM generation hyperparameters (temperature, top-p, max tokens) for GPT-5 and Llama inference are not stated."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "ProbeDirichlet is a linear probe on hidden states, not an agentic scaffold. The HotpotQA agent experiment (Figure 5) is an evaluation scenario, not the paper's method."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Appendix B describes benchmark preparation, Table 7 lists train/val/test splits with sizes, and Appendix B.1 documents ground truth label construction for both exact reasoning (xVerify) and open-ended (LLM-as-Judge) tasks."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "A dedicated 'Limitations' section follows the Conclusion, discussing the assumption that the large model is always better, single model pair constraint, and single-run results."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Specific threats identified: 'both models may perform similarly or converge on the same incorrect answer in certain domains' (with analysis in Appendix D.2), 'single small-large model pair,' and 'single-run results due to computational constraints.' These are specific to this study."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "States what was not tested: 'broader validation across diverse architectures, multiple seeds, and more complex OOD conditions would further strengthen the conclusions.' Also notes the assumption that the large model's capability exceeds the small model's."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "While benchmarks are public and code is released, the raw experimental outputs (model responses, hidden state extractions, routing decisions) are not provided for independent verification."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Appendix B describes each benchmark's domain and purpose. Appendix B.1 details how ground truth labels are constructed using xVerify for exact tasks and LLM-as-a-Judge for open-ended tasks."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. All data comes from standard public benchmarks."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "The pipeline from benchmark data to binary labels is documented: queries go through models, responses are evaluated by xVerify or LLM-as-Judge (Eq. 16-17), and labels are constructed. Training splits specified in Table 7."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding or acknowledgments section found in the paper. Authors include one from Deepexi Technology Co. Ltd. (industry) and several from universities."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Southern University of Science and Technology, Institut Polytechnique de Paris, Peking University, Deepexi Technology Co. Ltd., University of Edinburgh, Beihang University, Chinese University of Hong Kong (Shenzhen)."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No funding disclosed, so independence cannot be assessed. One co-author is from Deepexi Technology, an industry entity whose interest in the outcome is unknown."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial disclosure statement found in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No training data cutoff dates stated for GPT-5 or Llama-3.1-8B-Instruct. These models are used to generate responses on benchmarks, and their training cutoffs are relevant to contamination risk."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether benchmark examples (MMLU from 2021, MATH from 2021, Alpaca from 2023) appeared in GPT-5 or Llama training data. This could affect ground truth label quality."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "MMLU (2021) and MATH (2021) are well-known benchmarks likely in GPT-5 and Llama-3.1 training data. No contamination analysis is performed despite this risk affecting the validity of model performance comparisons."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference cost or latency reported for ProbeDirichlet or any baseline method. For a paper about cost-efficient routing, the actual cost of running the router is notably absent."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No GPU hours, training time, or total compute budget stated. The Limitations section mentions 'computational constraints' but does not quantify them."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Appendix A: 'All experiments are conducted with a fixed random seed (seed=42)... we report single-run results for all experiments.' No seed sensitivity analysis."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Appendix A explicitly states: 'we report single-run results for all experiments.' The number of runs (1) is clearly stated."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No hyperparameter search budget reported. Learning rate (1e-4), epochs (50), and training size (12K) appear fixed without explaining how these were selected or how many configurations were tried."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The final configuration is presented without explaining how it was selected. No validation-based selection procedure described for the main hyperparameters."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": false,
    325         "answer": false,
    326         "justification": "No statistical significance tests are performed in the paper, so multiple comparison correction is structurally inapplicable."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Authors compare their ProbeDirichlet against their own implementations of baselines without acknowledging author-evaluation bias. No independent evaluation or discussion of this bias."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "ProbeDirichlet is claimed to be 'lightweight' but no compute comparison against baselines is provided. SemanticEntropy requires multiple generations while ProbeDirichlet uses hidden states, but the compute difference is not quantified."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": true,
    341         "justification": "Section 3.2 extensively analyzes the construct validity of existing router metrics, demonstrating that AUC conflates router ability with model strength. The entire framework (RouterXBench) is designed to address validity gaps in router evaluation."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding comparisons between models. The router is a probe on hidden states. The HotpotQA agent experiment compares routing methods under the same setup."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of temporal leakage. Benchmarks like MMLU (2021) and MATH (2021) predate GPT-5 and Llama-3.1 training, meaning solutions could be in training data, affecting ground truth labels."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No discussion of whether the evaluation setup leaks information. For example, xVerify receives the query, model response, and gold answer — whether this introduces any artifacts is not discussed."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "No discussion of independence between training and test splits within benchmarks, or structural similarities between in-domain and out-of-domain test sets."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention method applied. No canary strings, membership inference, decontamination, or temporal split analysis."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "ProbeDirichlet achieves 16.68% relative improvement over the best baseline in router ability (AUROC).",
    375       "evidence": "Table 1 shows ProbeDirichlet AUROC averages of 68.70 (in-domain) and 65.46 (out-of-domain) vs. best baseline EmbeddingMLP at 59.46 and 55.22. Section 5.2.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "ProbeDirichlet achieves 18.86% relative improvement in high-accuracy scenarios (HCR).",
    380       "evidence": "Table 2 HCR section shows ProbeDirichlet at 18.50 (ID avg) and 15.40 (OOD avg) vs. best baseline SemanticEntropy at 15.17 and 13.35. Section 5.2.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Internal hidden states significantly outperform embedding-based and output-based methods for routing.",
    385       "evidence": "Table 4 compares identical linear models with different inputs: LLM hidden states (71.34/74.31) vs LLM embeddings (62.47/66.22) vs Longformer (61.95/66.19) on Alpaca/Magpie. Section 6.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Data diversity yields additive gains without interference across domains.",
    390       "evidence": "Table 6 shows adding domains preserves existing performance (Alpaca: 71.85→71.96→72.02) while independently boosting new domains (BigMath: 49.19→66.49→66.18). Section 6.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Lightweight linear probes suffice; increased model complexity provides no benefit.",
    395       "evidence": "Figure 3 shows MLP probes with various hidden dimensions achieve comparable or worse AUROC than linear baseline while exhibiting larger train-validation gaps. Section 6.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "ProbeDirichlet generalizes consistently across model families (Llama, Qwen) and scales.",
    400       "evidence": "Table 5 shows ProbeDirichlet outperforms EmbeddingMLP across Llama-3.1-8B and Qwen2.5-{0.5B, 3B, 7B} with average improvements of 10.5% (ID) and 9.6% (OOD). Section 6.",
    401       "supported": "moderate"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "Single-run results with no uncertainty quantification",
    407       "detail": "All results are single-run with seed=42. For a paper proposing a benchmark evaluation framework, the lack of variance estimates across seeds undermines the reliability of claimed improvements. The 16.68% improvement could be within noise."
    408     },
    409     {
    410       "flag": "GPT-5 serves dual role as large model and LLM-as-a-Judge evaluator",
    411       "detail": "For open-ended tasks, GPT-5 scores are used to construct ground truth labels comparing small model vs. SOTA performance. But GPT-5 is itself the large model, so the judge is scoring its own outputs against the small model's, creating potential circularity in label construction. Footnote 3 acknowledges this issue but does not address it."
    412     },
    413     {
    414       "flag": "No benchmark contamination analysis",
    415       "detail": "MMLU (2021), MATH (2021), and Alpaca (2023) are well-known benchmarks likely present in GPT-5 and Llama training data. If both models have memorized answers, routing ground truth labels become unreliable. This is not discussed."
    416     },
    417     {
    418       "flag": "Cost-efficiency paper without cost reporting",
    419       "detail": "The paper motivates routing as a cost reduction strategy ('reduces computational cost') but never reports actual inference costs, latency, or compute budgets for any method. SemanticEntropy requires multiple generations while ProbeDirichlet uses single-pass hidden states, but this advantage is never quantified."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    425       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    426       "year": 2025,
    427       "relevance": "Proposes preference-based LLM routing with curve-based evaluation metrics; directly compared as context for RouterXBench framework."
    428     },
    429     {
    430       "title": "AutoMix: Automatically Mixing Language Models",
    431       "authors": ["Pranjal Aggarwal", "Aman Madaan", "Ankit Anand"],
    432       "year": 2024,
    433       "relevance": "Training-free LLM mixing approach using incremental benefit per cost; represents cost-aware routing baseline."
    434     },
    435     {
    436       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    437       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    438       "year": 2024,
    439       "relevance": "Seminal work on cost-efficient LLM usage through routing and cascading strategies."
    440     },
    441     {
    442       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    443       "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang"],
    444       "year": 2024,
    445       "arxiv_id": "2404.14618",
    446       "relevance": "Edge-cloud LLM routing with quality-aware routing; represents the hybrid deployment paradigm evaluated in this work."
    447     },
    448     {
    449       "title": "RouterEval: A Comprehensive Benchmark for Routing LLMs",
    450       "authors": ["Zhongzhan Huang", "Guoming Ling", "Yupei Lin"],
    451       "year": 2025,
    452       "relevance": "Concurrent routing benchmark exploring model-level scaling effects; complementary evaluation framework."
    453     },
    454     {
    455       "title": "EAGLE: Efficient Training-Free Router for Multi-LLM Inference",
    456       "authors": ["Zesen Zhao", "Shuowei Jin", "Z. Morley Mao"],
    457       "year": 2024,
    458       "arxiv_id": "2409.15518",
    459       "relevance": "Training-free multi-LLM routing using relative skill estimation."
    460     },
    461     {
    462       "title": "Semantic Uncertainty: Linguistic Invariances for Uncertainty Estimation in Natural Language Generation",
    463       "authors": ["Lorenz Kuhn", "Yarin Gal", "Sebastian Farquhar"],
    464       "year": 2023,
    465       "relevance": "Uncertainty estimation method used as a baseline routing signal; represents verbose-based routing approach."
    466     },
    467     {
    468       "title": "BEST-Route: Adaptive LLM Routing with Test-Time Optimal Compute",
    469       "authors": ["Dujian Ding", "Ankur Mallick", "Shaokun Zhang"],
    470       "year": 2025,
    471       "relevance": "Integrates routing with explicit test-time budget control for compute-aware LLM deployment."
    472     },
    473     {
    474       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations",
    475       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    476       "year": 2024,
    477       "relevance": "Multi-agent collaboration framework relevant to agentic workflow evaluation of routers."
    478     },
    479     {
    480       "title": "RouterDC: Query-Based Router by Dual Contrastive Learning for Assembling Large Language Models",
    481       "authors": ["Shuhao Chen", "Weisen Jiang", "Baijiong Lin"],
    482       "year": 2024,
    483       "relevance": "Learning-based routing using contrastive query-model embedding alignment."
    484     },
    485     {
    486       "title": "How Robust Are Router-LLMs? Analysis of the Fragility of LLM Routing Capabilities",
    487       "authors": ["Aly M. Kassem", "Bernhard Schölkopf", "Zhijing Jin"],
    488       "year": 2025,
    489       "relevance": "Analyzes robustness of LLM routing systems; directly relevant to OOD robustness evaluation."
    490     },
    491     {
    492       "title": "Smoothie: Label Free Language Model Routing",
    493       "authors": ["Neel Guha", "Mayee F Chen", "Trevor Chow"],
    494       "year": 2024,
    495       "relevance": "Training-free routing leveraging weak agreement signals; represents unsupervised routing baseline."
    496     }
    497   ]
    498 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs