scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27526B)
      1 {
      2   "paper": {
      3     "title": "Federate the Router: Learning Language Model Routers with Sparse and Decentralized Evaluations",
      4     "authors": [
      5       "Baris Askin",
      6       "Shivam Patel",
      7       "Anupam Nayak",
      8       "Andrea Vigano",
      9       "Jiin Woo",
     10       "Gauri Joshi",
     11       "Carlee Joe-Wong"
     12     ],
     13     "year": 2026,
     14     "venue": "arXiv preprint",
     15     "arxiv_id": "2601.22318",
     16     "doi": "10.48550/arXiv.2601.22318"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval", "theoretical"],
     21   "key_findings": "This paper introduces the first federated framework for LLM query routing, enabling clients to collaboratively learn routing policies from local offline data without centralizing private queries. Across RouterBench-Data and ProxRouter-Data, federated routers (both MLP and K-Means) consistently improve the accuracy-cost frontier over client-local baselines, with K-Means benefiting more from federation. Theoretical convergence guarantees and suboptimality bounds show federated training reduces routing error through increased effective query and model coverage. Adaptive personalization further improves robustness under extreme client heterogeneity.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No code repository URL or archive is provided anywhere in the paper. No GitHub/Zenodo link found."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper uses publicly available benchmarks: RouterBench-Data (Hu et al., 2024) and ProxRouter-Data (Patel et al., 2025), both cited with references to their public releases."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper specifies some components (all-mpnet-base-v2 encoder, MLP architecture details in Appendix C) and mentions using PSC Bridges-2 GPU, but provides no requirements.txt, Dockerfile, or library version listing."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithmic details are described but there are no runnable reproduction instructions."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results are reported as point estimates (AUC values, accuracy-cost curves). No confidence intervals, error bars, or ± notation appear in any figure or table."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims federated routers 'improve' over local routers based solely on comparing AUC scores (e.g., 0.75 vs 0.63-0.72) without any statistical significance tests."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "AUC values are reported for all methods with baseline context (e.g., Federated 0.75 vs Client-Local ranging 0.63-0.72 in Figure 2), providing magnitude of improvement alongside absolute baselines."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification for why N=10 clients was chosen, or why the Dirichlet concentration parameters (α=0.6, 0.03) were selected. No power analysis."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance, standard deviation, or spread measures across runs are reported. Results appear to be single-run experiments."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Multiple baselines are compared: client-local (no-FL) routers trained independently on each client's data, and centralized training on pooled data (Appendix D.1, Figure 9)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include RouterBench (2024) and ProxRouter (2025) benchmarks, and the router architectures (MLP-Router, K-Means-Router) follow recent work by Hu et al. (2024) and Jitkrittum et al. (2025)."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper ablates multiple components: federated vs local training (Section 6.1-6.2), model expansion (Section 6.3), client expansion (Appendix D.3), and adaptive personalization vs pure federated/local (Section 6.4)."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Results are evaluated using accuracy, average cost, and normalized AUC under the accuracy-cost tradeoff curve. The tradeoff parameter λ is swept to generate full Pareto frontiers."
     92       },
     93       "human_evaluation": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "Human evaluation is irrelevant to evaluating routing algorithms. The paper measures routing quality via accuracy-cost tradeoffs on benchmark data."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Appendix C states: 'Within each client, we split its allocated data into local train/test with fractions 0.75/0.25, and the global train/test sets are defined as the unions of client train/test splits.'"
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down per-client (Figures 10-11 show all 10 clients), per-router type (MLP vs K-Means), per-dataset (RouterBench vs ProxRouter in Appendix F), and per-heterogeneity regime (α=0.6 vs α=0.03)."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 6.4 explicitly discusses when federated routing fails: 'under extreme client data heterogeneity, a federated router can underperform on some clients' local distributions.' Specific clients where this occurs are shown in Figure 5."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 6.4 reports that under extreme heterogeneity (α=0.03), 'federated MLP-Router can indeed underperform local routers for some clients' (e.g., Client 4 and 6 in Figure 5 top row)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims are supported: 'federated collaboration improves the accuracy-cost frontier over client-local routers' (Figures 2-3), 'both via increased effective model coverage and better query generalization' (Sections 6.1-6.2), 'theoretical results validate that federated training reduces routing suboptimality' (Theorems 5.3, 5.5)."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims like 'federated collaboration improves the accuracy-cost frontier' are supported by controlled experiments comparing federated vs local training on the same data partitions, varying only the training procedure. This is analogous to a controlled ablation."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Claims are bounded: the abstract says 'Across two benchmarks' and results are specific to RouterBench-Data and ProxRouter-Data. The title focuses on 'sparse and decentralized evaluations' which matches the tested setting."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper does not discuss alternative explanations for why federated routing improves over local routing beyond the proposed mechanisms (better query coverage, model coverage). No robustness checks against confounds like data quantity effects independent of federation."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper measures accuracy and cost on routing benchmarks and claims routing quality improvements. The measurements directly correspond to the claims — no proxy gap exists between what is measured (accuracy-cost AUC) and what is claimed (better routing)."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "The sentence encoder is specified as 'all-mpnet-base-v2' (Song et al., 2020). The LLM pool models are specified with versions in Figure 8: GPT-4 1106 Preview, GPT-3.5 Turbo 1106, Claude Instant v1, Claude v1, Claude v2, WizardLM 13B V1.2, Llama 2 70B Chat, etc."
    151       },
    152       "prompts_provided": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "The paper does not use prompting. It trains routing algorithms (MLP, K-Means) on pre-existing benchmark evaluation data. No prompts are sent to LLMs as part of the method."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix C reports: learning rate η=10⁻³, weight decay 3×10⁻⁴, batch size 128, dropout p=0.1, gradient clipping max-norm 1.0, hidden layer widths (512, 512), Klocal=15, Kglobal=20, ninit=3 restarts, partial participation rate 0.6."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The approach trains standard MLP and K-Means models for routing."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 6 and Appendix B-C describe: Dirichlet partitioning with α=0.6 over task labels for query heterogeneity, Dirichlet α=0.45 for model assignment, train/test split 0.75/0.25, and encoder embedding procedure."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No dedicated limitations section exists. The conclusion (Section 7) is four sentences long and mentions online routing as future work, but it does not discuss limitations of the current approach."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No threats to validity are discussed. The paper does not address concerns like whether synthetic Dirichlet partitioning reflects real federated heterogeneity, or whether single-run results are reliable."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show (e.g., real-world federated deployment, privacy guarantees beyond data locality, or scaling beyond 10 clients)."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "The underlying benchmark datasets (RouterBench-Data and ProxRouter-Data) are publicly available as cited. The federated partitioning procedure is fully described with parameters, enabling reproduction."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The data comes from published benchmarks: RouterBench-Data (11 LLMs over 8 datasets, Hu et al. 2024) and ProxRouter-Data (14 LLMs over 10 datasets, Patel et al. 2025). The federated simulation procedure is described in detail."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data sources are standard public benchmarks (RouterBench-Data, ProxRouter-Data)."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The pipeline from benchmark data to federated experiments is documented: Dirichlet partitioning (α values specified), model assignment (Dirichlet α=0.45), single-model-per-query assumption, train/test split (0.75/0.25), and encoding via all-mpnet-base-v2."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Acknowledgements section discloses funding: US DOE grant DE-SC0025652, multiple NSF grants (CNS-2409138, CNS-2533813, CCF-2045694, etc.), AI2C Seed grant, and ONR N00014-23-1-2149."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All authors are affiliated with Carnegie Mellon University, clearly stated on the first page. They are not evaluating a product from their own institution."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Funding is from government agencies (NSF, DOE, ONR) and a university seed grant (AI2C), none of which have a financial interest in the routing results."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is included in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "The paper does not evaluate pre-trained model capabilities on benchmarks. It trains routing algorithms (MLP, K-Means) on pre-existing LLM evaluation data. Contamination of the routing model is addressed by train/test splitting."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Not applicable for the same reason as training_cutoff_stated: the paper evaluates routing algorithms, not pre-trained language models. Standard train/test splitting is used for the router."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "Not applicable for the same reason as training_cutoff_stated: contamination concerns about LLM training data seeing benchmark solutions are outside the scope of this routing paper."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. The paper evaluates routing algorithms on benchmark data."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "The paper's core results are accuracy-cost tradeoff curves (Figures 2-5) where the x-axis reports average inference cost per query in dollars. Cost is a primary evaluation dimension."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "The acknowledgements mention using 'PSC Bridges-2 GPU at Pittsburgh Supercomputing Center through allocation CIS250087' but do not quantify GPU hours, training time, or total compute budget."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from a single run of the Dirichlet partitioning and training procedure."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is not stated. Results are presented without indicating how many times the experiments were repeated."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Hyperparameters are stated (Appendix C) but the search budget is not reported. Klocal and Kglobal are said to be 'chosen with validation experiments' but the search space and number of configurations tried are not disclosed."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The paper mentions Klocal=15 and Kglobal=20 are 'chosen with validation experiments' but does not show the validation results or explain the selection criterion."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical tests are performed, so multiple comparison correction is moot. The absence of statistical testing is captured by significance_tests."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "All methods (federated, local, personalized) are implemented by the authors. No discussion of potential author-implementation bias or independent evaluation."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The compute cost of federated training (communication rounds, local training steps) vs local-only training is not compared. Federated training involves an unspecified number of rounds T with τ local steps each, adding significant overhead that is not quantified."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The paper does not discuss whether RouterBench-Data and ProxRouter-Data actually measure realistic routing scenarios. No analysis of whether synthetic query-model evaluation data reflects real-world routing needs."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved. The paper trains and evaluates routing algorithms directly."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of temporal leakage. The train/test split is random, not temporal. Whether earlier benchmark evaluations could inform later ones is not addressed."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether query embeddings or evaluation features could leak information about the test set routing outcomes."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of non-independence between train and test examples. Queries from the same benchmark task may share structural similarities, but this is not addressed."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention method beyond standard random train/test splitting."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Federated routers consistently improve the accuracy-cost frontier over client-local routers on the global test distribution.",
    373       "evidence": "Figure 2 shows federated AUC of 0.75 for both MLP and K-Means vs client-local AUC ranging 0.63-0.72 (MLP) and 0.55-0.70 (K-Means) on RouterBench-Data. Similar results on ProxRouter-Data (Figure 15).",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "Federated learning improves in-distribution local performance via better model coverage.",
    378       "evidence": "Figure 3 shows federated routers outperform client-local routers even on individual clients' local test sets. Figures 10-11 confirm this across all 10 clients with mean AUC improvement of 0.05 (MLP) and 0.11 (K-Means).",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Federated training achieves performance comparable to centralized training.",
    383       "evidence": "Figure 9 shows identical AUC (0.75) for both federated and centralized training for both router types on RouterBench-Data. Figure 19 shows similar parity on ProxRouter-Data.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Adaptive personalization improves robustness under extreme heterogeneity.",
    388       "evidence": "Section 6.4 and Figure 5 show that under α=0.03, personalized routing generally matches or exceeds the better of federated and local routers. However, for K-Means, federation alone already remains competitive (Figures 13-14 in appendix show mean AUC: personalized 0.75 vs federated 0.75 for MLP).",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Both router families support lightweight adaptation to new models without full retraining.",
    393       "evidence": "Figure 4 shows accuracy-cost frontiers improve after adding 3 withheld models via lightweight calibration (MLP AUC: 0.732→0.748, K-Means: 0.732→0.749).",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Federated training reduces routing suboptimality compared to local-only training (theoretical).",
    398       "evidence": "Theorem 5.3 shows suboptimality scales as O(1/√D) for federated vs O(1/√D_i) for local, and Theorem 5.5 provides analogous bounds for K-Means-Router. Full proofs in Appendix G.",
    399       "supported": "strong"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "No error bars or uncertainty quantification",
    405       "detail": "All experimental results are reported as single-point estimates without variance, confidence intervals, or multiple-run statistics. Given that the Dirichlet partitioning is stochastic, different random seeds could produce different client data distributions and thus different results. The stability of the reported improvements is unknown."
    406     },
    407     {
    408       "flag": "Synthetic federated simulation",
    409       "detail": "The federated setting is simulated by Dirichlet-partitioning existing centralized benchmark data across 10 synthetic clients. Real federated data heterogeneity, communication constraints, and privacy requirements may differ substantially from this simulation. The paper does not validate against any real federated deployment."
    410     },
    411     {
    412       "flag": "No limitations section",
    413       "detail": "The paper lacks a dedicated limitations section despite the 34-page length (including appendix). Key limitations like the synthetic federated setup, single-run results, and unknown generalization to real deployments are not discussed."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "RouteLLM: Learning to route LLMs from preference data",
    419       "authors": ["I. Ong", "A. Almahairi", "V. Wu", "W.-L. Chiang", "T. Wu", "J. E. Gonzalez", "M. W. Kadous", "I. Stoica"],
    420       "year": 2025,
    421       "relevance": "Core LLM routing method that trains routers from human preference data; direct comparison point for routing approaches."
    422     },
    423     {
    424       "title": "RouterBench: A benchmark for multi-LLM routing system",
    425       "authors": ["Q. J. Hu", "J. Bieker", "X. Li", "N. Jiang", "B. Keigwin", "G. Ranganath", "K. Keutzer", "S. K. Upadhyay"],
    426       "year": 2024,
    427       "relevance": "Primary benchmark used in this paper; establishes the evaluation framework for LLM routing systems."
    428     },
    429     {
    430       "title": "Hybrid LLM: Cost-efficient and quality-aware query routing",
    431       "authors": ["D. Ding", "A. Mallick", "C. Wang", "R. Sim", "S. Mukherjee", "V. Rühle", "L. V. S. Lakshmanan", "A. H. Awadallah"],
    432       "year": 2024,
    433       "relevance": "Parametric LLM routing approach for cost-quality tradeoffs; foundational work in the routing space."
    434     },
    435     {
    436       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    437       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    438       "year": 2024,
    439       "relevance": "Pioneering work on reducing LLM inference costs through model selection and cascading strategies."
    440     },
    441     {
    442       "title": "GraphRouter: A graph-based router for LLM selections",
    443       "authors": ["T. Feng", "Y. Shen", "J. You"],
    444       "year": 2025,
    445       "relevance": "Graph-based approach to LLM routing that models relationships between models and queries."
    446     },
    447     {
    448       "title": "EmbedLLM: Learning compact representations of large language models",
    449       "authors": ["R. Zhuang", "T. Wu", "Z. Wen", "A. Li", "J. Jiao", "K. Ramchandran"],
    450       "year": 2025,
    451       "relevance": "Learns LLM embeddings for model comparison and selection; relevant to router design."
    452     },
    453     {
    454       "title": "Smoothie: Label free language model routing",
    455       "authors": ["N. Guha", "M. F. Chen", "T. Chow", "I. S. Khare", "C. Re"],
    456       "year": 2024,
    457       "relevance": "Label-free LLM routing approach using weak supervision; alternative paradigm for router training."
    458     },
    459     {
    460       "title": "Eagle: Efficient training-free router for multi-LLM inference",
    461       "authors": ["Z. Zhao", "S. Jin", "Z. M. Mao"],
    462       "year": 2024,
    463       "arxiv_id": "2409.15518",
    464       "relevance": "Training-free multi-LLM routing using Elo-based model comparison scores."
    465     },
    466     {
    467       "title": "ProxRouter: Proximity-weighted LLM query routing for improved robustness to outliers",
    468       "authors": ["S. Patel", "N. Jali", "A. Mallick", "G. Joshi"],
    469       "year": 2025,
    470       "arxiv_id": "2510.09852",
    471       "relevance": "Nonparametric proximity-weighted routing approach; second benchmark dataset used in this paper."
    472     },
    473     {
    474       "title": "BEST-route: Adaptive LLM routing with test-time optimal compute",
    475       "authors": ["D. Ding", "A. Mallick", "S. Zhang", "C. Wang", "D. Madrigal"],
    476       "year": 2025,
    477       "relevance": "Adaptive LLM routing that optimizes compute at test time; advances the parametric routing paradigm."
    478     },
    479     {
    480       "title": "Universal LLM routing with correctness-based representation",
    481       "authors": ["W. Jitkrittum", "H. Narasimhan", "A. S. Rawat", "J. Juneja", "Z. Wang"],
    482       "year": 2025,
    483       "relevance": "Nonparametric routing using correctness-based representations; key comparison point for K-Means routing."
    484     },
    485     {
    486       "title": "Communication-efficient learning of deep networks from decentralized data",
    487       "authors": ["B. McMahan", "E. Moore", "D. Ramage", "S. Hampson", "B. A. y Arcas"],
    488       "year": 2017,
    489       "relevance": "Foundational FedAvg algorithm used as the basis for the federated MLP-Router training in this paper."
    490     }
    491   ]
    492 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs