scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (30949B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Federate the Router: Learning Language Model Routers with Sparse and Decentralized Evaluations",
      6     "authors": [
      7       "Baris Askin",
      8       "Shivam Patel",
      9       "Anupam Nayak",
     10       "Andrea Vigano",
     11       "Jiin Woo",
     12       "Gauri Joshi",
     13       "Carlee Joe-Wong"
     14     ],
     15     "year": 2026,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2601.22318",
     18     "doi": "10.48550/arXiv.2601.22318"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract claims are supported: 'federated collaboration improves the accuracy-cost frontier over client-local routers' (Figures 2-3), 'both via increased effective model coverage and better query generalization' (Sections 6.1-6.2), 'theoretical results validate that federated training reduces routing suboptimality' (Theorems 5.3, 5.5).",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claims like 'federated collaboration improves the accuracy-cost frontier' are supported by controlled experiments comparing federated vs local training on the same data partitions, varying only the training procedure. This is analogous to a controlled ablation.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Claims are bounded: the abstract says 'Across two benchmarks' and results are specific to RouterBench-Data and ProxRouter-Data. The title focuses on 'sparse and decentralized evaluations' which matches the tested setting.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not discuss alternative explanations for why federated routing improves over local routing beyond the proposed mechanisms (better query coverage, model coverage). No robustness checks against confounds like data quantity effects independent of federation.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper measures accuracy and cost on routing benchmarks and claims routing quality improvements. The measurements directly correspond to the claims — no proxy gap exists between what is measured (accuracy-cost AUC) and what is claimed (better routing).",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No dedicated limitations section exists. The conclusion (Section 7) is four sentences mentioning online routing as future work but does not discuss limitations of the current approach.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No threats to validity are discussed. The paper does not address concerns like whether synthetic Dirichlet partitioning reflects real federated heterogeneity, or whether single-run results are reliable.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show (e.g., real-world federated deployment, privacy guarantees beyond data locality, or scaling beyond 10 clients).",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Acknowledgements section discloses funding: US DOE grant DESC0025652, multiple NSF grants (CNS-2409138, CNS-2533813, CCF 2045694, etc.), AI2C Seed grant, and ONR N00014-23-1-2149.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All authors are affiliated with Carnegie Mellon University, clearly stated on the first page. They are not evaluating a product from their own institution.",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Funding is from government agencies (NSF, DOE, ONR) and a university seed grant (AI2C), none of which have a financial interest in the routing results.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is included in the paper.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms are formally defined: routing policy (π: R^demb → M, Section 3), utility function (Eq. 1), suboptimality (Definition 5.2), and the federated data model (Eq. 2) are all precisely specified.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Four numbered contributions are explicitly enumerated at the end of Section 1: federated problem formulation, training procedures for both router families, theoretical convergence guarantees, and empirical evaluation.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 and Appendix A comprehensively cover federated learning for LLMs and LLM query routing, explicitly positioning the contribution relative to parametric/nonparametric routers and showing how the federated setting differs from prior centralized approaches.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "No code repository URL or archive is provided anywhere in the paper. No GitHub/Zenodo link found.",
    127           "source": "opus"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The paper uses publicly available benchmarks: RouterBench-Data (Hu et al., 2024) and ProxRouter-Data (Patel et al., 2025), both cited with references to their public releases.",
    133           "source": "opus"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper specifies some components (all-mpnet-base-v2 encoder, MLP architecture details in Appendix C) and mentions using PSC Bridges-2 GPU, but provides no requirements.txt, Dockerfile, or library version listing.",
    139           "source": "opus"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The algorithmic details are described but there are no runnable reproduction instructions.",
    145           "source": "opus"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "All results are reported as point estimates (AUC values, accuracy-cost curves). No confidence intervals, error bars, or ± notation appear in any figure or table.",
    153           "source": "opus"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "The paper claims federated routers 'improve' over local routers based solely on comparing AUC scores (e.g., 0.75 vs 0.63-0.72) without any statistical significance tests.",
    159           "source": "opus"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "AUC values are reported for all methods with baseline context (e.g., Federated 0.75 vs Client-Local ranging 0.63-0.72 in Figure 2), providing magnitude of improvement alongside absolute baselines.",
    165           "source": "opus"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No justification for why N=10 clients was chosen, or why the Dirichlet concentration parameters (α=0.6, 0.03) were selected. No power analysis.",
    171           "source": "opus"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "No variance, standard deviation, or spread measures across runs are reported. Results appear to be single-run experiments.",
    177           "source": "opus"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Multiple baselines are compared: client-local (no-FL) routers trained independently on each client's data, and centralized training on pooled data (Appendix D.1, Figure 9).",
    185           "source": "opus"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Baselines include RouterBench (2024) and ProxRouter (2025) benchmarks, and the router architectures (MLP-Router, K-Means-Router) follow recent work by Hu et al. (2024) and Jitkrittum et al. (2025).",
    191           "source": "opus"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "The paper ablates multiple components: federated vs local training (Section 6.1-6.2), model expansion (Section 6.3), client expansion (Appendix D.3), and adaptive personalization vs pure federated/local (Section 6.4).",
    197           "source": "opus"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Results are evaluated using accuracy, average cost, and normalized AUC under the accuracy-cost tradeoff curve. The tradeoff parameter λ is swept to generate full Pareto frontiers.",
    203           "source": "opus"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "Human evaluation is irrelevant to evaluating routing algorithms. The paper measures routing quality via accuracy-cost tradeoffs on benchmark data.",
    209           "source": "opus"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Appendix C states: 'Within each client, we split its allocated data into local train/test with fractions 0.75/0.25, and the global train/test sets are defined as the unions of client train/test splits.'",
    215           "source": "opus"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down per-client (Figures 10-11 show all 10 clients), per-router type (MLP vs K-Means), per-dataset (RouterBench vs ProxRouter in Appendix F), and per-heterogeneity regime (α=0.6 vs α=0.03).",
    221           "source": "opus"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Section 6.4 explicitly discusses when federated routing fails: 'under extreme client data heterogeneity, a federated router can underperform on some clients' local distributions.' Specific clients where this occurs are shown in Figure 5.",
    227           "source": "opus"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Section 6.4 reports that under extreme heterogeneity (α=0.03), 'federated MLP-Router can indeed underperform local routers for some clients' (e.g., Client 4 and 6 in Figure 5 top row).",
    233           "source": "opus"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "The sentence encoder is specified as 'all-mpnet-base-v2' (Song et al., 2020). The LLM pool models are specified with versions in Figure 8: GPT-4 1106 Preview, GPT-3.5 Turbo 1106, Claude Instant v1, Claude v1, Claude v2, WizardLM 13B V1.2, Llama 2 70B Chat, etc.",
    241           "source": "opus"
    242         },
    243         "prompts_provided": {
    244           "applies": false,
    245           "answer": false,
    246           "justification": "The paper does not use prompting. It trains routing algorithms (MLP, K-Means) on pre-existing benchmark evaluation data. No prompts are sent to LLMs as part of the method.",
    247           "source": "opus"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Appendix C reports: learning rate η=10⁻³, weight decay 3×10⁻⁴, batch size 128, dropout p=0.1, gradient clipping max-norm 1.0, hidden layer widths (512, 512), Klocal=15, Kglobal=20, ninit=3 restarts, partial participation rate 0.6.",
    253           "source": "opus"
    254         },
    255         "scaffolding_described": {
    256           "applies": false,
    257           "answer": false,
    258           "justification": "No agentic scaffolding is used. The approach trains standard MLP and K-Means models for routing.",
    259           "source": "opus"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Section 6 and Appendix B-C describe: Dirichlet partitioning with α=0.6 over task labels for query heterogeneity, Dirichlet α=0.45 for model assignment, train/test split 0.75/0.25, and encoder embedding procedure.",
    265           "source": "opus"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "The underlying benchmark datasets (RouterBench-Data and ProxRouter-Data) are publicly available as cited. The federated partitioning procedure is fully described with parameters, enabling reproduction.",
    273           "source": "opus"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "The data comes from published benchmarks: RouterBench-Data (11 LLMs over 8 datasets, Hu et al. 2024) and ProxRouter-Data (14 LLMs over 10 datasets, Patel et al. 2025). The federated simulation procedure is described in detail.",
    279           "source": "opus"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants. Data sources are standard public benchmarks (RouterBench-Data, ProxRouter-Data).",
    285           "source": "opus"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The pipeline from benchmark data to federated experiments is documented: Dirichlet partitioning (α values specified), model assignment (Dirichlet α=0.45), single-model-per-query assumption, train/test split (0.75/0.25), and encoding via all-mpnet-base-v2.",
    291           "source": "opus"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": false,
    297           "answer": false,
    298           "justification": "The paper does not evaluate pre-trained model capabilities on benchmarks. It trains routing algorithms (MLP, K-Means) on pre-existing LLM evaluation data. Contamination of the routing model is addressed by train/test splitting.",
    299           "source": "opus"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": false,
    303           "answer": false,
    304           "justification": "The paper evaluates routing algorithms, not pre-trained language models, so train/test overlap in LLM pre-training data is not applicable. Standard train/test splitting is used for the router.",
    305           "source": "opus"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": false,
    309           "answer": false,
    310           "justification": "The paper evaluates routing algorithms, not pre-trained language models; contamination concerns about LLM training data seeing benchmark solutions are outside the scope of this routing paper.",
    311           "source": "opus"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants. The paper evaluates routing algorithms on benchmark data.",
    319           "source": "opus"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants.",
    325           "source": "opus"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "opus"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "opus"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "opus"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "opus"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants.",
    355           "source": "opus"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": true,
    362           "justification": "The paper's core results are accuracy-cost tradeoff curves (Figures 2-5) where the x-axis reports average inference cost per query in dollars. Cost is a primary evaluation dimension.",
    363           "source": "opus"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "The acknowledgements mention using 'PSC Bridges-2 GPU at Pittsburgh Supercomputing Center through allocation CIS250087' but do not quantify GPU hours, training time, or total compute budget.",
    369           "source": "opus"
    370         }
    371       },
    372       "experimental_rigor": {
    373         "seed_sensitivity_reported": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from a single run of the Dirichlet partitioning and training procedure.",
    377           "source": "opus"
    378         },
    379         "number_of_runs_stated": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "The number of experimental runs is not stated. Results are presented without indicating how many times the experiments were repeated.",
    383           "source": "opus"
    384         },
    385         "hyperparameter_search_budget": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "Hyperparameters are stated (Appendix C) but the search budget is not reported. Klocal and Kglobal are said to be 'chosen with validation experiments' but the search space and number of configurations tried are not disclosed.",
    389           "source": "opus"
    390         },
    391         "best_config_selection_justified": {
    392           "applies": true,
    393           "answer": false,
    394           "justification": "The paper mentions Klocal=15 and Kglobal=20 are 'chosen with validation experiments' but does not show the validation results or explain the selection criterion.",
    395           "source": "opus"
    396         },
    397         "multiple_comparison_correction": {
    398           "applies": false,
    399           "answer": false,
    400           "justification": "No statistical tests are performed, so multiple comparison correction is moot. The absence of statistical testing is captured by significance_tests.",
    401           "source": "opus"
    402         },
    403         "self_comparison_bias_addressed": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "All methods (federated, local, personalized) are implemented by the authors. No discussion of potential author-implementation bias or independent evaluation.",
    407           "source": "opus"
    408         },
    409         "compute_budget_vs_performance": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "The compute cost of federated training (communication rounds, local training steps) vs local-only training is not compared. Federated training involves an unspecified number of communication rounds (T) with τ local steps each, adding significant overhead that is not quantified.",
    413           "source": "opus"
    414         },
    415         "benchmark_construct_validity": {
    416           "applies": true,
    417           "answer": false,
    418           "justification": "The paper does not discuss whether RouterBench-Data and ProxRouter-Data actually measure realistic routing scenarios. No analysis of whether synthetic query-model evaluation data reflects real-world routing needs.",
    419           "source": "opus"
    420         },
    421         "scaffold_confound_addressed": {
    422           "applies": false,
    423           "answer": false,
    424           "justification": "No scaffolding is involved. The paper trains and evaluates routing algorithms directly.",
    425           "source": "opus"
    426         }
    427       },
    428       "data_leakage": {
    429         "temporal_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "No discussion of temporal leakage. The train/test split is random, not temporal. Whether earlier benchmark evaluations could inform later ones is not addressed.",
    433           "source": "opus"
    434         },
    435         "feature_leakage_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of whether query embeddings or evaluation features could leak information about the test set routing outcomes.",
    439           "source": "opus"
    440         },
    441         "non_independence_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No discussion of non-independence between train and test examples. Queries from the same benchmark task may share structural similarities, but this is not addressed.",
    445           "source": "opus"
    446         },
    447         "leakage_detection_method": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "No concrete leakage detection or prevention method beyond standard random train/test splitting.",
    451           "source": "opus"
    452         }
    453       }
    454     }
    455   },
    456   "claims": [
    457     {
    458       "claim": "Federated LLM routing improves the accuracy-cost frontier over client-local routers on the global test distribution",
    459       "evidence": "Figure 2 shows federated AUC of 0.75 for both MLP and K-Means routers vs. client-local AUCs of 0.63–0.72 (MLP) and 0.55–0.70 (K-Means) on RouterBench-Data",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "Federated learning improves in-distribution local performance for individual clients via better model coverage",
    464       "evidence": "Appendix D.2 (Figures 10–11) shows mean AUC increase from 0.69 to 0.74 (MLP) and 0.64 to 0.75 (K-Means) across all 10 clients on their own local test sets",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "Federated routers match centralized training performance despite preserving data locality",
    469       "evidence": "Appendix D.1 (Figure 9) shows federated and centralized routers achieving identical AUC of 0.75 for both MLP and K-Means on RouterBench-Data global test",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "Federated ERM provably reduces routing suboptimality compared to local-only training",
    474       "evidence": "Theorem 5.3 shows suboptimality bound scales as O(1/√D) for federated vs. O(1/√Di) for local training, with Di < D by construction",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "Adaptive personalization recovers performance when federated routers underperform under extreme heterogeneity",
    479       "evidence": "Figure 5 shows personalized router matching or exceeding the better of federated/local for most clients under α=0.03; for MLP, mean AUC is 0.75 (personalized) versus 0.72 (local) and 0.75 (federated)",
    480       "supported": "moderate"
    481     },
    482     {
    483       "claim": "Both router types support lightweight model pool expansion without full retraining",
    484       "evidence": "Figure 4 shows global AUC improving from 0.732 to 0.748 (MLP) and 0.732 to 0.749 (K-Means) after adding 3 withheld models via 10% calibration subset",
    485       "supported": "moderate"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval",
    490     "theoretical"
    491   ],
    492   "key_findings": "This paper introduces the first federated learning framework for LLM query routing, enabling privacy-preserving collaborative router training across clients with heterogeneous query distributions and sparse per-model evaluations. Simulated experiments with 10 clients on RouterBench-Data and ProxRouter-Data show federated routers achieve performance on par with centralized training (AUC ≈0.75) while substantially outperforming client-local routers (individual client AUC gains of 0.03–0.20). Both parametric MLP-Router and nonparametric K-Means-Router benefit from federation, with K-Means showing larger relative gains due to its dependence on per-centroid model coverage. Theoretical analysis via sample complexity bounds formally characterizes why federation reduces routing suboptimality through improved query and model coverage.",
    493   "red_flags": [
    494     {
    495       "flag": "No error bars or significance tests",
    496       "detail": "All AUC values and accuracy-cost curves are single-run point estimates with no confidence intervals, standard deviations, or statistical tests; differences of 0.02–0.05 AUC may be within run-to-run noise."
    497     },
    498     {
    499       "flag": "Simulation designed to favor federation",
    500       "detail": "Dirichlet partitioning creates complementary client distributions by construction—clients have non-overlapping task specializations—which is the ideal setup for federated aggregation and may not reflect real enterprise client data."
    501     },
    502     {
    503       "flag": "No code or random seeds released",
    504       "detail": "Without code or random seeds for the stochastic Dirichlet partitioning and model assignment, the exact experimental setup cannot be reproduced despite detailed hyperparameter reporting."
    505     },
    506     {
    507       "flag": "Only 10 simulated clients at fixed scale",
    508       "detail": "All experiments use exactly N=10 clients; the paper does not study how performance scales with client count, communication rounds, or data size, which are critical practical parameters."
    509     },
    510     {
    511       "flag": "No real-world deployment validation",
    512       "detail": "All experiments are offline simulations on existing benchmark evaluation data; there is no real federated deployment, case study with actual privacy constraints, or evaluation of communication overhead."
    513     }
    514   ],
    515   "cited_papers": [
    516     {
    517       "title": "RouterBench: A Benchmark for Multi-LLM Routing System",
    518       "relevance": "Primary benchmark dataset providing 11 LLMs evaluated over 8 datasets; the main experimental data for all federated routing experiments"
    519     },
    520     {
    521       "title": "Communication-Efficient Learning of Deep Networks from Decentralized Data (FedAvg)",
    522       "relevance": "Foundation of the federated framework; MLP-Router training uses FedAvg (Algorithm 1) directly"
    523     },
    524     {
    525       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    526       "relevance": "Key prior work on parametric LLM routing; directly relevant as the centralized analogue to this work"
    527     },
    528     {
    529       "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing",
    530       "relevance": "Related parametric router establishing the accuracy-cost tradeoff formulation used throughout this paper"
    531     },
    532     {
    533       "title": "ProxRouter: Proximity-Weighted LLM Query Routing for Improved Robustness to Outliers",
    534       "relevance": "Second benchmark dataset (ProxRouter-Data) used for all appendix experiments; nonparametric routing baseline"
    535     },
    536     {
    537       "title": "Universal LLM Routing with Correctness-Based Representation",
    538       "relevance": "Related K-Means routing approach; the federated K-Means-Router extends this nonparametric paradigm"
    539     },
    540     {
    541       "title": "Advances and Open Problems in Federated Learning",
    542       "relevance": "Comprehensive FL survey providing the statistical heterogeneity framing and open problems that motivate the paper"
    543     },
    544     {
    545       "title": "Tackling the Objective Inconsistency Problem in Heterogeneous Federated Optimization",
    546       "relevance": "Convergence analysis framework used directly in Theorem 5.1 and Proposition G.4 for federated MLP-Router"
    547     }
    548   ],
    549   "engagement_factors": {
    550     "practical_relevance": {
    551       "score": 3,
    552       "justification": "Directly addresses a real enterprise pain point—routing LLM API calls across providers while keeping proprietary query data local—with a deployable algorithmic framework."
    553     },
    554     "surprise_contrarian": {
    555       "score": 2,
    556       "justification": "Reframing LLM routing as a federated learning problem (training the router, not the LLM) is a non-obvious and novel framing that opens a new research direction."
    557     },
    558     "fear_safety": {
    559       "score": 1,
    560       "justification": "Privacy motivation addresses data sensitivity concerns in enterprise settings but is framed as a practical constraint rather than a safety risk."
    561     },
    562     "drama_conflict": {
    563       "score": 1,
    564       "justification": "Incremental technical contribution extending FL to routing; no controversial claims or conflicts with mainstream approaches."
    565     },
    566     "demo_ability": {
    567       "score": 1,
    568       "justification": "No code released; practitioners cannot immediately try the system, though the benchmark data (RouterBench-Data) is publicly available."
    569     },
    570     "brand_recognition": {
    571       "score": 1,
    572       "justification": "Carnegie Mellon University is well-regarded but not a hyperscaler lab; no well-known commercial products evaluated or involved."
    573     }
    574   },
    575   "hn_data": {
    576     "threads": [
    577       {
    578         "hn_id": "46925532",
    579         "title": "Convergent Discovery of Critical Phenomena Mathematics Across Disciplines",
    580         "points": 4,
    581         "comments": 3,
    582         "url": "https://news.ycombinator.com/item?id=46925532",
    583         "created_at": "2026-02-07T17:16:44Z"
    584       }
    585     ],
    586     "top_points": 4,
    587     "total_points": 4,
    588     "total_comments": 3
    589   }
    590 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs