scan.json (26484B)
1 { 2 "paper": { 3 "title": "PersonalizedRouter: Personalized LLM Routing via Graph-based User Preference Modeling", 4 "authors": ["Zhongjie Dai", "Tao Feng", "Jiaxuan You"], 5 "year": 2025, 6 "venue": "Transactions on Machine Learning Research", 7 "arxiv_id": "2511.16883", 8 "doi": "10.48550/arXiv.2511.16883" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "PersonalizedRouter, a graph-based framework for personalized LLM routing, outperforms existing LLM selection baselines by 9.83-15.38% on small-scale experiments and 16.19-59.69% on a 1,000-user benchmark (PersonaRoute-Bench). The framework demonstrates few-shot generalization to new users and new LLMs, achieving 64.81-96.01% and 85.90% of fully trained performance respectively. A small real-user validation (40 users) shows 6.05% improvement over the best baseline, though the evaluation relies heavily on simulated users with fixed preferences.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper provides a GitHub repository link (ulab-uiuc/PersonalizedRouter) and a Hugging Face Collection link in the abstract section." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper references a Hugging Face Collection for PersonaRoute-Bench, and the source datasets (Alpaca, GSM8K, SQuAD, Multi-News) are all publicly available." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "Section 4.4 mentions PyTorch and PyG on an NVIDIA A6000 48GB GPU, but no version numbers for frameworks, no requirements.txt, and no environment specification file." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper provides implementation details (Section 4.4) and a code link, but no step-by-step reproduction instructions, no commands to run, and no 'Reproducing Results' section in the paper itself." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 3-8 are reported as single point estimates with no confidence intervals, error bars, or ± notation." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims improvements (e.g., 15.38%, 9.83%) by comparing raw numbers without any statistical significance tests (no p-values, t-tests, or other tests)." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Tables report both absolute scores and percentage improvements with baseline context (e.g., PersonalizedRouter 0.255 vs GraphRouter 0.221, 15.38% improvement in Table 3), providing enough context to judge effect magnitude." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for the choice of 600 cases per dataset, 9 users in small-scale, 1,000 simulated users in large-scale, or 40 real users. No power analysis discussed." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviation, variance, or any spread measure reported across experimental runs. All results appear to be single-run numbers." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Four baselines are compared: HybridLLM, FrugalGPT, GraphRouter, RouterDC, plus an Oracle upper bound (Section 4.3)." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include GraphRouter (2024), RouterDC (2024), HybridLLM (2024), and FrugalGPT (2023), all recent and representative of the state of the art in LLM routing." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 5.6 presents an ablation of GNN depth (0-5 layers), showing that the GNN component is necessary (0 layers performs worst) and 2-3 layers is optimal." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Two main metrics are used: Reward score (for multi-cost-efficiency simulation) and Accuracy (for LLM-as-a-Judge strategy). Table 4 also reports time cost." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section 5.5 includes a Human-as-a-Judge scenario with 40 real users selecting preferred LLM responses across 80 queries, validating the router against real user preferences." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Section 4.2 describes a 70:10:20 train/validation/test split, with results reported on the held-out test set." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": false, 98 "justification": "Results are reported as aggregate scores per method. No per-task-dataset breakdown (e.g., performance on Alpaca vs GSM8K vs SQuAD vs Multi-News separately) is provided." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "No error analysis or qualitative examples of routing failures. The paper does not discuss specific cases where PersonalizedRouter made incorrect routing decisions or why." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 5.6 reports that deeper GNN layers (>3) degrade performance due to over-smoothing. 
Table 6 shows that the few-shot new-user model underperforms the best baseline by 30.69% under multi-cost-efficiency." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims of 15.38% and 9.83% improvement are supported by Table 3. Claims of 16.19% and 59.69% on PersonaRoute-Bench are supported by Table 4. Few-shot generalization numbers match Tables 6-7." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's causal claims about GNN depth (Section 5.6) use controlled ablation. The main comparative claims are supported by controlled experiments on the same data with the same evaluation protocol." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper claims 'strong generalization to real-user scenarios' (Section 5.5) based on only 40 users with 80 queries. The title implies general 'Personalized LLM Routing' but evaluation is limited to 4 task types, simulated preferences, and a narrow set of candidate LLMs." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for PersonalizedRouter's improvements. It does not consider whether the gains come from the graph structure vs. heterogeneous node features vs. simply having user-specific parameters." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper measures routing accuracy on simulated user preferences (reward scores and LLM-judge labels) and frames this as 'personalized LLM selection' for real users. The gap between simulated preferences and actual user satisfaction is acknowledged only briefly in the limitations section."
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Candidate LLMs are listed by family and size (e.g., 'LLaMA-3 (8B)', 'Mixtral-8x7B') without exact versions or snapshot dates. Judge models ('DeepSeek-V3', 'Kimi k2', 'Llama-3.3 70B') also lack version specifics." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "LLM judge instruction prompts are provided in full in Tables 10-11 (Appendix A.3), and user profiles used as system prompts are listed in Table 12 (Appendix A.4)." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "GNN training hyperparameters are well-reported (Section 4.4: 2-layer GAT, hidden dim 32, batch 32, 400 epochs, Adam, LambdaLR from 1e-3 to 0), but LLM API settings (temperature, top-p, max tokens) for generating candidate responses are not stated." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "The paper does not use agentic scaffolding. PersonalizedRouter is a routing framework, not an agentic system." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.2 documents the interaction dataset construction: uniform sampling of 600 cases per dataset, LLM response collection, reward calculation formula, judge labeling procedure, and train/val/test splitting (70:10:20)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7 is titled 'Limitations' and discusses the gap between simulated and real user behavior and the lack of an explicit utility function." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "Section 7 states 'user behavior is often more complex, potentially involving a mixture of preferences and evolving over time' and suggests future work on utility functions. These are generic observations rather than specific threats to the validity of the current results." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what its results do NOT show. No specific exclusions are listed regarding task types, LLM categories, or user populations where the approach may fail." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper provides a Hugging Face Collection link for PersonaRoute-Bench and a GitHub repository link. Source datasets (Alpaca, GSM8K, SQuAD, Multi-News) are all publicly available." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.2 describes the data collection process: sampling queries from 4 datasets, generating LLM responses via API, computing reward scores or collecting judge labels, with formulas and procedures documented." 192 }, 193 "recruitment_methods_described": { 194 "applies": true, 195 "answer": false, 196 "justification": "For the human study (Section 5.5), the paper states only 'we recruited 40 users' without describing recruitment channels, eligibility criteria, or potential selection bias." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "While the overall construction procedure is described, specific filtering steps and counts are missing. The paper does not explain the sampling procedure for selecting 600 cases from each dataset or whether any LLM responses were excluded." 
202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "The acknowledgments section states: 'We sincerely appreciate the research gift from Lenovo that made this project possible.'" 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are affiliated with the University of Illinois at Urbana-Champaign, clearly stated in the paper header. No conflict with the evaluated products." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "Lenovo provided a research gift but has no direct financial stake in which LLM router performs best. No Lenovo products are being evaluated in the experiments." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is included in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the candidate LLMs (LLaMA-3, Mixtral, Qwen, etc.) used to generate responses on benchmark datasets." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether candidate LLMs were trained on the evaluation datasets (GSM8K published 2021, SQuAD published 2016 — both likely in training data of modern LLMs)." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "GSM8K (2021) and SQuAD (2016) are well-known benchmarks available online before the training of all candidate LLMs. No contamination analysis or discussion is provided, though contamination could distort the interaction data quality." 
241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": true, 246 "answer": false, 247 "justification": "No mention of pre-registration for the 40-user human study in Section 5.5." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": true, 251 "answer": false, 252 "justification": "No mention of IRB or ethics board approval for the human study involving 40 participants." 253 }, 254 "demographics_reported": { 255 "applies": true, 256 "answer": false, 257 "justification": "The paper states '40 users' without reporting any demographic information (experience level, background, age, etc.)." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": true, 261 "answer": false, 262 "justification": "No inclusion or exclusion criteria are stated for the 40 human participants. The paper says only 'we recruited 40 users.'" 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "This is a preference elicitation study where users select their preferred response, not an experiment with random assignment to treatment conditions." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "This is a preference selection study, not a blinded experiment. Users see all 10 LLM responses without condition assignments to blind." 273 }, 274 "attrition_reported": { 275 "applies": true, 276 "answer": false, 277 "justification": "No information on how many of the 40 recruited users completed the full questionnaire or whether any responses were excluded." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 4 reports time costs for each method (e.g., PersonalizedRouter: 10:15 under multi-cost-efficiency, 11:37 under LLM-as-a-Judge) with reduction percentages. LLM cost per million tokens is listed in Tables 16-18." 
285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "The paper mentions using an NVIDIA A6000 48GB GPU (Section 4.4) but does not state total GPU hours, total API spend, or aggregate computational budget across all experiments." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No results across multiple random seeds are reported. All results appear to be from single runs with no seed sensitivity analysis." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs producing the reported results is never stated anywhere in the paper." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Final hyperparameters are reported (Section 4.4) but no hyperparameter search budget, search method, or number of configurations tried is disclosed." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "GNN depth is justified via ablation (Figure 2), but other hyperparameter choices (hidden dim 32, batch size 32, learning rate 1e-3) are not justified or shown to be selected on a validation set." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper makes many comparisons across methods and settings without any statistical tests at all, let alone corrections for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "GraphRouter, a key baseline, is prior work by the same research group (Feng et al., 2024b, same PI Jiaxuan You). The bias of evaluating their own system against their own prior baseline is not acknowledged." 
322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Table 4 reports both performance (Reward/Accuracy) and time cost with reduction percentages for all methods, allowing performance-at-compute comparison. PersonalizedRouter achieves 96.63% time reduction vs RouterDC." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "PersonaRoute-Bench uses simulated users with fixed α/β weight pairs or LLM-generated profiles. The paper does not discuss whether these simulated preferences adequately measure real-world personalized routing capability." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved in the routing evaluation. The router directly predicts LLM assignments without agentic scaffolding." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether candidate LLMs' training data includes the benchmark datasets used for evaluation (GSM8K from 2021, SQuAD from 2016)." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information, such as interaction data structure embedding answer-relevant information." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of potential non-independence between training and test interaction data, which are drawn from the same users and datasets with random splitting." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied in any of the experiments." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "PersonalizedRouter outperforms existing LLM selection methods by 15.38% (multi-cost-efficiency) and 9.83% (LLM-as-a-Judge) in the standard setting with 10 LLMs and 9 users.", 365 "evidence": "Table 3 (Section 5.1): Reward 0.255 vs GraphRouter 0.221 (15.38%); Accuracy 0.447 vs RouterDC 0.407 (9.83%).", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "On PersonaRoute-Bench with 1,000 users, PersonalizedRouter surpasses the best baseline by 16.19% and 59.69% under two simulation strategies.", 370 "evidence": "Table 4 (Section 5.2): Reward 0.244 vs RouterDC 0.210 (16.19%); Accuracy 0.313 vs HybridLLM 0.196 (59.69%).", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "PersonalizedRouter demonstrates few-shot generalization, achieving 64.81% and 85.90% of the fully trained model's performance when adapting to new users and new LLMs.", 375 "evidence": "Table 6 (Section 5.3): new users multi-cost 0.07/0.108 = 64.81%. Table 7 (Section 5.4): new LLMs multi-cost 0.201/0.234 = 85.90%.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "PersonalizedRouter outperforms the best baseline by 6.05% on a small-scale real-user dataset with 40 users.", 380 "evidence": "Table 8 (Section 5.5): Accuracy 0.438 vs RouterDC 0.413 (6.05%). Only 40 users with 80 queries.", 381 "supported": "weak" 382 }, 383 { 384 "claim": "GNN depth of 2-3 layers is optimal; deeper networks suffer from over-smoothing that degrades prediction performance.", 385 "evidence": "Figure 2 (Section 5.6): performance peaks at 2-3 layers and declines with 4-5 layers under both simulation strategies.", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Simulated user evaluation", 392 "detail": "The vast majority of evaluation uses simulated users with fixed preference parameters (α/β weight pairs or LLM-generated profiles). 
The real-user validation has only 40 users with 80 queries — too small to support claims of 'strong generalization to real-user scenarios.'" 393 }, 394 { 395 "flag": "No error bars or significance tests", 396 "detail": "All results are single-run point estimates with no uncertainty quantification. The claimed improvements (e.g., 15.38%, 9.83%) could be within random noise, particularly given the small scale of some experiments." 397 }, 398 { 399 "flag": "Self-comparison with prior work from same group", 400 "detail": "GraphRouter, a key baseline, is prior work by the same research group (same PI Jiaxuan You). The authors implement both their own method and this baseline, introducing potential self-comparison bias per Lucic et al. (2018)." 401 }, 402 { 403 "flag": "Selective framing of few-shot results", 404 "detail": "The abstract reports 64.81% for new-user generalization (multi-cost-efficiency), but under the same setting, the few-shot model underperforms the best baseline by 30.69% (Table 6). The LLM-as-a-Judge setting shows 96.01%, which is much stronger but not highlighted in the abstract." 405 }, 406 { 407 "flag": "No contamination discussion for standard benchmarks", 408 "detail": "GSM8K (2021) and SQuAD (2016) are well-known benchmarks likely present in the training data of modern LLMs. Contamination could affect LLM performance scores used to construct interaction data, but this is never discussed." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 414 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 415 "year": 2023, 416 "arxiv_id": "2305.05176", 417 "relevance": "Foundational LLM cost-optimization routing method using response scoring under cost budgets."
418 }, 419 { 420 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 421 "authors": ["Dujian Ding", "Ankur Mallick", "Chi Wang"], 422 "year": 2024, 423 "arxiv_id": "2404.14618", 424 "relevance": "Binary LLM routing method balancing cost and quality, key baseline for LLM selection research." 425 }, 426 { 427 "title": "GraphRouter: A Graph-based Router for LLM Selections", 428 "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"], 429 "year": 2024, 430 "relevance": "GNN-based LLM selection using heterogeneous graphs, direct predecessor to PersonalizedRouter." 431 }, 432 { 433 "title": "RouterDC: Query-based Router by Dual Contrastive Learning for Assembling Large Language Models", 434 "authors": ["Shuhao Chen", "Weisen Jiang", "Baijiong Lin"], 435 "year": 2024, 436 "relevance": "Embedding-based LLM routing using cosine similarity, competitive baseline in LLM selection." 437 }, 438 { 439 "title": "RouteLLM: Learning to Route LLMs with Preference Data", 440 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"], 441 "year": 2024, 442 "arxiv_id": "2406.18665", 443 "relevance": "LLM routing using real user preference data from Chatbot Arena with matrix factorization." 444 }, 445 { 446 "title": "PolyRouter: A Multi-LLM Querying System", 447 "authors": ["Dimitris Stripelis", "Zijian Hu", "Jipeng Zhang"], 448 "year": 2024, 449 "arxiv_id": "2408.12320", 450 "relevance": "Explores multiple routing strategies (KNN, MLP) for multi-LLM query assignment." 451 }, 452 { 453 "title": "Cost-effective Online Multi-LLM Selection with Versatile Reward Models", 454 "authors": ["Xiangxiang Dai", "Jin Li", "Xutong Liu"], 455 "year": 2024, 456 "arxiv_id": "2405.16587", 457 "relevance": "Bandit-based LLM routing with exploration-exploitation balance for online LLM selection."
458 }, 459 { 460 "title": "DeepSeek-V3 Technical Report", 461 "authors": ["DeepSeek-AI"], 462 "year": 2025, 463 "arxiv_id": "2412.19437", 464 "relevance": "Major open LLM technical report; used as judge model in PersonalizedRouter evaluation." 465 }, 466 { 467 "title": "PREMIUM: LLM Personalization with Individual-Level Preference Feedback", 468 "authors": ["Yihang Sun", "Tao Feng", "Ge Liu", "Jiaxuan You"], 469 "year": 2025, 470 "relevance": "Related work on LLM personalization through individual preference modeling from the same research group." 471 }, 472 { 473 "title": "FusionFactory: Fusing LLM Capabilities with Multi-LLM Log Data", 474 "authors": ["Tao Feng", "Haozhen Zhang", "Zijie Lei"], 475 "year": 2025, 476 "arxiv_id": "2507.10540", 477 "relevance": "Multi-LLM capability fusion approach from the same group, related to LLM selection and routing." 478 } 479 ] 480 }