scan.json (30924B)
1 { 2 "paper": { 3 "title": "MMR-Bench: A Comprehensive Benchmark for Multimodal LLM Routing", 4 "authors": ["Haoxuan Ma", "Guannan Lai", "Han-Jia Ye"], 5 "year": 2026, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2601.17814", 8 "doi": "10.48550/arXiv.2601.17814" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "MMR-Bench provides an offline, cost-aware benchmark for evaluating routing strategies across 10 multimodal LLMs on 7 vision-language benchmarks (11,000 instances). The paper demonstrates that multimodal routing signals consistently outperform unimodal variants, that a routed system can match the strongest single model's accuracy at roughly 33% of its cost, and that matrix-factorization-based routers offer the most robust cross-dataset performance. Routing policies trained on multimodal data transfer zero-shot to text-only benchmarks (GSM8K, MMLU, ARC) without retraining.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The abstract states 'The code will be available at: https://github.com/Hunter-Wrynn/MMR-Bench' — this is a promise of future release, not a working URL at time of publication." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper's key data artifact — the offline outcome table with precomputed utilities and costs for each instance–model pair (over 100k pairs) — is not released. The underlying benchmarks (OCRBench, MMStar, etc.) are public, but the precomputed outcome table that enables offline routing evaluation is only promised as part of the future code release." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, conda environment, or detailed dependency listing is provided. 
Section D mentions using CLIP encoders and standard ML libraries but does not specify versions or provide reproducible environment specifications." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The paper describes methodology conceptually and provides implementation details in the appendix, but there are no concrete commands or scripts to reproduce the results." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Table 2 and all other results tables report only point estimates (e.g., nAUC 0.7042, Ps 0.7533) with no confidence intervals, error bars, or ± notation. The paper notes 'results for stochastic routers are averaged over multiple runs' (Sec. 5.1) but provides no uncertainty measures." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes comparative claims (e.g., 'LinearMFRouter attains the highest nAUC') based solely on comparing numbers in tables. No statistical significance tests (p-values, t-tests, bootstrap, etc.) are reported anywhere." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 2 reports absolute performance values (nAUC, Ps) for all methods alongside baselines (Random, Best Single Model, Oracle), providing sufficient context to assess magnitudes. Table 3 reports explicit deltas (e.g., ΔnAUC = +0.3403 for KMeans). The '33% of cost' claim is contextualized against the strongest single model." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The benchmark comprises 11,000 instances across 7 datasets and 10 models, but no justification is given for why this size is sufficient. No power analysis or discussion of whether the sample allows for detecting meaningful routing differences." 
57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Section 5.1 states 'results for stochastic routers are averaged over multiple runs' but no standard deviations, interquartile ranges, or spread measures are reported in any table or figure. The reader cannot assess result stability." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper includes comprehensive baselines: Random (lower bound), Oracle (upper bound), Best Single Model, and four router families (Linear, MLP, KNN, KMeans) with variants, totaling 10 baseline methods (Table 1)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The model zoo includes state-of-the-art models (GPT-5, Gemini 2.5, Claude 3.7 Sonnet, InternVL3, Qwen2.5-VL). Routing baselines reference contemporary work (RouterBench 2024, RouterEval 2025, EmbedLLM 2024). Table 1 compares against recent routing benchmarks." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 6 provides systematic ablations: (1) adaptive vs equal-weight fusion (Table 3), (2) text-only vs image-only vs multimodal inputs (Figure 2), (3) within-scenario cross-dataset transfer (Table 4), and (4) cross-modality transfer (Table 5). Each isolates a specific factor." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Three complementary metrics are defined and reported: nAUC (normalized area under cost-performance curve), Peak Score (Ps), and Quality-Neutral Cost (QNC). All three are formally defined in Section 4.3 and reported across experiments." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "The paper evaluates routing strategies on benchmarks with objective ground-truth answers (accuracy, scoring). 
Human evaluation is irrelevant to the claims about cost-accuracy trade-offs in automated routing." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Section 5.1 states 'frozen 2:8 train/test splits. Routers are trained on the training split, test-time evaluation is purely offline by applying the learned router to benchmark features and indexing into the fixed {u_i,j, c_i,j}.'" 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 2 provides per-dataset breakdowns across all 7 datasets grouped into 3 scenarios (OCR, General VQA, Math Reasoning). Table S4 provides per-dataset QNC. Figure S1 shows per-dataset Pareto frontiers." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 3.2 (Q1) discusses where text-only routing fails: 'text-in-image queries, dense OCR, charts, spatial reasoning, and low-resolution crops.' Section F provides qualitative case studies. Table 3 shows that the KNN router's performance regresses under adaptive fusion (ΔnAUC = −0.0074). Table S4 shows QNC = ∞ for several methods." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table 3 reports negative results: KNN shows mild regression with adaptive fusion (ΔnAUC = −0.0074). QNC = +∞ for KNN on multiple datasets (Table S4), meaning it cannot reach target accuracy. The paper also reports that text-only routing fails on vision-governed tasks." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims are supported: (1) multimodal signals improve routing — supported by Figure 2, Table 2; (2) routed system matches strongest model at ~33% cost — supported by Figure 4 Pareto frontiers; (3) zero-shot generalization to new datasets and text-only benchmarks — supported by Tables 4 and 5."
116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's causal claims ('incorporating multimodal signals improves routing quality') are supported by controlled comparisons: Figure 2 compares text-only, image-only, and multimodal routers under identical model sets and cost accounting, isolating the modality signal as the only variable. Table 3 compares adaptive vs equal-weight fusion with all else held constant." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section 3.1 formally bounds scope to 'the two-modality case (text and image).' The conclusion states the benchmark covers specific routing scenarios. Specific claims reference the tested models and datasets. Though the title says 'Comprehensive,' the formal problem definition and experimental claims are appropriately scoped." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "Section 6 provides explanations for why their approach works (modality-agnostic difficulty cues, fusion-induced regularization, cost-aware calibration) but does not consider alternative explanations that might account for the observed improvements, such as whether the gains come from better feature engineering rather than genuine multimodal signals, or whether dataset-specific artifacts drive results." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures accuracy/utility and normalized cost — exactly what it claims to evaluate (cost-accuracy trade-offs in routing). The metrics (nAUC, Ps, QNC) are precisely defined in Section 4.3. No proxy gap exists between measurements and claims." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Section C.1 lists models: GPT-5-0807 and GPT-5-Nano-0807 include date suffixes, but 'Claude 3.7 Sonnet,' 'Gemini 2.5 Pro,' 'Gemini 2.5 Flash,' 'InternVL3-78B,' 'Qwen2.5-VL-3B/7B/72B,' and 'Gemma3-4B' lack specific version identifiers or snapshot dates. Marketing names without API versions are insufficient per the schema." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "Section B.2 describes prompt construction ('MCQ datasets include the question, candidate options, and an explicit selection instruction') but does not provide the actual prompt text. It says inputs are formatted using each model's 'official chat template' without showing the templates. The reader cannot reconstruct exact prompts." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Router hyperparameters are partially described in Section D (fusion temperature η=5.0, ridge regression, SVD rank). However, critical MLLM inference hyperparameters (temperature, top-p, max tokens) are not reported. The paper only notes that 'thinking' traces were disabled for some models." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The routing system is a direct feature-extraction → model-selection pipeline, not an agentic workflow." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section B.2 documents preprocessing: prompt construction formats for MCQ vs VQA datasets, image preprocessing (resize/interpolate/pad per model's official implementation), token logging for cost accounting, and use of VLMEvalKit framework." 
163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. The conclusion (Section 7) briefly summarizes contributions but does not discuss limitations substantively." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of specific threats like benchmark selection bias, model version sensitivity, or the offline evaluation assumption." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "While Section 3.1 scopes the problem to two modalities (text and image), the paper does not explicitly state what the results do NOT show. No discussion of excluded settings (video, audio, >2 modalities), populations (different deployment scenarios), or claims the authors are NOT making." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "The raw outcome table (per-instance utilities and costs for each model) is not released. Only aggregated results are shown in tables and figures. The code and data are promised for future release." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.2 describes dataset selection with specific benchmarks named. Section B describes preprocessing. Section C describes cost computation methodology. The paper explains how model outputs were collected using VLMEvalKit with standardized inputs." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public benchmarks (OCRBench, MMStar, MathVista, etc.)." 
197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is documented: benchmark selection (Section 4.2) → model evaluation with VLMEvalKit (Section B.2) → cost normalization (Section C.2, Equation S1) → outcome table construction → frozen train/test splits → router training and evaluation. Each step is described with the methodology clearly laid out." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding sources, acknowledgments, or grant numbers are mentioned anywhere in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: all three authors are from the School of Artificial Intelligence and National Key Laboratory for Novel Software Technology at Nanjing University. They are not affiliated with the companies whose models they evaluate." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence of funders cannot be assessed. The absence of a funding disclosure does not confirm the absence of funding." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement or financial interest disclosure is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses 10 MLLMs including GPT-5, Gemini 2.5, Claude 3.7, and various open-weight models but does not state training data cutoff dates for any of them." 
231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The paper evaluates pre-trained models on public benchmarks (some dating to 2023-2024) without any discussion of whether the test examples appeared in the models' training data." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "Several benchmarks (OCRBench 2024, MathVista 2023, MathVerse 2024, SEED-Bench 2023) were published before the likely training cutoffs of models like GPT-5 and Gemini 2.5. This contamination risk is not discussed. Contamination would affect the utility estimates that form the basis of all routing evaluations." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table S2 reports per-model output token prices ($/1M tokens) from OpenRouter. Cost normalization is formally defined (Equation S1). 
The entire evaluation framework is cost-aware with normalized costs reported throughout." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "The total computational budget for collecting model outputs across 11,000 instances × 10 models is not stated. No mention of total API spend, GPU hours, or wall-clock time for the data collection process." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Section 5.1 states 'results for stochastic routers are averaged over multiple runs' but does not report seed sensitivity analysis or variation across seeds." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper says 'averaged over multiple runs' (Section 5.1) but does not specify the exact number of runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Router hyperparameters (k for clusters/neighbors, λ for cost weight, SVD rank, MLP architecture) appear tuned but no search budget (number of configurations tried, search method, compute spent) is reported." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The paper sweeps over the number of clusters k and cost weight λ to generate a family of routing policies and reports the Pareto frontier (Section 3.2). This reports all operating points rather than cherry-picking a single best configuration. Routers are trained on training splits and evaluated on held-out test splits." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Multiple comparisons are made across 7 datasets, 3 scenarios, and multiple routing methods. No statistical tests are performed at all, let alone corrections for multiple comparisons." 
317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors propose MMR-Bench and evaluate routing methods on it. They do not acknowledge the potential bias of designing and evaluating on their own benchmark, nor discuss whether independent evaluation might yield different conclusions." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "The entire evaluation framework reports performance as a function of normalized cost. Figure 4 and Figure S1 plot cost-accuracy Pareto frontiers. The QNC metric explicitly measures cost relative to the strongest single model. Section 5.3 discusses cost efficiency." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper motivates why multimodal routing benchmarks are needed (Sections 1 and 2) and compares against existing benchmarks (Table 1), but does not critically examine whether MMR-Bench actually measures routing quality as claimed. It does not discuss potential validity gaps, such as whether offline precomputed utilities accurately reflect real-world routing decisions." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. Models are evaluated directly with standardized prompts and decoding settings. The routing system is a feature-extraction → selection pipeline, not an agentic scaffold." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The paper does not discuss temporal leakage. Several benchmarks (MathVista 2023, SEED-Bench 2023, OCRBench 2024) were published well before the training cutoffs of models like GPT-5 (2025+), meaning models may have encountered benchmark solutions during training. This would distort the utility estimates that underpin all routing evaluations."
344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup could leak answer information. For example, MCQ datasets include answer options in the prompt, which may provide signal about the correct answer." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether examples across the 7 benchmarks share structural similarities or whether train/test splits within datasets maintain independence (this is inherited from the original benchmarks without verification)." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention methods are used. No canary strings, membership inference, n-gram overlap analysis, or temporal split verification." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Multimodal signals improve routing quality compared to unimodal signals (text-only or image-only).", 365 "evidence": "Figure 2 shows multimodal router consistently outperforms text-only and image-only variants across cost levels. Section 3.2 (Q2) demonstrates the gap widens in settings where visual clutter or linguistic complexity dominates. Table 3 shows adaptive fusion gains for most router families.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "The routed system can match or exceed the strongest single model's accuracy at roughly 33% of its cost.", 370 "evidence": "Figure 4 shows the Pareto frontier of routing strategies dominating single-model baselines. 
Section 5.3 states 'with only 33% of the strongest single model's cost, our router already matches its accuracy.'", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Routing policies trained on a subset of models and tasks generalize zero-shot to new datasets and text-only benchmarks.", 375 "evidence": "Table 4 shows cross-dataset routing within scenarios consistently beats the best single model (e.g., Math Ps 0.7914 vs 0.7592). Table 5 shows multimodal-trained router exceeding best single model on GSM8K (96.7 vs 94.5), MMLU (92.4 vs 91.2), and ARC (66.7 vs 65.7).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Matrix-factorization-based routers achieve the most robust performance across heterogeneous workloads.", 380 "evidence": "Table 2 shows LinearMFRouter achieves highest full-dataset nAUC (0.7042) and Ps (0.7533). Section 5.3 states MF routers are 'consistently superior on general VQA' and have better cross-category robustness.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Adaptive fusion reveals a clear modality gap between image and text signals.", 385 "evidence": "Table 3 shows large gains for KMeans (ΔnAUC +0.3403) and consistent gains for MLP and Linear routers. QNC improves from +∞ to finite values for KMeans and MLP. However, KNN shows mild regression (ΔnAUC −0.0074).", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No variance or error bars reported", 392 "detail": "All results (Tables 2, 3, 4, 5, S4) report only point estimates despite stating results are 'averaged over multiple runs.' The reader cannot assess whether observed differences are meaningful or within noise. This is a Big Numbers No Error Bars pattern." 393 }, 394 { 395 "flag": "No statistical significance testing", 396 "detail": "Claims of method superiority (e.g., 'LinearMFRouter attains the highest nAUC') are based on comparing raw numbers without any statistical tests. 
With unknown variance, observed differences could be within noise." 397 }, 398 { 399 "flag": "Benchmark contamination risk unaddressed", 400 "detail": "The utility estimates that form the basis of all routing evaluations depend on model accuracy on public benchmarks. Models like GPT-5 may have been trained on benchmark data (MathVista 2023, SEED-Bench 2023), which would distort utility values and consequently routing decisions. This fundamental threat is not discussed." 401 }, 402 { 403 "flag": "No limitations section", 404 "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries. Key unaddressed limitations include: the offline evaluation assumption (real routing involves dynamic model updates), the small model zoo (10 models), and the restriction to text+image modalities despite the 'Comprehensive' title." 405 }, 406 { 407 "flag": "Artifacts not yet available", 408 "detail": "Code, outcome tables, and scoring scripts are promised ('The code will be available at...') but not released. The paper's reproducibility claims cannot be verified." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "GPT-4 technical report", 414 "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"], 415 "year": 2023, 416 "arxiv_id": "2303.08774", 417 "relevance": "Foundational commercial LLM cited as background (not one of the 10 models evaluated in the routing benchmark); relevant to survey scope on LLM capabilities." 418 }, 419 { 420 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 421 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 422 "year": 2023, 423 "arxiv_id": "2305.05176", 424 "relevance": "Early work on cost-efficient LLM usage through model routing and cascading, directly relevant to AI deployment methodology."
425 }, 426 { 427 "title": "AutoMix: Automatically mixing language models", 428 "authors": ["Pranjal Aggarwal", "Aman Madaan"], 429 "year": 2024, 430 "relevance": "Dynamic model mixing for LLMs; relevant to efficient agentic AI deployment and model selection." 431 }, 432 { 433 "title": "RouterBench: A benchmark for multi-LLM routing system", 434 "authors": ["Qitian Jason Hu", "Jacob Bieker", "Xiuyu Li"], 435 "year": 2024, 436 "arxiv_id": "2403.12031", 437 "relevance": "Text-only LLM routing benchmark; direct predecessor to MMR-Bench, relevant to evaluation methodology for LLM systems." 438 }, 439 { 440 "title": "RouterEval: A comprehensive benchmark for routing LLMs to explore model-level scaling up in LLMs", 441 "authors": ["Zhongzhan Huang", "Guoming Ling", "Yupei Lin"], 442 "year": 2025, 443 "arxiv_id": "2503.10657", 444 "relevance": "Contemporary text-only routing benchmark; relevant to evaluation methodology for LLM model selection." 445 }, 446 { 447 "title": "EmbedLLM: Learning compact representations of large language models", 448 "authors": ["Richard Zhuang", "Tianhao Wu", "Zhaojin Wen"], 449 "year": 2024, 450 "arxiv_id": "2410.02223", 451 "relevance": "LLM representation learning for model selection; relevant to efficient LLM deployment and routing." 452 }, 453 { 454 "title": "Capability instruction tuning: A new paradigm for dynamic LLM routing", 455 "authors": ["Yi-Kai Zhang", "De-Chuan Zhan", "Han-Jia Ye"], 456 "year": 2025, 457 "relevance": "Dynamic LLM routing through capability-aware instruction tuning; directly relevant to agentic AI deployment." 458 }, 459 { 460 "title": "Large language model routing with benchmark datasets", 461 "authors": ["Tal Shnitzer", "Anthony Ou", "Mírian Silva"], 462 "year": 2023, 463 "arxiv_id": "2309.15789", 464 "relevance": "Early work on LLM routing using benchmark performance data; foundational for the model selection problem." 
465 }, 466 { 467 "title": "MoE-LLaVA: Mixture of experts for large vision-language models", 468 "authors": ["Bin Lin", "Zhenyu Tang", "Yang Ye"], 469 "year": 2024, 470 "arxiv_id": "2401.15947", 471 "relevance": "Mixture-of-experts architecture for multimodal LLMs; relevant to efficient MLLM deployment and intra-model routing." 472 }, 473 { 474 "title": "GraphRouter: A graph-based router for LLM selections", 475 "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"], 476 "year": 2024, 477 "arxiv_id": "2410.03834", 478 "relevance": "Graph-based approach to LLM routing; relevant to model selection methodology for AI systems." 479 }, 480 { 481 "title": "LLM-Blender: Ensembling large language models with pairwise ranking and generative fusion", 482 "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"], 483 "year": 2023, 484 "arxiv_id": "2306.02561", 485 "relevance": "LLM ensembling approach; relevant to multi-model AI systems and output selection." 486 }, 487 { 488 "title": "Qwen2.5-VL technical report", 489 "authors": ["Shuai Bai", "Keqin Chen", "Xuejing Liu"], 490 "year": 2025, 491 "arxiv_id": "2502.13923", 492 "relevance": "Open-weight multimodal LLM used in the routing benchmark; relevant to MLLM capabilities." 493 } 494 ] 495 }