scan.json (27401B)
1 { 2 "paper": { 3 "title": "Orchestrating Intelligence: Confidence-Aware Routing for Efficient Multi-Agent Collaboration across Multi-Scale Models", 4 "authors": [ 5 "Jingbo Wang", 6 "Sendong Zhao", 7 "Jiatong Liu", 8 "Haochun Wang", 9 "Wanting Li", 10 "Bing Qin", 11 "Ting Liu" 12 ], 13 "year": 2026, 14 "venue": "arXiv", 15 "arxiv_id": "2601.04861", 16 "doi": "10.48550/arXiv.2601.04861" 17 }, 18 "scan_version": 2, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "All evaluation datasets (GSM8K, MATH, MedQA, GPQA, MBPP, HumanEval) are standard public benchmarks with published citations. No proprietary data was collected." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper mentions 'NVIDIA A100 80G GPUs with vLLM for accelerated inference' (Section 4.1) but provides no requirements.txt, library versions, Dockerfile, or detailed environment setup." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described at a conceptual level but lacks the specifics needed to replicate." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Tables 1-3 and Figures 3-7 are reported as point estimates only. No confidence intervals, error bars, or ± notation appears anywhere." 
48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "Claims like 'OI-MAS consistently outperforms all vanilla backbone models' and 'average improvement of 7.68%' (Section 4.2) are based entirely on comparing raw numbers without any statistical significance tests." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "The paper reports percentage improvements with baseline context: 'accuracy gains ranging from 9.52% to 21.13%' over vanilla models, 'average improvement of 7.68%' over MasRouter (Section 4.2), and cost reduction '17.05%–78.47%' (Section 4.2). Absolute values in tables allow computation." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "No justification for benchmark subset sizes or power analysis. The paper states a 4:1 train/test split (Appendix A) but does not justify the resulting test set sizes." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No standard deviation, variance, or spread measures are reported across runs. Temperature is set to 0 for LLM decoding, but the RL-trained routing policy has stochastic training. All results appear to be single-run." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Five multi-agent baselines are included: LLM-Debate, GPTSwarm, AFlow, MaAS, and MasRouter. Vanilla single-model results are also shown (Table 1)." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "MasRouter (Yue et al., 2025), MaAS (Zhang et al., 2025b), and AFlow (Zhang et al., 2025c) are all from 2025, representing recent and competitive approaches." 
80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Table 3 presents ablation of three core components: model router (w/o Gψ), cost term (w/o C(·)), and confidence weighting (w/o Conf(·)), evaluated on MedQA and MBPP." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "The paper reports accuracy/Pass@1, inference cost (in dollars), and wall-clock latency (Figure 4). Cost-accuracy trade-offs are analyzed in Figure 3." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "No human evaluation is performed. All evaluation is fully automated through benchmark accuracy and pass@1 metrics. Human evaluation of reasoning quality or output quality could have been informative." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Appendix A states 'For all datasets except HumanEval, we adopt a train/test split with a ratio of 4:1.' HumanEval is used exclusively for out-of-distribution testing." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down per benchmark in Table 1. Figure 5 shows per-difficulty-level model selection on MATH. Figure 6 shows per-role model allocation. The case study (Appendix D) provides per-turn detail." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": false, 109 "justification": "No error analysis or systematic failure case discussion. The case study (Figure 8) shows a success case only. No examples of where OI-MAS fails or makes poor routing decisions." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The ablation study (Table 3) shows that removing the model router slightly improves accuracy on MedQA/MBPP but at substantially higher cost. Hyperparameter sensitivity (Figure 7) shows overly large λ degrades accuracy." 
115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "The abstract claims 'improving accuracy by up to 12.88% while reducing cost by up to 79.78%.' Table 1 shows OI-MAS avg (78.23) vs AFlow Large avg (65.35), a gap of 12.88 percentage points that matches the accuracy claim. Figure 3 shows cost reductions of 17.05%–78.47%, so the cited range falls slightly short of the abstract's 79.78% maximum. The accuracy claim is supported by the reported numbers; the cost claim is approximately supported." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "Causal claims about component contributions are supported by controlled ablation (Table 3), where single components are removed in isolation. The ablation design constitutes controlled single-variable manipulation." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper's title claims 'Multi-Scale Models' generally, but only tests four models from two families (Qwen2.5, Llama3.1). Claims like 'more reliable and efficient multi-agent reasoning systems' (Conclusion) extend beyond the five benchmarks and four models tested." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The Limitations section discusses memory, scalability, and safety concerns, but does not address alternative explanations for the observed improvements. For example, the performance gains could partly stem from the training objective or ensemble effects rather than routing quality." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures accuracy and Pass@1 on specific benchmarks and frames results in terms of those metrics. 
Claims match measurement granularity — e.g., 'accuracy on GSM8K' rather than broad claims about 'reasoning ability.'" 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Specific model versions are given: Qwen2.5-3B, Qwen2.5-7B, Llama3.1-8B, Llama3.1-70B (Section 4.1). These include family, version, and parameter count." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "Agent roles (Generator, Decomposer, Verifier, etc.) are described by name and function but no actual prompt text is provided. The reader cannot reconstruct the prompts sent to models." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 4.1 reports: L=4 (max turns), λ=200 (cost penalty), θ=0.3 (role selection threshold), temperature=0, learning rate α=0.01. Sensitivity analysis for λ and L in Figure 7." 159 }, 160 "scaffolding_described": { 161 "applies": true, 162 "answer": true, 163 "justification": "The multi-agent scaffolding is described in detail: role routing (Eq. 2), model routing (Eq. 3), early stopping mechanism, confidence-aware optimization (Eq. 5), and the overall workflow is illustrated in Figure 2." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": false, 168 "justification": "The paper states a 4:1 train/test split (Appendix A) but does not describe how the split was performed (random, stratified by difficulty, etc.) or any preprocessing applied to the benchmark data." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "A dedicated 'Limitations' section follows the conclusion, discussing three specific limitations: agent memory representation, scalability guarantees, and agent safety." 
176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "The limitations are specific to this study: 'the work does not explicitly investigate how agent memory should be represented, maintained, and governed over time' and 'it is not guaranteed that the same balance will hold uniformly in highly concurrent, large-scale deployments.'" 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The limitations section discusses what wasn't investigated but doesn't explicitly bound what the results do not show. For example, it doesn't state that results don't generalize beyond the four models tested or the five benchmark domains." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw experimental data (model outputs, routing decisions, per-example results) is made available for independent verification. Only aggregated results are shown." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "The paper uses well-documented standard benchmarks (GSM8K, MATH, MedQA, GPQA, MBPP, HumanEval) with full citations. Appendix A describes each dataset." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. All data comes from standard public benchmarks." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": false, 207 "justification": "The routing policy training pipeline is described conceptually (RL with confidence-aware objective) but intermediate steps, data transformations, and filtering during training are not documented." 
208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding source, grants, or acknowledgments section mentioning financial support is present in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: Harbin Institute of Technology (5 authors), Chinese Academy of Sciences (1 author). No product being evaluated is affiliated with the authors." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "No funding is disclosed, so independence of the funder cannot be assessed. Absence of disclosure is not absence of conflict." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests statement or financial interests declaration is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for any of the four backbone models (Qwen2.5-3B/7B, Llama3.1-8B/70B), despite these being pre-trained models evaluated on public benchmarks." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether the pre-trained models may have seen benchmark data (GSM8K, MATH, MBPP, etc., all published 2021) during pre-training." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "GSM8K, MATH, MBPP, MedQA, and HumanEval were all published in 2021, well before the training data collection for Qwen2.5 and Llama3.1. No contamination risk is discussed." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 
254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "Inference cost is a primary metric. Figure 3 shows cost-accuracy trade-offs, Table 2 reports cost in dollars, and Appendix C details the token-based pricing scheme used." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Hardware is mentioned (A100 80G GPUs) but total GPU hours, training time for the routing policy, or total computational budget are not quantified." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No seed sensitivity analysis. While LLM decoding uses temperature=0, the RL-based routing policy training involves stochasticity. No results across multiple seeds are reported." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of experimental runs is never stated. Results appear to come from a single training run of the routing policy." 
308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "Figure 7 shows sensitivity analysis for λ and L, but does not report the total number of configurations tried, the search method, or compute spent on hyperparameter search." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": false, 317 "justification": "The final hyperparameters (λ=200, L=4) are stated but no explanation is given for how they were selected (e.g., selected on validation set, or chosen from the sensitivity analysis)." 318 }, 319 "multiple_comparison_correction": { 320 "applies": false, 321 "answer": false, 322 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The paper compares OI-MAS against baselines presumably re-implemented by the authors, but never acknowledges self-comparison bias. The statement 'all baselines are evaluated under identical hardware and decoding settings' (Section 4.1) addresses hardware fairness but not implementation bias." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": true, 332 "justification": "Figure 3 explicitly plots accuracy vs. inference cost for OI-MAS and all baselines across four benchmarks, enabling direct performance-per-compute comparison." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "No discussion of whether GSM8K, MATH, MedQA, GPQA, or MBPP actually measure the capabilities the paper claims to evaluate. The benchmarks are taken at face value." 
338 }, 339 "scaffold_confound_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "Different baselines use different scaffolding architectures (debate, swarm optimization, tree search, supernet, cascaded controller) yet performance differences are attributed to OI-MAS's routing mechanism. The confounding effect of scaffold design differences is not addressed." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "No discussion of temporal leakage. Most benchmarks (GSM8K, MATH, MBPP, MedQA, HumanEval) were published in 2021, and models were trained on data collected years later." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup leaks information not available in real usage." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether benchmark problems share structural similarities with pre-training data or whether train/test examples within the 4:1 split are truly independent." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No leakage detection or prevention method (canary strings, membership inference, decontamination) is applied." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "OI-MAS consistently outperforms all vanilla backbone models with accuracy gains ranging from 9.52% to 21.13%.", 371 "evidence": "Table 1 shows OI-MAS achieves 78.23% average vs the best vanilla model (Llama3.1-70B) at 68.71%, across five benchmarks (Section 4.2).", 372 "supported": "moderate" 373 }, 374 { 375 "claim": "OI-MAS achieves an average improvement of 7.68% over MasRouter and outperforms it on four of five benchmarks.", 376 "evidence": "Table 1: OI-MAS avg 78.23% vs MasRouter 70.55%. 
OI-MAS matches on GSM8K (95.80%) and exceeds on MATH (+7.56%), MedQA (+7.56%), GPQA (+8.98%), MBPP (+14.28%).", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "OI-MAS reduces inference cost by 17.05%–78.47% compared to baseline multi-agent systems while maintaining or improving accuracy.", 381 "evidence": "Figure 3 shows cost-accuracy plots across four benchmarks (Section 4.2). Cost computed via token-based API pricing (Appendix C).", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "OI-MAS achieves 23.12s wall-clock latency per query on GPQA, outperforming all baselines.", 386 "evidence": "Figure 4 shows latency comparison on GPQA only. Nearest baseline is MasRouter at 36.82s, MaAS at 192.62s.", 387 "supported": "weak" 388 }, 389 { 390 "claim": "The routing policy adapts model selection to problem difficulty, using smaller models for easier problems and larger models for harder ones.", 391 "evidence": "Figure 5 shows model selection distribution across five MATH difficulty levels, with clear progression from small to large models as difficulty increases (Section 5.1).", 392 "supported": "strong" 393 }, 394 { 395 "claim": "OI-MAS generalizes out-of-distribution from MBPP to HumanEval with 91.46% Pass@1 at the lowest cost among all baselines.", 396 "evidence": "Table 2 shows OOD results: OI-MAS achieves 91.46% at $0.097 vs MaAS 89.63% at $0.222. Only one OOD setting tested.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "The confidence-aware mechanism is critical for maintaining task accuracy.", 401 "evidence": "Table 3 ablation: removing Conf(·) drops MedQA from 78.99% to 76.47% and MBPP from 91.59% to 87.39%. 
Only two benchmarks tested.", 402 "supported": "moderate" 403 } 404 ], 405 "methodology_tags": ["benchmark-eval"], 406 "key_findings": "OI-MAS introduces a hierarchical role-model routing framework for multi-agent systems that jointly selects agent roles and model scales at each reasoning step, using confidence-aware optimization to balance accuracy and cost. On five benchmarks (GSM8K, MATH, MedQA, GPQA, MBPP), it achieves up to 12.88% accuracy improvement and up to 79.78% cost reduction over multi-agent baselines. The routing policy learns interpretable patterns: larger models for generative roles and harder problems, smaller models for structural roles and easy subtasks. The approach generalizes from MBPP to HumanEval without retraining.", 407 "red_flags": [ 408 { 409 "flag": "No error bars or statistical tests", 410 "detail": "All performance claims are based on comparing point estimates across systems. With RL-based routing policy training, results may vary across training runs, but no variance, confidence intervals, or significance tests are provided." 411 }, 412 { 413 "flag": "Proxy cost metric", 414 "detail": "All models are run locally on A100 GPUs, but costs are computed using Together AI API token pricing (Appendix C). One model's price (Qwen2.5-3B) is estimated via a scaling law rather than actual pricing. The API pricing proxy may not reflect actual local inference costs." 415 }, 416 { 417 "flag": "Complete contamination silence", 418 "detail": "Four pre-trained models are evaluated on six public benchmarks, five of which (GSM8K, MATH, MBPP, MedQA, HumanEval) were published in 2021. No training cutoff dates, no contamination analysis, no decontamination. Results may partly reflect memorization." 419 }, 420 { 421 "flag": "No failure analysis", 422 "detail": "The paper shows only success cases. The case study (Figure 8) illustrates correct routing. No examples of poor routing decisions, failure modes, or systematic errors are discussed." 
423 }, 424 { 425 "flag": "Ablation on subset of benchmarks", 426 "detail": "The ablation study (Table 3) is performed on only MedQA and MBPP. Key claims about component importance are not validated on GSM8K, MATH, or GPQA." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework", 432 "authors": ["Sirui Hong", "Mingchen Zhuge", "Jonathan Chen"], 433 "year": 2023, 434 "relevance": "Foundational multi-agent framework for software development using structured role-based collaboration." 435 }, 436 { 437 "title": "ChatDev: Communicative Agents for Software Development", 438 "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"], 439 "year": 2024, 440 "relevance": "Multi-agent system for software development through communicative agent collaboration." 441 }, 442 { 443 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations", 444 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 445 "year": 2024, 446 "relevance": "General multi-agent conversation framework enabling complex LLM-based applications." 447 }, 448 { 449 "title": "RouteLLM: Learning to Route LLMs from Preference Data", 450 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"], 451 "year": 2024, 452 "relevance": "LLM routing framework that learns to select between models using preference data, directly related to model routing efficiency." 453 }, 454 { 455 "title": "MasRouter: Learning to Route LLMs for Multi-Agent Systems", 456 "authors": ["Yanwei Yue", "Guibin Zhang", "Boyang Liu"], 457 "year": 2025, 458 "arxiv_id": "2502.11133", 459 "relevance": "Most directly comparable baseline — routes LLMs in multi-agent settings via cascaded controller, but with static pre-inference decisions." 
460 }, 461 { 462 "title": "AFlow: Automating Agentic Workflow Generation", 463 "authors": ["Jiayi Zhang", "Jinyu Xiang", "Zhaoyang Yu"], 464 "year": 2025, 465 "relevance": "Automates discovery of agentic workflows through Monte Carlo Tree Search, representative of automated agent design approaches." 466 }, 467 { 468 "title": "GPTSwarm: Language Agents as Optimizable Graphs", 469 "authors": ["Mingchen Zhuge", "Wenyi Wang", "Louis Kirsch"], 470 "year": 2024, 471 "relevance": "Formulates LLM agents as optimizable computational graphs, enabling joint optimization of prompts and communication topology." 472 }, 473 { 474 "title": "CodeAgent: Enhancing Code Generation with Tool-Integrated Agent Systems for Real-World Repo-Level Coding Challenges", 475 "authors": ["Kechi Zhang", "Jia Li", "Ge Li"], 476 "year": 2024, 477 "relevance": "Tool-integrated agent system for code generation, relevant to agentic AI capabilities evaluation." 478 }, 479 { 480 "title": "CodeTree: Agent-Guided Tree Search for Code Generation with Large Language Models", 481 "authors": ["Jierui Li", "Hung Le", "Yingbo Zhou"], 482 "year": 2025, 483 "relevance": "Agent-guided tree search approach to code generation, relevant to LLM programming capabilities." 484 }, 485 { 486 "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors", 487 "authors": ["Weize Chen", "Yusheng Su", "Jingwei Zuo"], 488 "year": 2024, 489 "relevance": "Multi-agent collaboration platform studying emergent behaviors in LLM agent teams." 490 }, 491 { 492 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 493 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 494 "year": 2024, 495 "relevance": "Directly relevant cost-efficiency approach for LLM inference through model cascading and routing." 
496 }, 497 { 498 "title": "X-MAS: Towards Building Multi-Agent Systems with Heterogeneous LLMs", 499 "authors": ["Rui Ye", "Xiangrui Liu", "Qimin Wu"], 500 "year": 2025, 501 "relevance": "Multi-agent systems using heterogeneous LLM pools, closely related to OI-MAS's multi-scale model approach." 502 } 503 ] 504 }