scan.json (27893B)
1 { 2 "paper": { 3 "title": "Optimal-Agent-Selection: State-Aware Routing Framework for Efficient Multi-Agent Collaboration", 4 "authors": [ 5 "Jingbo Wang", 6 "Sendong Zhao", 7 "Haochun Wang", 8 "Yuzheng Fan", 9 "Lizhe Zhang", 10 "Yan Liu", 11 "Ting Liu" 12 ], 13 "year": 2025, 14 "venue": "arXiv", 15 "arxiv_id": "2511.02200", 16 "doi": "10.48550/arXiv.2511.02200" 17 }, 18 "scan_version": 2, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "STRMAC, a state-aware routing framework for multi-agent collaboration, achieves up to 23.8% accuracy improvement over baselines on collaborative reasoning tasks (PDDP clinical prediction and EBFC fact-checking) while drastically reducing token consumption to 5.9–21.6% of competing methods. A self-evolving data generation strategy reduces training data collection overhead by up to 90.1% compared to exhaustive search. Routers trained on execution data from smaller models (Llama-3.1-8B) transfer effectively to larger agent models (Llama-3.1-70B, Qwen2.5-32B) and even closed-source GPT-4o.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "PDDP is constructed from MIMIC-III (publicly available with data use agreement) and EBFC from AMBIFC (Glockner et al. 2024), both publicly accessible datasets. However, the specific multi-agent task formulations and router training data are not released." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper mentions 'A100 with 80G memory' and lists model names but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions." 
38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions, README, or scripts to replicate experiments are provided." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results in Tables 1, 2, 3, and 4 are point estimates with no confidence intervals, error bars, or ± notation." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims STRMAC outperforms baselines based solely on comparing accuracy numbers. No p-values, t-tests, or any statistical significance tests are reported." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "The paper reports percentage improvements with baseline context throughout, e.g., 'improvements of +4.4%, +8.1%, and +1.7%' (Section 4.2) and '+6.0%, +14.8% and +23.8%' over Random-Chain, with absolute accuracy values provided." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "No justification is given for the dataset sizes used in PDDP or EBFC. The number of test instances is never explicitly stated." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No standard deviation, variance, or spread measures are reported. All results appear to be single-run numbers with no indication of result stability across runs." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Six baselines are compared: Random-Chain, LLM-Debate (Du et al. 2023), MACNET (Qian et al. 2024), MOA (Wang et al. 2024a), MDAgents (Kim et al. 2024), and IG-MAS (Wang et al. 2025)." 
77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Baselines include recent methods: IG-MAS (2025), MDAgents (NeurIPS 2024), MACNET (2024), and MOA (2024). These represent current state of the art in multi-agent collaboration." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": false, 86 "justification": "While Section 5.1 shows the benefit of iterative data generation (pruning + router-guided exploration), there is no ablation of the core STRMAC architecture: the state encoder, agent embeddings, contrastive learning objective, or cosine similarity routing are never individually removed or modified to measure their contribution." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Three metrics are used: Accuracy (Acc), Token consumption (Token), and Cost-Adjusted Score (CAS), which combines accuracy and cost." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "No human evaluation is performed. All evaluation is automated against ground-truth labels on PDDP (discharge disposition) and EBFC (fact verification) tasks." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": false, 101 "justification": "The paper mentions 'validation performance' for tuning learning rate but never explicitly describes a train/test split, the number of test examples, or confirms that reported results are on data unseen during any selection decisions." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Results are broken down by task (PDDP vs EBFC) and by model (Llama3.1-70B, Llama3.1-8B, Qwen2.5-32B) in Tables 1 and 4. Individual agent performance is shown in Table 5, and path distribution analysis in Figure 3." 
107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 5.4 discusses that collaborative paths 'do not always result in higher accuracy, suggesting that only carefully selected, relevant agents can provide meaningful improvements, while the inclusion of less relevant agents may introduce noise and negatively impact performance.'" 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 5.4 shows that multi-agent collaboration paths sometimes have lower accuracy than single-agent paths (e.g., Figure 3 shows AgentBHC alone at 70.9% vs AgentBHC→AgentSH at 60.4% for Llama70B), demonstrating that collaboration can hurt performance." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims 'up to 23.8% improvement over baselines' which matches EBFC Qwen32B (48.5%→72.3% in Table 1), and '90.1% reduction in data collection overhead' which matches EBFC Llama70B (9.9% proportion in Table 2)." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper's causal claims ('STRMAC improves accuracy') are supported by controlled comparisons: the same models and datasets are used across all methods, with only the collaboration/routing strategy varying. This controlled single-variable manipulation is adequate for the causal claims made." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title claims 'Optimal-Agent-Selection' and 'Efficient Multi-Agent Collaboration' broadly, but results are limited to two specific tasks (clinical discharge prediction and fact-checking) with specific multi-agent setups where agents hold private information subsets. The paper does not bound its claims to these specific settings." 
134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "No alternative explanations are discussed for why STRMAC outperforms baselines. For instance, the router may simply learn to pick the single strongest agent (which Section 5.4 partially confirms), but this is presented as a feature rather than explored as an alternative explanation for the gains." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper measures classification accuracy on PDDP (4-class) and EBFC (binary) and frames this as 'collaborative reasoning' and 'multi-agent reasoning accuracy' throughout. No discussion of whether classification accuracy captures reasoning quality, or whether these specific benchmarks measure the broader collaborative reasoning capability claimed." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "The main experiments specify 'Llama-3.1-70B-Instruct', 'Llama-3.1-8B-Instruct', and 'Qwen2.5-32B-Instruct' — specific model versions available on HuggingFace. GPT-4o supplementary experiments reference OpenAI (2024) but lack a snapshot date." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": false, 155 "justification": "No prompts or system instructions are provided. The paper describes agents receiving 'private context' and 'shared context' but never shows the actual prompt text used for any agent." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "Learning rate range (1×10⁻⁴ to 5×10⁻⁵) and router model (mDeBERTaV3-base, 86M params) are reported, but no temperature, top-p, max tokens, or other LLM inference settings are stated for the agent models." 
161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The multi-agent routing framework IS the contribution and is described in detail: Section 3.2 covers the state encoder and agent embeddings, Section 3.3 the contrastive learning objective, and the overall workflow is shown in Figure 2." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": false, 170 "justification": "Dataset construction is deferred to 'the protocols of (Wang et al. 2025)' without restating the details. How medical records were fragmented across agents, how AMBIFC claims were selected, and filtering criteria are not documented in this paper." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "There is no limitations, threats-to-validity, or discussion section addressing limitations. The paper moves directly from analysis to conclusion." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No threats to validity are discussed anywhere in the paper." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "The paper makes no explicit statements about what the results do NOT show, what settings are excluded, or what claims the authors are NOT making." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "While the underlying MIMIC-III and AMBIFC datasets are publicly available, the specific task formulations, agent-specific data splits, router training execution paths, and processed data are not released." 
195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 4.1 describes PDDP (from MIMIC-III, clinical prediction with fragmented records across agents) and EBFC (from AMBIFC, fact-checking with evidence distribution to agents), including what each agent receives and the task objective." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data comes from existing datasets (MIMIC-III, AMBIFC)." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": false, 209 "justification": "The self-evolving data generation pipeline (Section 3.4) is well-described for router training data, but the end-to-end pipeline from raw MIMIC-III/AMBIFC data to final agent inputs and evaluation setup is not documented. The number of test instances is never stated." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding sources, acknowledgments, or grant numbers are mentioned anywhere in the paper." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: Harbin Institute of Technology (academic) and China Mobile Group Heilongjiang Co., Ltd (industry)." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "No funding is disclosed, so independence cannot be assessed. China Mobile is listed as an affiliation but the paper does not evaluate any China Mobile product." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests statement, patent disclosures, or financial interest declarations are present in the paper." 
232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No training data cutoff dates are stated for Llama-3.1, Qwen2.5, or GPT-4o. This is important because the benchmark data (MIMIC-III from 2016, AMBIFC from 2024) could appear in training data." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of whether MIMIC-III clinical data or AMBIFC claims appeared in any model's pre-training data." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "MIMIC-III (published 2016) is widely used and could be in training data of models trained after 2016. No contamination analysis or temporal split is discussed." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 
286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": true, 292 "justification": "Token consumption is reported for all methods across all experiments in Tables 1 and 4. The CAS metric (Equation 4.1) explicitly penalizes token cost. STRMAC uses 338–972 tokens vs thousands for baselines." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "Hardware is mentioned ('A100 with 80G memory') but total GPU hours, training time for the router, or total API cost for data generation are not quantified." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "No mention of multiple random seeds. All results appear to be from single runs. No seed sensitivity analysis." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The paper never states how many runs produced the reported results." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "Learning rate is 'tuned within the range of 1×10⁻⁴ to 5×10⁻⁵ according to validation performance' but no details on number of configurations tried, search method, or compute spent on tuning." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": false, 319 "justification": "The paper mentions tuning 'according to validation performance' but does not describe the selection process, how many configurations were evaluated, or whether selection was done on a proper validation set separate from the test data." 320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 
325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors implement their own versions of all baselines and compare against them without acknowledging the systematic bias of evaluating one's own system against one's own re-implementations of competing methods." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": true, 334 "justification": "The CAS metric (Section 4.2) explicitly evaluates performance as a function of inference cost (token consumption), and token counts are reported alongside accuracy for all methods, enabling cost-performance comparison." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "No discussion of whether PDDP and EBFC actually measure 'collaborative reasoning' as claimed. The paper does not question whether classification accuracy on these tasks captures the multi-agent reasoning capability it claims to improve." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": true, 344 "justification": "The same base LLMs are used across all methods, with only the collaboration/routing framework varying. The scaffold differences are the intentional independent variable being studied, and the paper controls for model choice." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "MIMIC-III was published in 2016 and AMBIFC in 2024. No discussion of whether these datasets or their contents appeared in model training data." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether agent prompts or the task setup inadvertently leak information about the correct answer." 
357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether the training instances (used for router training) and test instances are independent or share structural similarities." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No leakage detection or prevention method is applied." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "STRMAC achieves up to 23.8% accuracy improvement over baselines on collaborative reasoning tasks.", 373 "evidence": "Table 1 shows EBFC with Qwen2.5-32B: Random-Chain 48.5% → STRMAC 72.3%, a +23.8 percentage-point absolute improvement. Improvements are consistent across models and tasks.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Self-evolving data generation reduces training data collection overhead by up to 90.1% compared to exhaustive search.", 378 "evidence": "Table 2 shows EBFC with Llama70B requires only 9.9% of exhaustive search paths (126.1 vs 1275.1 theoretical maximum), representing a 90.1% reduction.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "STRMAC achieves the highest Cost-Adjusted Score (CAS) across all models and tasks.", 383 "evidence": "Table 1 shows STRMAC CAS values of 48.6–82.4 across all settings, consistently exceeding all baselines. 
Token consumption is 5.9–21.6% of IG-MAS.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Routers trained on smaller model data (Llama-8B) generalize to larger agent models.", 388 "evidence": "Table 3 shows Llama70B accuracy with same-model training (64.0%) vs Llama8B-sourced training (63.4%) on PDDP, and 85.2% vs 83.2% on EBFC — marginal differences.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "STRMAC transfers effectively to closed-source GPT-4o using open-source model training data and embeddings.", 393 "evidence": "Table 4 shows STRMAC achieves 64.4% (PDDP) and 89.1% (EBFC) on GPT-4o, outperforming all baselines while using open-source model execution data and Llama70B agent embeddings.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "Agent execution order significantly impacts multi-agent collaboration performance.", 398 "evidence": "Figure 1 shows that two different orderings of the same five agents produce substantially different accuracies across four models. No statistical significance test accompanies this claim.", 399 "supported": "weak" 400 } 401 ], 402 "red_flags": [ 403 { 404 "flag": "No error bars or variance reporting", 405 "detail": "All results across Tables 1–4 are single point estimates. Without variance across runs or statistical tests, it is impossible to assess whether reported differences are statistically meaningful or within noise range." 406 }, 407 { 408 "flag": "No ablation of core architecture", 409 "detail": "The paper's main novelty (state encoder, agent embeddings, contrastive learning, cosine similarity routing) is never ablated. We cannot determine which components actually contribute to the reported improvements." 410 }, 411 { 412 "flag": "No limitations discussed", 413 "detail": "The paper contains no limitations section, no threats to validity, and no discussion of what the results do not show. This is a significant omission." 
414 }, 415 { 416 "flag": "Only two narrow benchmarks", 417 "detail": "Results are reported on only PDDP (clinical discharge prediction) and EBFC (fact-checking), both with specific private-information-sharing setups. The broad claims about 'efficient multi-agent collaboration' are poorly supported by this narrow evaluation." 418 }, 419 { 420 "flag": "Router may just learn single-agent selection", 421 "detail": "Section 5.4 shows the dominant routing path across all models is to select only AgentBHC (the agent with the most comprehensive information). The router's main 'skill' may be identifying which single agent has the best information, not enabling meaningful multi-agent collaboration." 422 }, 423 { 424 "flag": "Test set details missing", 425 "detail": "The number of test instances, train/test split ratios, and whether reported results are on truly held-out data are never stated." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "GPT-4 Technical Report", 431 "authors": ["Josh Achiam et al."], 432 "year": 2023, 433 "arxiv_id": "2303.08774", 434 "relevance": "Foundational LLM technical report relevant to understanding model capabilities evaluated in multi-agent systems." 435 }, 436 { 437 "title": "ChatDev: Communicative Agents for Software Development", 438 "authors": ["Chen Qian et al."], 439 "year": 2023, 440 "arxiv_id": "2307.07924", 441 "relevance": "Multi-agent LLM system for code generation with static role-based pipelines, a key baseline paradigm." 442 }, 443 { 444 "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors in Agents", 445 "authors": ["Weize Chen et al."], 446 "year": 2023, 447 "arxiv_id": "2308.10848", 448 "relevance": "Multi-agent collaboration framework exploring emergent behaviors, relevant to multi-agent system design." 
449 }, 450 { 451 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 452 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 453 "year": 2023, 454 "arxiv_id": "2305.05176", 455 "relevance": "Foundational LLM routing work balancing cost and performance across models." 456 }, 457 { 458 "title": "RouteLLM: Learning to Route LLMs from Preference Data", 459 "authors": ["Isaac Ong et al."], 460 "year": 2024, 461 "relevance": "LLM routing system learning from preference data, relevant to model selection and cost-quality tradeoffs." 462 }, 463 { 464 "title": "RouterDC: Query-based Router by Dual Contrastive Learning for Assembling Large Language Models", 465 "authors": ["Shuhao Chen et al."], 466 "year": 2024, 467 "relevance": "Contrastive learning-based LLM router that STRMAC directly builds upon for its routing architecture." 468 }, 469 { 470 "title": "MDAgents: An Adaptive Collaboration of LLMs for Medical Decision-Making", 471 "authors": ["Yubin Kim et al."], 472 "year": 2024, 473 "relevance": "Adaptive multi-agent medical reasoning framework, used as a baseline in this work." 474 }, 475 { 476 "title": "GPTSwarm: Language Agents as Optimizable Graphs", 477 "authors": ["Mingchen Zhuge et al."], 478 "year": 2024, 479 "relevance": "Trainable graph-based multi-agent communication framework optimizing collaboration structure." 480 }, 481 { 482 "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate", 483 "authors": ["Yilun Du et al."], 484 "year": 2023, 485 "relevance": "Foundational multi-agent debate approach for improving LLM reasoning, used as a baseline." 486 }, 487 { 488 "title": "Mixture-of-Agents Enhances Large Language Model Capabilities", 489 "authors": ["Junlin Wang et al."], 490 "year": 2024, 491 "arxiv_id": "2406.04692", 492 "relevance": "Layered aggregation approach for combining multiple LLM outputs, used as a baseline." 
493 }, 494 { 495 "title": "Scaling Large-Language-Model-based Multi-Agent Collaboration", 496 "authors": ["Chen Qian et al."], 497 "year": 2024, 498 "arxiv_id": "2406.07155", 499 "relevance": "Studies scaling multi-agent collaboration and demonstrates agent ordering impacts performance (MACNET baseline)." 500 }, 501 { 502 "title": "AgentCoder: Multi-Agent-Based Code Generation with Iterative Testing and Optimisation", 503 "authors": ["Dong Huang et al."], 504 "year": 2023, 505 "arxiv_id": "2312.13010", 506 "relevance": "Multi-agent system for code generation with iterative refinement, relevant to agentic AI development." 507 }, 508 { 509 "title": "MedAgents: Large Language Models as Collaborators for Zero-shot Medical Reasoning", 510 "authors": ["Xiangru Tang et al."], 511 "year": 2023, 512 "arxiv_id": "2311.10537", 513 "relevance": "Multi-agent medical reasoning system using static role-based pipelines, predecessor to adaptive approaches." 514 } 515 ] 516 }