scan.json (28479B)
1 { 2 "paper": { 3 "title": "S-DAG: A Subject-Based Directed Acyclic Graph for Multi-Agent Heterogeneous Reasoning", 4 "authors": [ 5 "Jiangwen Dong", 6 "Zehui Lin", 7 "Wanyu Lin", 8 "Mingjin Zhang" 9 ], 10 "year": 2025, 11 "venue": "AAAI 2026 (arXiv preprint)", 12 "arxiv_id": "2511.06727", 13 "doi": "10.48550/arXiv.2511.06727" 14 }, 15 "scan_version": 2, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "S-DAG proposes a GNN-based framework that decomposes multi-subject questions into a directed acyclic graph of subject dependencies, then assigns domain-specific expert LLMs (7B-13B) to each node for structured multi-agent collaboration. On curated multi-subject subsets of MMLU-Pro, GPQA, and MedMCQA, S-DAG achieves 59.73% average accuracy, outperforming multi-model baselines like Symbolic-MoE (57.53%) and approaching large monolithic models (Qwen2.5-72B at 60.08%). Ablation studies show the DAG structure reduces inference time by 60% and LLM calls by 50% compared to a fully connected graph while improving accuracy.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "A GitHub repository URL is provided in the abstract: https://github.com/WanyuGroup/AAAI2026_S-DAG." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper states 'We curate and release multi-subject subsets of standard benchmarks (MMLU-Pro, GPQA, MedMCQA)' and the underlying datasets are all publicly available benchmarks. The curated subsets are presumably available through the code repository." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper mentions 'A100 GPUs with 40 GB memory' and Adam optimizer, but provides no requirements.txt, Dockerfile, or detailed environment specifications listing library versions." 
35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions are provided in the paper. The paper describes the methodology but does not include a reproducibility section with runnable commands." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": true, 46 "justification": "Table 3 reports ± standard deviations for all results (e.g., '50.98 ± 0.19' for S-DAG on MMLU-Pro). Results are 'averaged across three trials' with standard deviations." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "No statistical significance tests are reported. Claims like 'significantly outperforms' are based on comparing point estimates without any p-values, t-tests, or other significance tests." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Results are presented as absolute accuracy percentages with baselines for context (e.g., S-DAG 59.73% vs Symbolic-MoE 57.53% vs MAD 56.39%), allowing the reader to assess the magnitude of improvements." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification for the test set sizes (503, 129, 169 samples) or the 200-sample profiling set. No power analysis or discussion of whether these sizes are sufficient for the claims made." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": true, 66 "justification": "Standard deviations are reported across three trials in Table 3 (e.g., '49.82 ± 0.24')." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Multiple baselines are compared: single-model (CoT with GPT4o-mini, Qwen2.5-72B, Llama3.3-70B, Qwen2.5-7B, Self-Refine) and multi-model (MoE, GraphRouter, MAD, Symbolic-MoE)." 
74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Baselines include recent work: Symbolic-MoE (2025), GraphRouter (2025), Heterogeneous Swarms (2025), and current LLMs (Qwen2.5, Llama3.3). These are competitive and contemporary." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Table 4 presents a structured ablation study isolating three components: GNN coordination, LLM profiling, and DAG topology (w/o GNN, random model, fully connected graph variants)." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper reports accuracy (Table 3, primary metric) as well as inference time and number of LLM calls per instance (Table 4, efficiency metrics), evaluating both quality and efficiency." 89 }, 90 "human_evaluation": { 91 "applies": false, 92 "answer": false, 93 "justification": "The evaluation is on MCQ benchmarks with objectively correct answers. Human evaluation is clearly irrelevant for scoring multiple-choice question accuracy." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 2 shows explicit train/test splits (e.g., MMLU-Pro: 1173 train / 503 test). A separate 200-sample profiling set is also used. The splits are clearly documented." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down per benchmark (MMLU-Pro, GPQA, MedMCQA) in Table 3, and per ablation variant in Table 4." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": false, 108 "justification": "No failure case analysis, error examples, or discussion of where S-DAG breaks down. Only aggregate accuracy numbers are reported."
109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The ablation study (Table 4) shows configurations that perform worse: random model selection drops to 41-42%, fully connected graph underperforms the DAG structure. These demonstrate what doesn't work." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": false, 120 "justification": "The abstract claims the approach 'significantly outperforms existing task-level model selection and multi-agent collaboration baselines in accuracy and efficiency.' The actual improvement over Symbolic-MoE is 2.2 percentage points (59.73% vs 57.53%) without statistical significance testing. The word 'significantly' overstates the evidence." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Causal claims are made through ablation studies (Table 4) with controlled single-variable manipulation: removing GNN, removing LLM profiling, and changing graph topology. Each ablation changes one factor and measures the effect. This is adequate for causal claims about component contributions." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The title claims 'Multi-Agent Heterogeneous Reasoning' broadly, but results are limited to curated multi-subject subsets of three MCQ benchmarks. The paper does not bound generalization to MCQ formats or acknowledge that real-world heterogeneous reasoning extends far beyond multiple-choice questions." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "No alternative explanations are discussed. For example, the profiling set (200 samples from test distribution) could leak distributional information, or the improvement could stem from having more model parameters total (14 models × 7-13B) rather than from the DAG structure itself." 
136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper measures MCQ accuracy on curated benchmark subsets but frames this as 'heterogeneous reasoning' and 'real-world reasoning tasks.' No discussion of whether MCQ accuracy is a valid proxy for multi-domain reasoning capability." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Expert models are specified with Hugging Face links (Table 6), and 'qwen-turbo-0919' is named. However, baseline models like 'GPT4o-mini' lack snapshot dates or API versions. Marketing names without version identifiers do not meet the standard." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Full prompt text is provided in the appendix: Table 5 (dataset preprocessing prompt), Table 7 (multi-agent information flow prompts for Subject Expert, Supporting, and Dominant agents), and Table 8 (baseline prompts)." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "The paper states 'the decoding temperature is set to 0.7, and the maximum output length is fixed at 4096 tokens across all LLMs. Adam optimizer is used to train GNN and MLP models.'" 158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "The multi-agent scaffolding is described in detail: the S-DAG information flow mechanism, three agent types (Subject Expert, Supporting, Dominant), and how agents communicate via directed edges are documented in Section 3.3, Appendix B, and Figure 5." 
163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Appendix A describes the preprocessing pipeline: LLM-based subject analysis with qwen-turbo-0919, three-round consistency filtering, weight normalization, threshold-based pruning (0.1), and DAG construction with dominant/support classification (Figure 4)." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": false, 174 "justification": "There is no dedicated 'Limitations' section. Section D ('Broader Impacts') mentions potential risks ('subject-specific models that may vary in quality or harbor domain-specific biases') but this addresses societal impacts rather than methodological limitations." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "No specific threats to validity are discussed. The Broader Impacts section contains only generic concerns about bias and deployment risks, not threats specific to this study's experimental design." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show, what populations/settings are excluded, or what limitations the MCQ format imposes on generalization claims." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": true, 191 "justification": "The underlying benchmarks (MMLU-Pro, GPQA, MedMCQA) are all publicly available. The curated multi-subject subsets and code are linked via the GitHub repository." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "The dataset curation process is described in detail in Appendix A: LLM-based subject analysis, three-round consistency filtering, weight normalization, threshold-based pruning, and train/test/profiling split. 
Table 2 documents the resulting dataset sizes." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data comes from standard public benchmarks (MMLU-Pro, GPQA, MedMCQA)." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline from raw benchmarks to curated datasets is documented: subject decomposition via LLM → three-round filtering → weight normalization → threshold pruning (0.1) → DAG construction → train/test split. Figure 4 illustrates the DAG construction." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Section 6 (Acknowledgments) discloses: 'This research was supported by Project P0049179 under the Innovation and Technology Fund – Guangdong–Hong Kong Technology Cooperation Funding Scheme (ITF-TCFS), funded by the Innovation and Technology Commission (Funding Body Ref. No. GHP/386/23SZ).'" 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly stated: Hong Kong Polytechnic University, Department of Data Science and AI, and Department of Computing. The authors are not affiliated with any of the LLM providers evaluated." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": true, 223 "justification": "The funder is a Hong Kong government technology fund (Innovation and Technology Commission), which has no financial interest in the outcome of LLM routing research." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is provided in the paper." 
229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "No training data cutoff dates are stated for any of the 14 expert LLMs or the baseline models. The models could have been trained on the benchmark data." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether the pre-trained expert LLMs may have seen MMLU-Pro, GPQA, or MedMCQA examples during their pre-training or fine-tuning." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "MMLU-Pro, GPQA, and MedMCQA are all publicly available and could have been included in training corpora of the models used. No contamination analysis is performed." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 
283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": true, 289 "justification": "Table 4 reports average inference time per instance (15.02s for S-DAG) and number of LLM calls per instance (4.1 for S-DAG), providing efficiency metrics." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "The paper states experiments use 'A100 GPUs with 40 GB memory' but does not report total GPU hours, training time for the GNN, or total computational budget." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper states 'the seed is fixed' and 'Results are averaged across three trials' but does not report sensitivity to different random seeds. Fixing one seed does not address seed sensitivity." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": true, 306 "justification": "The paper explicitly states 'Results are averaged across three trials, and we compute the standard deviations as the statistical indicator.'" 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search budget is reported. Key hyperparameters like λ_node, λ_edge, and the relevance weight threshold (0.1) appear tuned but the search process is not documented." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": false, 316 "justification": "No explanation of how the final configuration was selected. The threshold (0.1), weight cutoff for dominant subjects (1/K), and other design choices are presented without justification or selection methodology." 
317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "Multiple comparisons are made across 8 baselines and 3 datasets, but no statistical tests are performed at all, let alone corrections for multiple comparisons." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors implement all baselines themselves but do not acknowledge author-evaluation bias. Their implementations of MoE, GraphRouter, MAD, and Symbolic-MoE may not achieve optimal performance." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": true, 331 "justification": "Table 4 reports accuracy alongside inference time and LLM calls, showing that S-DAG achieves higher accuracy (59.73%) with lower compute (15.02s, 4.1 calls) than the fully connected variant (57.29%, 38.45s, 8.2 calls). The paper also compares small expert models vs large 70B models." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "No discussion of whether the curated MCQ subsets actually measure 'heterogeneous reasoning' as claimed. The construct validity of using multiple-choice questions to assess multi-domain reasoning capability is not examined." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": true, 340 "answer": false, 341 "justification": "When comparing S-DAG (pool of 7-13B models with DAG routing) against single 70B models, the paper attributes S-DAG's competitive performance to 'structured coordination' without addressing whether the combined parameter count of 14 expert models confounds the comparison." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of temporal leakage. The expert models may have been trained after the benchmarks were published, meaning solutions could be in the training data." 
349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the evaluation setup leaks information. The 200-sample profiling set is drawn from the test set distribution, which could leak distributional information about test examples." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of independence between train and test data. The curated subsets are drawn from the same benchmarks, and no verification of independence is provided." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention method is applied despite using public benchmarks with pre-trained models." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "S-DAG achieves the highest average accuracy (59.73%) across three benchmarks, outperforming both single-model and multi-model baselines.", 370 "evidence": "Table 3 shows S-DAG at 59.73% avg vs Symbolic-MoE 57.53%, MAD 56.39%, GraphRouter 56.03%, MoE 54.56%. Results include standard deviations across 3 trials. However, the single-model baselines Qwen2.5-72B (60.08%) and Llama3.3-70B (60.04%) score slightly higher than S-DAG, so 'highest' holds only among the multi-model methods and the smaller single models.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "S-DAG composed of smaller domain-specific expert models (7B-13B) achieves performance competitive with large-scale monolithic LLMs (70B).", 375 "evidence": "Table 3: S-DAG 59.73% vs Qwen2.5-72B 60.08% and Llama3.3-70B 60.04%. S-DAG is 0.35pp below Qwen2.5-72B and 0.31pp below Llama3.3-70B, the best open-source single models.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "The GNN module for S-DAG generation is superior to LLM-generated DAGs.", 380 "evidence": "Table 4 ablation: GNN-based S-DAG achieves 59.73% vs 53.51% without GNN coordination but with profiled model selection.
The 'w/o GNN, profiled model' variant uses LLM-generated weights directly.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "The DAG structure reduces inference time and LLM calls compared to a fully connected graph while improving accuracy.", 385 "evidence": "Table 4: S-DAG achieves 59.73% accuracy, 15.02s inference, 4.1 LLM calls vs fully connected graph at 57.29%, 38.45s, 8.2 calls.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Subject-aware LLM profiling significantly improves performance over random model selection.", 390 "evidence": "Table 4: With profiled model selection and GNN, 59.73%; with GNN but random models, 42.19%. A 17.5pp improvement.", 391 "supported": "strong" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Small margins without significance tests", 397 "detail": "S-DAG's advantage over Symbolic-MoE is only 2.2pp (59.73% vs 57.53%) and over MAD is 3.3pp. The abstract claims 'significantly outperforms' but no statistical significance tests are performed. With test sets of 129-503 samples, these margins may not be statistically significant." 398 }, 399 { 400 "flag": "Curated subsets favor the proposed method", 401 "detail": "The authors created custom 'multi-subject' subsets specifically designed for their method's strength (subject decomposition). Standard benchmark results are not reported. The curation uses their own LLM-based subject analysis, creating a circular dependency." 402 }, 403 { 404 "flag": "LLM-generated ground truth for GNN training", 405 "detail": "The S-DAG ground truth labels used to train the GNN are generated by qwen-turbo-0919 with a simple prompt. This introduces a circular dependency: an LLM decides subject relevance, the GNN learns from this, and then the system is evaluated on subsets that were themselves curated with the same LLM-based subject analysis (final answers are scored against the MCQ answer keys, not judged by an LLM). No human validation of the LLM-generated subject labels is reported."
406 }, 407 { 408 "flag": "Profiling set drawn from test distribution", 409 "detail": "200 profiling samples are 'randomly selected from the test set' to assess LLM capabilities. While these may be separate from the evaluated test set, drawing profiling data from the test distribution could leak information about what the test set looks like." 410 }, 411 { 412 "flag": "No contamination analysis", 413 "detail": "All 14 expert LLMs could have been trained on MMLU-Pro, GPQA, or MedMCQA data. No training cutoff dates are stated and no contamination checks are performed." 414 }, 415 { 416 "flag": "No limitations section", 417 "detail": "The paper lacks any dedicated discussion of methodological limitations, threats to validity, or scope boundaries. The Broader Impacts section only discusses generic societal concerns." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Symbolic Mixture-of-Experts: Adaptive Skill-based Routing for Heterogeneous Reasoning", 423 "authors": ["J. C.-Y. Chen", "S. Yun", "E. Stengel-Eskin", "T. Chen", "M. Bansal"], 424 "year": 2025, 425 "arxiv_id": "2503.05641", 426 "relevance": "Multi-expert LLM routing system using skill-based selection; primary baseline for heterogeneous reasoning comparison." 427 }, 428 { 429 "title": "GraphRouter: A Graph-based Router for LLM Selections", 430 "authors": ["T. Feng", "Y. Shen", "J. You"], 431 "year": 2025, 432 "relevance": "Uses GNN for LLM routing as edge prediction; directly comparable approach to S-DAG's GNN-based selection." 433 }, 434 { 435 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 436 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 437 "year": 2024, 438 "relevance": "Cost-efficient LLM usage via reliability-based model selection; relevant to LLM routing and cost-performance tradeoffs." 439 }, 440 { 441 "title": "Mixture-of-agents Enhances Large Language Model Capabilities", 442 "authors": ["J. Wang", "J. Wang", "B. Athiwaratkun", "C. 
Zhang", "J. Zou"], 443 "year": 2024, 444 "arxiv_id": "2406.04692", 445 "relevance": "Multi-agent LLM collaboration framework combining diverse models; key MoA baseline for multi-agent reasoning." 446 }, 447 { 448 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversations", 449 "authors": ["Q. Wu", "G. Bansal", "J. Zhang", "Y. Wu", "B. Li"], 450 "year": 2024, 451 "relevance": "Multi-agent conversation framework for LLM applications; foundational work in LLM agent orchestration." 452 }, 453 { 454 "title": "Heterogeneous Swarms: Jointly Optimizing Model Roles And Weights for Multi-LLM Systems", 455 "authors": ["S. Feng", "Z. Wang", "P. Goyal", "Y. Wang"], 456 "year": 2025, 457 "arxiv_id": "2502.04510", 458 "relevance": "Optimizes multi-LLM collaboration via DAG structure; closely related approach to S-DAG's graph-structured reasoning." 459 }, 460 { 461 "title": "Knowledge Card: Filling LLMs' Knowledge Gaps with Plug-in Specialized Language Models", 462 "authors": ["S. Feng", "W. Shi", "Y. Bai", "V. Balachandran"], 463 "year": 2024, 464 "relevance": "Dynamically selects fine-tuned specialist LLMs to fill knowledge gaps; relevant to subject-specific expert assignment." 465 }, 466 { 467 "title": "GPTSwarm: Language Agents As Optimizable Graphs", 468 "authors": ["M. Zhuge", "W. Wang", "L. Kirsch", "F. Faccio"], 469 "year": 2024, 470 "relevance": "Represents LLM agents as optimizable graph structures; relevant to graph-based multi-agent collaboration design." 471 }, 472 { 473 "title": "MasRouter: Learning To Route LLMs for Multi-agent Systems", 474 "authors": ["Y. Yue", "G. Zhang", "B. Liu", "G. Wan"], 475 "year": 2025, 476 "arxiv_id": "2502.11133", 477 "relevance": "Cascaded routing controller for multi-agent LLM systems; directly relevant to LLM routing in MAS." 478 }, 479 { 480 "title": "When One LLM Drools, Multi-LLM Collaboration Rules", 481 "authors": ["S. Feng", "W. Ding", "A. Liu", "Z. 
Wang"], 482 "year": 2025, 483 "arxiv_id": "2502.04506", 484 "relevance": "Study of multi-LLM collaboration showing single LLMs struggle with multi-domain tasks; motivates the multi-agent approach." 485 }, 486 { 487 "title": "Improving Factuality And Reasoning in Language Models Through Multiagent Debate", 488 "authors": ["Y. Du", "S. Li", "A. Torralba", "J. B. Tenenbaum", "I. Mordatch"], 489 "year": 2023, 490 "relevance": "Multi-agent debate framework for improving LLM reasoning; baseline method (MAD) in S-DAG experiments." 491 }, 492 { 493 "title": "Smoa: Improving Multi-agent Large Language Models With Sparse Mixture-of-agents", 494 "authors": ["D. Li", "Z. Tan", "P. Qian", "Y. Li"], 495 "year": 2024, 496 "arxiv_id": "2411.03284", 497 "relevance": "Sparse mixture-of-agents approach for multi-LLM collaboration; related work on efficient multi-agent composition." 498 } 499 ] 500 }