scan.json (30525B)
1 { 2 "paper": { 3 "title": "MoCo: A One-Stop Shop for Model Collaboration Research", 4 "authors": [ 5 "Shangbin Feng", 6 "Yuyang Bai", 7 "Ziyuan Yang", 8 "Yike Wang", 9 "Zhaoxuan Tan", 10 "Jiajie Yan", 11 "Zhenyu Lei", 12 "Wenxuan Ding", 13 "Weijia Shi", 14 "Haojin Wang", 15 "Zhenting Qi", 16 "Yuru Jiang", 17 "Heng Wang", 18 "Chengsong Huang", 19 "Yu Fei", 20 "Jihan Yao", 21 "Yilun Du", 22 "Luke Zettlemoyer", 23 "Yejin Choi", 24 "Yulia Tsvetkov" 25 ], 26 "year": 2026, 27 "venue": "arXiv preprint", 28 "arxiv_id": "2601.21257", 29 "doi": "10.48550/arXiv.2601.21257" 30 }, 31 "scan_version": 2, 32 "active_modules": ["experimental_rigor", "data_leakage"], 33 "methodology_tags": ["benchmark-eval"], 34 "key_findings": "MOCO benchmarks 26 model collaboration algorithms across 25 datasets, finding that collaboration outperforms individual models in 61.0% of (model, data) settings. Weight-level and text-level methods are generally strongest, with Model Swarms and Sparta Alignment among the top performers. Model diversity matters more than model quantity for collaboration effectiveness, and approximately 18.5% of problems unsolvable by any individual model become solvable through collaboration ('collaborative emergence').", 35 "checklist": { 36 "artifacts": { 37 "code_released": { 38 "applies": true, 39 "answer": true, 40 "justification": "GitHub repository provided: https://github.com/BunsenFeng/model_collaboration. The paper states 'MOCO is publicly available at https://github.com/BunsenFeng/model_collaboration' and also promises a PyPI package (Appendix B)." 41 }, 42 "data_released": { 43 "applies": true, 44 "answer": true, 45 "justification": "MOCO integrates 25 publicly available evaluation datasets (GSM8k, MATH, HumanEval, MMLU-redux, etc.) with built-in support. All datasets are from public sources listed in Table 3." 
46 }, 47 "environment_specified": { 48 "applies": true, 49 "answer": false, 50 "justification": "No requirements.txt, Dockerfile, conda environment file, or detailed environment setup section found in the paper. The paper only mentions Python and refers to the GitHub repository." 51 }, 52 "reproduction_instructions": { 53 "applies": true, 54 "answer": false, 55 "justification": "Figure 1 shows a high-level diagram ('git clone moco', 'moco -c config.json') but no step-by-step instructions for reproducing the specific experiments in Table 1. No 'Reproducing Results' section or scripts to replicate the main experiments." 56 } 57 }, 58 "statistical_methodology": { 59 "confidence_intervals_or_error_bars": { 60 "applies": true, 61 "answer": false, 62 "justification": "Table 1 reports only point estimates for all 26 methods across all domains. No confidence intervals, error bars, or ± notation anywhere in the main results." 63 }, 64 "significance_tests": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper claims methods 'outperform' baselines and identifies 'best' methods based solely on comparing numbers without any statistical significance tests. No p-values, t-tests, or bootstrap tests reported." 68 }, 69 "effect_sizes_reported": { 70 "applies": true, 71 "answer": true, 72 "justification": "The paper reports '61.0% of (model, data) settings' show improvement and 'up to 25.8%' improvement. Table 1 provides both 'Best Single' baseline numbers and method scores across all domains, enabling effect size assessment. Weight-level average (60.1) vs global average (53.5) is also reported." 73 }, 74 "sample_size_justified": { 75 "applies": true, 76 "answer": false, 77 "justification": "No justification for key design choices: why 2 model pools, why 3 models per pool, why 11 of 25 available datasets, or why 1k downsampling. The paper states 'We by default downsample to 1k for both the dev and test sets' without justification." 
78 }, 79 "variance_reported": { 80 "applies": true, 81 "answer": false, 82 "justification": "Main results in Table 1 show no variance or standard deviation despite using stochastic decoding (τ=0.7, top-p=0.9). Table 2 shows std dev for a leave-one-out sensitivity analysis but this covers only one method (multiagent debate) on 3 datasets, not the main results." 83 } 84 }, 85 "evaluation_design": { 86 "baselines_included": { 87 "applies": true, 88 "answer": true, 89 "justification": "Table 1 includes 'Best Single' baseline (best individual model without collaboration) for both model pools. All 26 collaboration methods are compared against this baseline." 90 }, 91 "baselines_contemporary": { 92 "applies": true, 93 "answer": true, 94 "justification": "Model pool #2 uses QWEN-2.5-7B, LLAMA-3.1-8B, and OLMO-3-7B — all released in 2024-2025. The collaboration methods span recent work from 2022-2025." 95 }, 96 "ablation_study": { 97 "applies": true, 98 "answer": true, 99 "justification": "Table 2 provides leave-one-out analysis on model composition for multiagent debate. Figure 3 ablates model diversity (a×b settings: 1×8, 2×4, 4×2, 8×1). Figure 2 ablates the number of models (2, 4, 8, 16). These test which factors contribute to collaboration effectiveness." 100 }, 101 "multiple_metrics": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper evaluates across 6 domains (QA, math, reasoning, safety, coding, instruction following) using task accuracy, generative verifiers, and reward models. Multiple evaluation approaches are combined." 105 }, 106 "human_evaluation": { 107 "applies": true, 108 "answer": false, 109 "justification": "All evaluation is automated: task accuracy, generative verifiers (Ma et al., 2025), and reward models (Liu et al., 2024b). No human evaluation of system outputs is performed." 
110 }, 111 "held_out_test_set": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper explicitly separates dev and test splits: 'We by default downsample to 1k for both the dev and test sets.' Methods requiring training (e.g., Trained Router, LLM Blender) use the dev set; results are reported on the test set." 115 }, 116 "per_category_breakdown": { 117 "applies": true, 118 "answer": true, 119 "justification": "Table 1 provides per-domain breakdowns across QA, math, reasoning, safety, code, and instruction following for all 26 methods and both model pools." 120 }, 121 "failure_cases_discussed": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper discusses where methods fail: Mentor Collab performs poorly across most settings (Table 1), 'refining generation is challenging for safety and refusal scenarios in the CocoNot dataset' (Section 4), and routing is less effective with general-purpose LMs due to 'artificial hivemind phenomenon.'" 125 }, 126 "negative_results_reported": { 127 "applies": true, 128 "answer": true, 129 "justification": "Table 1 shows numerous methods underperforming the baseline (39% of settings). Mentor Collab scores 0.316/0.419 avg vs 0.549/0.582 baseline. Logit Contrastive, Logit Fusion, and several text-level methods show degradation. The paper reports these results transparently." 130 } 131 }, 132 "claims_and_evidence": { 133 "abstract_claims_supported": { 134 "applies": true, 135 "answer": true, 136 "justification": "Abstract claims of '61.0% of (model, data) settings' improvement and 'up to 25.8%' outperformance are supported by Table 1. Claims about text-level and weight-level methods being effective are supported by the results section." 
137 }, 138 "causal_claims_justified": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper's causal claims ('X outperforms Y', 'model diversity benefits collaboration') are supported by controlled comparisons: same benchmarks, same model pools, varying only the collaboration method. The leave-one-out analysis (Table 2) and diversity experiment (Figure 3) are controlled single-variable manipulations." 142 }, 143 "generalization_bounded": { 144 "applies": true, 145 "answer": false, 146 "justification": "The title claims 'A One-Stop Shop for Model Collaboration Research' and the paper makes broad claims about 'model collaboration' as a paradigm and 'collaborative and decentralized AI future.' Results are limited to 7B-8B parameter models, specific benchmarks, and default hyperparameters. The paper does not bound its claims to these settings." 147 }, 148 "alternative_explanations_discussed": { 149 "applies": true, 150 "answer": false, 151 "justification": "The paper does not discuss alternative explanations for observed results. For example, 'collaborative emergence' (18.5% of impossible problems solved) could partly reflect random variation with stochastic decoding or majority voting effects, but these alternatives are not considered." 152 }, 153 "proxy_outcome_distinction": { 154 "applies": true, 155 "answer": false, 156 "justification": "The paper measures benchmark accuracy and reward model scores but frames results as evidence for 'model collaboration' as a paradigm for 'compositional AI systems' and a 'collaborative and decentralized AI future.' The gap between benchmark performance and real-world collaboration effectiveness is not acknowledged." 
157 } 158 }, 159 "setup_transparency": { 160 "model_versions_specified": { 161 "applies": true, 162 "answer": true, 163 "justification": "Exact HuggingFace model IDs are listed in Appendix B: QWEN/QWEN2.5-7B-INSTRUCT, META-LLAMA/LLAMA-3.1-8B-INSTRUCT, ALLENAI/OLMO-3-7B-INSTRUCT, plus 16 specific models for Figure 2 experiments." 164 }, 165 "prompts_provided": { 166 "applies": true, 167 "answer": false, 168 "justification": "Actual prompt text is not provided for any of the methods. The paper explicitly states: 'please refer to the software repository for details.' Prompt routing is described as 'we prompt an LLM to select the best-fitting model' without the actual text." 169 }, 170 "hyperparameters_reported": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 3 reports key generation hyperparameters: 'τ = 0.7 and p = 0.9 for temperature and top-p sampling,' '512 max new tokens, with an exception of 1024 for coding tasks.' Method-specific hyperparameters are deferred to the MOCO codebase ('We employ the default hyperparameters provided in MOCO')." 174 }, 175 "scaffolding_described": { 176 "applies": false, 177 "answer": false, 178 "justification": "No agentic scaffolding is used. The collaboration methods are algorithmic protocols for multi-LLM interaction, not agentic scaffolds with tools, memory, or feedback loops." 179 }, 180 "data_preprocessing_documented": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 2.2 describes downsampling: 'We by default downsample to 1k for both the dev and test sets.' Table 3 provides full dataset statistics. Section 3 describes the evaluation pipeline including normalization ('we normalize [IF] scores with min-max standardization to 0-1')." 184 } 185 }, 186 "limitations_and_scope": { 187 "limitations_section_present": { 188 "applies": true, 189 "answer": false, 190 "justification": "No limitations section exists. 
The paper has a Discussion (Section 5) that poses open research questions and an Impact Statement about malicious actors, but neither discusses limitations of the current study." 191 }, 192 "threats_to_validity_specific": { 193 "applies": true, 194 "answer": false, 195 "justification": "No threats to validity are discussed. The paper does not acknowledge potential issues such as results being specific to 7B-8B models, sensitivity to hyperparameter defaults, or the limitation of automated evaluation." 196 }, 197 "scope_boundaries_stated": { 198 "applies": true, 199 "answer": false, 200 "justification": "The paper does not explicitly state what the results do NOT show. No discussion of excluded settings, populations, or claims the authors are not making. The broad framing ('one-stop shop', 'decentralized AI future') contrasts with the narrow experimental scope." 201 } 202 }, 203 "data_integrity": { 204 "raw_data_available": { 205 "applies": true, 206 "answer": false, 207 "justification": "Raw experimental outputs (model generations, per-example scores, intermediate results) are not mentioned as available. Only the MOCO toolkit code and public benchmark datasets are released." 208 }, 209 "data_collection_described": { 210 "applies": true, 211 "answer": true, 212 "justification": "All 25 datasets are described with sources, sizes, and splits in Table 3. Model pools are specified with exact model identifiers. The evaluation methodology using task accuracy, generative verifiers, and reward models is described in Section 3." 213 }, 214 "recruitment_methods_described": { 215 "applies": false, 216 "answer": false, 217 "justification": "No human participants. All data sources are standard public benchmarks." 
218 }, 219 "data_pipeline_documented": { 220 "applies": true, 221 "answer": true, 222 "justification": "The pipeline is documented: public datasets → downsampled to 1k → run collaboration methods with specified models → evaluate with task-specific metrics (accuracy, verifiers, reward models) → normalize IF scores → report macro-average per domain." 223 } 224 }, 225 "conflicts_of_interest": { 226 "funding_disclosed": { 227 "applies": true, 228 "answer": false, 229 "justification": "No funding sources are disclosed anywhere in the paper. No acknowledgments section listing grants or sponsors." 230 }, 231 "affiliations_disclosed": { 232 "applies": true, 233 "answer": true, 234 "justification": "All author affiliations are listed: University of Washington, Texas A&M University, University of Notre Dame, University of Virginia, NYU, UIUC, Harvard, Zhejiang University, WashU, UC Irvine, Stanford." 235 }, 236 "funder_independent_of_outcome": { 237 "applies": true, 238 "answer": false, 239 "justification": "No funding information is disclosed, making it impossible to assess funder independence. Academic researchers at this level likely have funding, but it is not reported." 240 }, 241 "financial_interests_declared": { 242 "applies": true, 243 "answer": false, 244 "justification": "No competing interests statement or financial disclosure of any kind appears in the paper." 245 } 246 }, 247 "contamination": { 248 "training_cutoff_stated": { 249 "applies": true, 250 "answer": false, 251 "justification": "No training data cutoff dates are stated for QWEN-2.5-7B, LLAMA-3.1-8B, OLMO-3-7B, or any of the 16 models used in the scaling experiments." 252 }, 253 "train_test_overlap_discussed": { 254 "applies": true, 255 "answer": false, 256 "justification": "Many benchmarks (HumanEval 2021, GSM8k 2021, MATH 2021, MMLU) have been publicly available for years and are likely in the training data of 2024-2025 models. This is not discussed." 
257 }, 258 "benchmark_contamination_addressed": { 259 "applies": true, 260 "answer": false, 261 "justification": "HumanEval, GSM8k, MATH, and MMLU were all published before the training cutoffs of the evaluated models. No contamination analysis or discussion is provided." 262 } 263 }, 264 "human_studies": { 265 "pre_registered": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "irb_or_ethics_approval": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "demographics_reported": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "inclusion_exclusion_criteria": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 }, 285 "randomization_described": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this study." 289 }, 290 "blinding_described": { 291 "applies": false, 292 "answer": false, 293 "justification": "No human participants in this study." 294 }, 295 "attrition_reported": { 296 "applies": false, 297 "answer": false, 298 "justification": "No human participants in this study." 299 } 300 }, 301 "cost_and_practicality": { 302 "inference_cost_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "Table 4 in Appendix A provides theoretical FLOPs complexity analysis, not measured inference costs. No API costs, wall-clock time, tokens consumed, or cost per example are reported for any experiment." 306 }, 307 "compute_budget_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "No total GPU hours, hardware specifications, training time, or total compute budget stated for any of the experiments. 
The paper does not mention what GPUs were used despite Section 2.3 emphasizing 'any hardware setting.'" 311 } 312 }, 313 "experimental_rigor": { 314 "seed_sensitivity_reported": { 315 "applies": true, 316 "answer": false, 317 "justification": "No random seed sensitivity analysis is reported. Main results in Table 1 appear to be from single runs with stochastic decoding (τ=0.7, top-p=0.9) but no seed variation is tested." 318 }, 319 "number_of_runs_stated": { 320 "applies": true, 321 "answer": false, 322 "justification": "The number of experimental runs producing Table 1 results is not stated. It is unclear whether results are single-run or averaged." 323 }, 324 "hyperparameter_search_budget": { 325 "applies": true, 326 "answer": false, 327 "justification": "The paper states 'We employ the default hyperparameters provided in MOCO' without reporting any search budget or justification for defaults. For 26 methods with method-specific hyperparameters, no tuning process is documented." 328 }, 329 "best_config_selection_justified": { 330 "applies": true, 331 "answer": false, 332 "justification": "No justification for the selected configurations. Default hyperparameters are used without explanation of how defaults were determined or whether they are optimal for the tested settings." 333 }, 334 "multiple_comparison_correction": { 335 "applies": true, 336 "answer": false, 337 "justification": "26 methods are compared across 12 settings (2 pools × 6 domains) without any correction for multiple comparisons. Rankings and claims of 'best' methods are based on raw numbers." 338 }, 339 "self_comparison_bias_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "Many methods are from the first author's research group: Knowledge Card (Feng et al. 2024a), Multiagent Feedback (Feng et al. 2024b), Model Swarms (Feng et al. 2025c), Switch Generation (Feng et al. 2025d), Graph Router (Feng et al. 2025e), Heterogeneous Swarms (Feng et al. 
2025b), Sparta Alignment (Jiang et al. 2025c, co-authored). This self-comparison bias is not acknowledged." 343 }, 344 "compute_budget_vs_performance": { 345 "applies": true, 346 "answer": false, 347 "justification": "Methods have vastly different computational costs (e.g., single-pass routing vs multi-round debate vs model merging) but performance is not reported as a function of compute budget. Table 4 provides theoretical complexity but no compute-normalized comparison." 348 }, 349 "benchmark_construct_validity": { 350 "applies": true, 351 "answer": false, 352 "justification": "25 benchmarks are used without any discussion of whether they actually measure the claimed capabilities. No analysis of construct validity or comparison with alternative benchmarks for the same capabilities." 353 }, 354 "scaffold_confound_addressed": { 355 "applies": false, 356 "answer": false, 357 "justification": "The collaboration method IS the thing being compared — the paper tests different collaboration algorithms as the independent variable, so there is no scaffold confound." 358 } 359 }, 360 "data_leakage": { 361 "temporal_leakage_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "Models trained through 2024-2025 are evaluated on benchmarks published 2019-2023 (HumanEval, GSM8k, MATH, MMLU). Temporal leakage is not discussed." 365 }, 366 "feature_leakage_addressed": { 367 "applies": true, 368 "answer": false, 369 "justification": "No discussion of whether evaluation setups leak answer information. Not addressed." 370 }, 371 "non_independence_addressed": { 372 "applies": true, 373 "answer": false, 374 "justification": "No discussion of train/test independence. Multiple benchmarks are well-known and likely present in the training data of evaluated models." 375 }, 376 "leakage_detection_method": { 377 "applies": true, 378 "answer": false, 379 "justification": "No leakage detection or prevention method is applied. 
No canary strings, membership inference tests, or decontamination pipelines." 380 } 381 } 382 }, 383 "claims": [ 384 { 385 "claim": "Model collaboration outperforms individual models in 61.0% of (model, data) settings on average.", 386 "evidence": "Table 1 shows results across 26 methods, 2 model pools, and 6 evaluation domains. Orange-highlighted cells indicate improvement over 'Best Single' baseline (Section 4).", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "The most effective collaboration methods outperform individual models by up to 25.8%.", 391 "evidence": "Table 1 shows the best methods (Model Swarms, Sparta) achieving substantial gains, e.g., Model Swarms at 0.729 avg vs 0.549 baseline in pool #1 (Section 4).", 392 "supported": "weak" 393 }, 394 { 395 "claim": "Weight-level collaboration is generally the most effective, achieving 60.1 average vs 53.5 global average.", 396 "evidence": "Table 1 results, Section 4. However, weight-level methods require shared architecture and only work with model pool #1 — this is acknowledged but the generality claim is still broad.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "Model collaboration benefits more from diversity of models than from quantity.", 401 "evidence": "Figure 3 shows consistent upward trend from 1×8 to 8×1 configurations at fixed total pool size, across reasoning and QA domains (Section 5).", 402 "supported": "moderate" 403 }, 404 { 405 "claim": "Collaborative emergence: 18.5% of previously 'impossible' problems (unsolvable by any individual model) become solvable through model collaboration.", 406 "evidence": "Figures 4, 6, 7, 8 show collaborative emergence rates across math (19.2%), reasoning (17.7%), QA (15.8%), safety (14.1%), and coding (17.6%) domains (Section 5).", 407 "supported": "moderate" 408 }, 409 { 410 "claim": "Scaling from 2 to 16 models shows consistent upward performance trends for collaboration methods.", 411 "evidence": "Figure 2 shows scaling curves across 
reasoning, QA, and safety domains. Text-level and weight-level methods show clearer scaling than API-level routing (Section 5).", 412 "supported": "moderate" 413 } 414 ], 415 "red_flags": [ 416 { 417 "flag": "Self-comparison bias", 418 "detail": "Many of the 26 methods originate from the first author's research group (Feng et al. 2024a, 2024b, 2025a-f, plus the co-authored Sparta Alignment, Jiang et al. 2025c). Model Swarms and Heterogeneous Swarms — among the top performers — are both first-authored by Feng. The paper does not acknowledge this potential bias in implementation or evaluation." 419 }, 420 { 421 "flag": "No uncertainty quantification on main results", 422 "detail": "Table 1 reports only point estimates for all 26 methods despite stochastic decoding (τ=0.7, top-p=0.9). Without error bars or confidence intervals, it is impossible to know whether differences between methods are meaningful or within noise. Rankings could change entirely with different random seeds." 423 }, 424 { 425 "flag": "Cherry-picked maximum improvement framing", 426 "detail": "The abstract highlights 'up to 25.8%' improvement, which is the most favorable case (Model Swarms with specialized LMs). The 39% of settings where collaboration hurts performance is mentioned only in the results ('61.0% of settings') without emphasis. Median improvement is not reported." 427 }, 428 { 429 "flag": "Complete absence of contamination discussion", 430 "detail": "Multiple benchmarks (HumanEval 2021, GSM8k 2021, MATH 2021, MMLU) have been publicly available for years before the 2024-2025 models were trained. Collaboration gains could partly reflect differential contamination across models rather than genuine collaboration benefits." 431 }, 432 { 433 "flag": "No limitations section", 434 "detail": "The paper has no limitations section or discussion of threats to validity. 
Results are limited to 7B-8B models, default hyperparameters, specific benchmarks, and automated evaluation, but none of these boundaries are acknowledged." 435 }, 436 { 437 "flag": "Non-reproducibility disclaimer", 438 "detail": "Section 2.1 explicitly states: 'MOCO does not aim to be a reproducibility study: we adapt the core ideas behind related papers and employ what works flexibly.' This means the 26 methods may not faithfully represent the original algorithms, yet comparative claims are made." 439 } 440 ], 441 "cited_papers": [ 442 { 443 "title": "Improving factuality and reasoning in language models through multiagent debate", 444 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba", "Joshua B. Tenenbaum", "Igor Mordatch"], 445 "year": 2023, 446 "relevance": "Foundational work on multi-LLM debate for improving reasoning, directly implemented as Method #9 in MOCO." 447 }, 448 { 449 "title": "RouteLLM: Learning to route LLMs from preference data", 450 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu", "Wei-Lin Chiang", "Tianhao Wu", "Joseph E. Gonzalez", "M. Waleed Kadous", "Ion Stoica"], 451 "year": 2025, 452 "relevance": "LLM routing library and methodology for selecting best model per query; implemented as Method #4 (Trained Router) in MOCO." 453 }, 454 { 455 "title": "When one LLM drools, multi-LLM collaboration rules", 456 "authors": ["Shangbin Feng", "Wenxuan Ding", "Alisa Liu", "Zifeng Wang", "Weijia Shi", "Yike Wang"], 457 "year": 2025, 458 "arxiv_id": "2502.04506", 459 "relevance": "Position paper on model collaboration taxonomy defining four levels of cross-model information exchange that MOCO operationalizes." 
460 }, 461 { 462 "title": "A survey on model merging: Recycling and routing among specialized experts for collaborative learning", 463 "authors": ["Prateek Yadav", "Colin Raffel", "Mohammed Muqeeth", "Lucas Caccia", "Haokun Liu", "Tianlong Chen", "Mohit Bansal", "Leshem Choshen", "Alessandro Sordoni"], 464 "year": 2024, 465 "relevance": "Comprehensive survey on model merging approaches relevant to weight-level collaboration methods in MOCO." 466 }, 467 { 468 "title": "Learning to decode collaboratively with multiple language models", 469 "authors": ["Zejiang Shen", "Hunter Lang", "Bailin Wang", "Yoon Kim", "David Sontag"], 470 "year": 2024, 471 "relevance": "Co-LLM method for collaborative decoding with a deferral model, implemented as Method #7 in MOCO." 472 }, 473 { 474 "title": "LLM-Blender: Ensembling large language models with pairwise ranking and generative fusion", 475 "authors": ["Dongfu Jiang", "Xiang Ren", "Bill Yuchen Lin"], 476 "year": 2023, 477 "relevance": "Ensembling approach using pairwise ranking and fusion for multi-LLM outputs, implemented as Method #11 and among the top performers." 478 }, 479 { 480 "title": "Model Swarms: Collaborative search to adapt LLM experts via swarm intelligence", 481 "authors": ["Shangbin Feng", "Zifeng Wang", "Yike Wang", "Sayna Ebrahimi", "Hamid Palangi"], 482 "year": 2025, 483 "relevance": "Swarm intelligence approach for model weight-space search, implemented as Method #24 and the top-performing method in pool #1." 484 }, 485 { 486 "title": "Tuning language models by proxy", 487 "authors": ["Alisa Liu", "Xiaochuang Han", "Yizhong Wang", "Yulia Tsvetkov", "Yejin Choi", "Noah A. Smith"], 488 "year": 2024, 489 "relevance": "Logit-level proxy tuning approach for cross-model collaboration, related to logit-level methods in MOCO." 
490 }, 491 { 492 "title": "Arcee's MergeKit: A toolkit for merging large language models", 493 "authors": ["Charles Goddard", "Shamane Siriwardhana", "Malikeh Ehghaghi"], 494 "year": 2024, 495 "relevance": "Open-source toolkit for model merging used by MOCO's weight-level collaboration implementations." 496 }, 497 { 498 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 499 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 500 "year": 2023, 501 "relevance": "LLM cascading approach for cost-efficient inference, foundational work for MOCO's cascade method (Method #6)." 502 }, 503 { 504 "title": "Multiagent finetuning: Self improvement with diverse reasoning chains", 505 "authors": ["Vighnesh Subramaniam", "Yilun Du", "Joshua B. Tenenbaum", "Antonio Torralba", "Shuang Li", "Igor Mordatch"], 506 "year": 2025, 507 "relevance": "Multi-agent finetuning with majority vote consensus, implemented as Method #15 and among the top text-level methods." 508 }, 509 { 510 "title": "Sparta alignment: Collectively aligning multiple language models through combat", 511 "authors": ["Yuru Jiang", "Wenxuan Ding", "Shangbin Feng", "Greg Durrett", "Yulia Tsvetkov"], 512 "year": 2025, 513 "relevance": "Multi-LLM competitive alignment via mutual evaluation and preference optimization, implemented as Method #18 and among the top performers." 514 } 515 ] 516 }