scan.json (26450B)
1 { 2 "paper": { 3 "title": "Learning Decentralized LLM Collaboration with Multi-Agent Actor Critic", 4 "authors": ["Shuo Liu", "Tianle Chen", "Ryan Amiri", "Christopher Amato"], 5 "year": 2026, 6 "venue": "arXiv", 7 "arxiv_id": "2601.21972", 8 "doi": "10.48550/arXiv.2601.21972" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "The paper proposes CoLLM-CC and CoLLM-DC, multi-agent actor-critic methods for decentralized LLM collaboration. CoLLM-CC with a centralized critic consistently outperforms Monte Carlo methods (MAGRPO) and CoLLM-DC, particularly in long-horizon and sparse-reward settings (Minecraft, coding). In short-horizon dense-reward settings (writing), all three MARL methods achieve comparable performance. Decentralized MARL methods can match or exceed single larger model performance while enabling parallel inference.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Code is released at https://github.com/OpenMLRL/CoMLRL/releases/tag/v1.3.6 with separate repos for each experiment domain (writing, code generation, Minecraft), listed in Appendix H." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available datasets (TLDR, arXiv-public-datasets) and releases CoopHumanEval as a new dataset in the code repositories (Appendix H). Minecraft tasks are described with specifications." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or dependency version list is provided. Hardware is listed (Appendix G) but software environment specifications are absent." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "While code repositories are linked, the paper does not include step-by-step reproduction instructions, README commands, or scripts to replicate main experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": true, 41 "justification": "Figure 2 shows 95% bootstrapped confidence intervals as shaded regions. The paper states 'Shaded regions denote 95% bootstrapped confidence intervals.'" 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are reported. Claims of 'outperforms' and 'comparable' are based on comparing point estimates and visual inspection of confidence intervals, without formal tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 1 provides absolute performance numbers with baselines for context (e.g., CoLLM-CC 75.2% pass rate vs MAGRPO 74.3% vs raw model 56.3%), allowing readers to assess magnitude of differences." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for dataset sizes. CoopHE has only 82 problems (66 train, 16 test), Minecraft has 10 tasks (8 train, 2 test). No power analysis or justification for these small sizes." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Results are averaged over 5 runs (Table 1: 'Results are averaged over 5 runs'), and Figure 2 shows 95% bootstrapped confidence intervals capturing run-to-run variance." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Three categories of baselines: single-model (GRPO, AC), multi-agent test-time interaction (parallel, pipeline, discussion), and MARL (MAGRPO). Described in Section 6.2." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "MAGRPO is based on Liu et al. 2026a, and test-time interaction baselines follow Wu et al. 2023 and Du et al. 2023. The MARL baselines are contemporary to this work." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The comparison between CoLLM-CC (centralized critic) and CoLLM-DC (decentralized critics) serves as an ablation of the critic architecture. The paper systematically varies this component across all domains." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics per domain: writing uses structure, style consistency, logical coherence; coding uses pass rate and pass@k; Minecraft uses IoU, adjacency rate, health points. Response time and token cost also reported." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is included. Writing quality is assessed by automated metrics (length ratio, Jaccard similarity, transition words), not human judges. For a paper about collaborative writing and coding, human evaluation of output quality would be relevant." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Appendix C.1 clearly separates training and test sets for all domains (e.g., TLDR[0:1000] train / TLDR[1000:1100] test, CoopHE[0:66] train / CoopHE[66:82] test)." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down across 5 tasks (TLDR, arXiv, CoopHE, StrBuild, HouseBuild) in Table 1, with separate learning curves in Figure 2." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses where CoLLM-DC fails to converge (coding tasks at 4500 timesteps, Minecraft tasks), and where MAGRPO struggles (long-horizon settings). 
Section 6.3 discusses specific failure modes." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "CoLLM-DC's convergence failures in coding and Minecraft tasks are negative results. The paper also reports that prompt-based multi-agent methods perform poorly in Minecraft, and that single-model fine-tuning yields limited improvement in coding." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims CoLLM-CC outperforms in long-horizon/sparse-reward settings while Monte Carlo and CoLLM-DC are comparable in short-horizon/dense-reward settings. Table 1 and Figure 2 support these claims across all domains." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about why CoLLM-CC outperforms (centralized critic provides more accurate value estimates). The controlled experimental design (same agents, same tasks, same learning rate scaling) with ablation of the critic type provides adequate causal evidence for these architecture-level claims." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims 'Decentralized LLM Collaboration' generally, but experiments use only small models (1.7B-4B parameters) on proof-of-concept tasks. The paper acknowledges this in Limitations (Appendix I) but the title and abstract do not bound the claims to small-scale settings." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for the results. For example, CoLLM-CC's advantage could stem from having access to more information during training rather than the actor-critic architecture per se. No robustness checks against confounds." 
131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures specific metrics (pass rate, IoU, adjacency rate, writing scores) and frames claims at the level of those metrics. It does not claim broader 'collaboration quality' beyond what is measured." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model names with sizes are provided: Qwen3-1.7B, Qwen2.5-Coder-3B, Qwen3-4B-Instruct, Qwen3-4B-Instruct-2507, Qwen2.5-3B-Instruct. These are sufficiently specific versioned model releases." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt text for all agents in all three domains is provided in Appendix E, including writing collaboration, coding collaboration, and Minecraft instructions." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix C provides comprehensive hyperparameters: temperature, top-p, max tokens, learning rates, advantage clip, number of generations, rollout buffer size, training epochs, and evaluation samples for all methods and domains." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The multi-agent framework is described in detail: Algorithm 1 shows the full training procedure, Section 5 describes KV-cache history management, replay buffer, teacher-forced forward passes, and the feedback loop with external tools." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix C.1 lists exact dataset splits and indices. The CoopHE construction criteria are described in Section 6.1 (problems that 'naturally admit cooperative decomposition'). Reward design details in Appendix F document how outputs are processed." 
163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Appendix I is titled 'Limitations and Future Work' and discusses several limitations in substantive detail." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Appendix I discusses specific threats: inability to compare with MAGRPO on longer horizons due to sample efficiency, experiments limited to proof-of-concept settings due to compute constraints, and open question of scaling to larger MAS with more diverse agents." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Appendix I explicitly states: 'our experiments are limited to proof-of-concept settings. How LLM-based collaboration can scale to larger multi-agent systems with more diverse and heterogeneous agents remains an open question.' Also notes the strictly decentralized no-communication assumption." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (training logs, per-run results) is made available. Only aggregated results in Table 1 and smoothed curves in Figure 2 are shown." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data sources are described: TLDR Reddit posts, arXiv abstracts from arXiv-public-datasets, CoopHE constructed from HumanEval/MBPP filtering, Minecraft tasks with specifications. Collection procedures are clear." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All data comes from public datasets and constructed benchmarks." 
197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from data collection through reward computation is documented: dataset splits (Appendix C.1), reward computation (Appendix F), and training procedure (Algorithm 1). The coding feedback loop including static analysis and sandbox tests is described." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Section 8 (Acknowledgment) lists NSF grants #2044993 and #2409351, NCSA allocation CIS251326, Northeastern University computing, and Lambda Research Grant Program." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are affiliated with Northeastern University, Boston, MA. No commercial product is being evaluated, so no product-affiliation conflict exists." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "Funding is from NSF (federal research grants) and university computing resources. None of these funders have a financial interest in the outcomes of decentralized LLM collaboration research." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is included in the paper. Absence of disclosure is not the same as absence of conflict." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses pre-trained Qwen models (Qwen3-1.7B, Qwen2.5-Coder-3B, etc.) but does not state their training data cutoff dates. For CoopHE, it is unclear whether the base models saw HumanEval/MBPP problems during pre-training." 
231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether the Qwen models' pre-training data overlaps with TLDR, arXiv, or HumanEval/MBPP problems used in evaluation." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "CoopHE is derived from HumanEval and MBPP, which were published in 2021. The Qwen models used were likely trained after 2021 and may have seen these problems. This contamination risk is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 1 reports response time (seconds) and token cost (tokens/agent/turn) for all methods across all tasks. Inference device specified as NVIDIA GeForce RTX 5090." 
285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Table 3 (Appendix D.2) reports training duration in H200 hours, VRAM usage in GB, total samples, and total updates for each method. Appendix G lists all hardware used." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": true, 296 "justification": "Results are reported across 5 runs with 95% bootstrapped confidence intervals (Figure 2), showing run-to-run variance. Table 1 states 'Results are averaged over 5 runs.'" 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Explicitly stated: 'Results are averaged over 5 runs' (Table 1 caption) and '5 runs' repeated in Table 3." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. Appendix C lists final hyperparameters but does not describe how they were selected or how many configurations were tried." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No justification for how the final hyperparameters were selected. Different learning rates are used for different methods (e.g., MAGRPO 2×10⁻⁵ vs CoLLM 5×10⁻⁶ for coding) without explaining the selection process." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite making many comparisons across 5 tasks and multiple methods." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement their own baselines (MAGRPO, parallel, pipeline, discussion) and do not acknowledge the bias of evaluating against their own implementations." 
322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Table 3 compares training overhead (samples, updates, duration, VRAM) across methods. The paper explicitly discusses the compute-performance tradeoff: MAGRPO needs more samples but less training time." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "CoopHE is a custom benchmark filtering HumanEval/MBPP for 'cooperative decomposition' but the paper does not discuss whether this filtering actually captures meaningful collaborative coding or how well these tasks represent real multi-agent programming." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": true, 336 "justification": "All MARL methods use the same agent models and prompts, varying only the training algorithm. The scaffold (prompt design, feedback loop) is held constant across MARL comparisons, isolating the critic architecture effect." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "HumanEval (2021) and MBPP (2021) predate the Qwen models. The paper does not discuss whether solutions to these problems appeared in the models' training data." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the feedback from static analysis and sandbox tests provides information leakage that wouldn't be available in real deployment scenarios." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between train and test splits in CoopHE, TLDR, or arXiv datasets. CoopHE problems come from the same source (HumanEval/MBPP)." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "CoLLM-CC consistently outperforms Monte Carlo methods and CoLLM-DC, particularly in long-horizon tasks with sparse rewards.", 365 "evidence": "Table 1 shows CoLLM-CC achieves 68.5% IoU on StrBuild vs 50.6% for MAGRPO and 44.6% for CoLLM-DC; 86.4% HP and 52.7% IoU on HouseBuild vs 80.2%/50.9% and 43.8%/46.8%. Figure 2d-e show convergence advantages.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "In short-horizon dense-reward settings, Monte Carlo methods and CoLLM-DC achieve performance comparable to CoLLM-CC.", 370 "evidence": "Table 1 shows writing tasks: MAGRPO 93.5/93.1, CoLLM-DC 95.4/94.1, CoLLM-CC 95.2/95.0 on TLDR/arXiv scores. Differences are within confidence intervals in Figure 2a-b.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "MARL-based methods can achieve equal or better performance than a single larger model while enabling parallel inference.", 375 "evidence": "Table 1 shows MARL methods (1.7B-4B agents) match or exceed single 4B-7B model performance on most tasks, with faster inference time (e.g., 1.8s vs 5.0s on TLDR).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "K-sampling MA-REINFORCE variance scales as 1/K^(H-t) but inference cost grows as nK(K^H - 1)/(K-1).", 380 "evidence": "Propositions 4.2 and 4.3 with formal proofs in Appendix A. The practical implication is demonstrated by setting K=2 for Minecraft (H=4) vs K=4 for writing (H=1).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "CoLLM-DC fails to converge in long-horizon tasks due to non-stationarity accumulating across turns.", 385 "evidence": "Figure 2c-e shows CoLLM-DC oscillating or failing to improve. 
Section 6.3 attributes this to 'non-stationarity accumulates across 4 turns, leading to more unstable value estimates.'", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Very small evaluation sets", 392 "detail": "CoopHE test set has only 16 problems and Minecraft test sets have only 2 tasks each. Results on such small test sets may not be stable or generalizable." 393 }, 394 { 395 "flag": "No benchmark contamination analysis", 396 "detail": "CoopHE is derived from HumanEval/MBPP (2021). The Qwen models were likely trained on data containing these problems. No contamination analysis is performed." 397 }, 398 { 399 "flag": "Custom automated writing metrics of questionable validity", 400 "detail": "Writing quality is evaluated using Jaccard similarity, transition word frequency, and length ratios. These metrics have no established correlation with actual writing quality. No human evaluation validates the automated metrics." 401 }, 402 { 403 "flag": "Unequal hyperparameter tuning across methods", 404 "detail": "Different learning rates, buffer sizes, and epoch counts are used for MAGRPO vs CoLLM methods (e.g., MAGRPO 2×10⁻⁵ vs CoLLM 5×10⁻⁶ for coding). No justification for these choices or evidence of fair search budgets." 405 } 406 ], 407 "cited_papers": [ 408 { 409 "title": "ChatDev: Communicative Agents for Software Development", 410 "authors": ["Chen Qian", "Wei Liu", "Hongzhang Liu"], 411 "year": 2024, 412 "relevance": "Multi-agent LLM system for collaborative software development, directly relevant to agentic AI programming workflows." 413 }, 414 { 415 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 416 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"], 417 "year": 2023, 418 "arxiv_id": "2308.08155", 419 "relevance": "Major multi-agent LLM framework enabling collaborative conversation-based problem solving." 
420 }, 421 { 422 "title": "AgentBench: Evaluating LLMs as Agents", 423 "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"], 424 "year": 2023, 425 "arxiv_id": "2308.03688", 426 "relevance": "Benchmark for evaluating LLMs as autonomous agents across diverse environments." 427 }, 428 { 429 "title": "Evaluating Large Language Models Trained on Code", 430 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 431 "year": 2021, 432 "arxiv_id": "2107.03374", 433 "relevance": "Introduces HumanEval benchmark for code generation, foundational to LLM programming evaluation." 434 }, 435 { 436 "title": "DeepSeek-R1 Incentivizes Reasoning in LLMs through Reinforcement Learning", 437 "authors": ["Daya Guo", "Dejian Yang", "He Zhang"], 438 "year": 2025, 439 "doi": "10.1038/s41586-025-09422-z", 440 "relevance": "RL fine-tuning of LLMs for reasoning, directly relevant to RLHF/RLVR methodology for LLM improvement." 441 }, 442 { 443 "title": "Multi-Agent Deep Research: Training Multi-Agent Systems with M-GRPO", 444 "authors": ["Haotian Hong", "Jia Yin", "Yiwei Wang"], 445 "year": 2025, 446 "arxiv_id": "2511.13288", 447 "relevance": "Multi-agent GRPO method for training collaborative LLM systems, direct comparison baseline." 448 }, 449 { 450 "title": "MARFT: Multi-Agent Reinforcement Fine-Tuning", 451 "authors": ["Junhao Liao", "Mingzhou Wen", "Jiaqi Wang"], 452 "year": 2025, 453 "arxiv_id": "2504.16129", 454 "relevance": "MARL fine-tuning framework for LLMs, closely related approach to optimizing multi-agent collaboration." 455 }, 456 { 457 "title": "Heterogeneous Swarms: Jointly Optimizing Model Roles and Weights for Multi-LLM Systems", 458 "authors": ["Shangbin Feng", "Zifeng Wang", "Prithviraj Goyal"], 459 "year": 2025, 460 "arxiv_id": "2502.04510", 461 "relevance": "Optimization of heterogeneous multi-LLM systems with role specialization." 
462 }, 463 { 464 "title": "The Surprising Effectiveness of PPO in Cooperative Multi-Agent Games", 465 "authors": ["Chao Yu", "Akash Velu", "Eugene Vinitsky"], 466 "year": 2022, 467 "relevance": "Key MARL benchmark paper on cooperative multi-agent PPO, foundational to actor-critic methods used here." 468 }, 469 { 470 "title": "SWE-Agent: Agent-Computer Interfaces Enable Automated Software Engineering", 471 "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig"], 472 "year": 2024, 473 "relevance": "Agent-based automated software engineering system, relevant to agentic AI coding pipelines." 474 } 475 ] 476 }