scan-v5.json (25408B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Learning Decentralized LLM Collaboration with Multi-Agent Actor Critic", 6 "authors": [ 7 "Shuo Liu", 8 "Tianle Chen", 9 "Ryan Amiri", 10 "Christopher Amato" 11 ], 12 "year": 2026, 13 "venue": "arXiv.org", 14 "arxiv_id": "2601.21972", 15 "doi": "10.48550/arXiv.2601.21972" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "Abstract claims about MAGRPO/CoLLM-DC matching CoLLM-CC in dense-reward settings and underperforming in sparse-reward/long-horizon settings are directly supported by Table 1 and Figure 2 results across all five tasks.", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Causal claims about CoLLM-CC outperforming alternatives are backed by controlled comparisons with matched hyperparameter budgets across 5 runs; theoretical propositions (4.1–4.3) provide mechanistic justification.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": false, 34 "justification": "The conclusion broadly claims 'MARL-based methods can achieve equal or better performance than a single larger model' from proof-of-concept experiments with 2–100 test examples per task using 1.7B–4B models; limitations acknowledge this but the main text overstates generalizability.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "CoLLM-CC's advantages are attributed solely to variance reduction and critic stationarity; alternative explanations such as critic model capacity, specific reward shaping choices, or task-specific artifacts are not considered.", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper equates high reward with 'high-quality content' 
(Section 6.3) but writing quality is measured via automated proxies (Jaccard similarity, transition word frequency, length ratios) without validation that these proxies correlate with actual human-judged quality.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "Appendix I contains a dedicated Limitations and Future Work section discussing proof-of-concept scale, compute constraints, CoLLM-DC parameter-sharing trade-offs, and open questions about scaling.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": false, 60 "justification": "Limitations reference 'proof-of-concept settings' and compute constraints broadly but do not specifically address the 2-example Minecraft test sets, the validity of automated writing proxies, absence of significance tests, or benchmark contamination.", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper explicitly states that scaling to larger, more heterogeneous multi-agent systems remains open, and that the strictly decentralized no-communication assumption bounds the applicable settings.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": true, 74 "justification": "NSF grants (#2044993, #2409351, and multiple others) and computing grants via NCAR and Lambda's Research Grant Program are disclosed in the Acknowledgment section.", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "All four authors are affiliated with Northeastern University, Boston, MA, disclosed on the title page.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": true, 86 "justification": "NSF is an independent government funder with no commercial stake 
in the MARL framework being evaluated.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests, patent, or financial interests statement appears anywhere in the paper.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "Decentralized LLM collaboration (Section 3.1), LLM Dec-POMDP (Section 3.2), and actor-critic methods including DC and CC variants (Section 4.2) are formally defined with mathematical notation.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "Three contributions are explicitly enumerated in the introduction: MAAC methods for decentralized LLM collaboration, two concrete algorithms (CoLLM-CC and CoLLM-DC), and empirical validation across three domains.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 2 situates the work relative to LLM collaboration frameworks with predefined protocols and MARL actor-critic literature, explaining how this work extends beyond both.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "Code released at https://github.com/OpenMLRL/CoMLRL/releases/tag/v1.3.6 and three task-specific repositories (writing, coding, Minecraft) all at version 1.3.6.", 124 "source": "haiku" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "Public datasets (TLDR, arXiv public datasets, HumanEval, MBPP) are used, and the novel CoopHE dataset is available through the GitHub repositories listed in Appendix H.", 130 "source": "haiku" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 
"justification": "Hardware specs and model names are listed in Appendix G but no requirements.txt, Dockerfile, or pip dependency list is provided in the paper or referenced for the repositories.", 136 "source": "haiku" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": true, 141 "justification": "GitHub repositories are linked; detailed hyperparameters (Appendix C.3), model architectures (C.2), dataset splits (C.1), prompts (E), and reward functions (F) provide sufficient detail to reconstruct experiments.", 142 "source": "haiku" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": true, 149 "justification": "Figure 2 explicitly displays 95% bootstrapped confidence intervals as shaded regions around all learning curves for all three methods.", 150 "source": "haiku" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "No statistical significance tests (t-tests, Mann-Whitney, etc.) 
are reported for performance comparisons in Table 1; results are averaged over 5 runs without p-values.", 156 "source": "haiku" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Table 1 reports absolute performance percentages for all methods, enabling direct comparison of effect magnitudes with baseline context (e.g., CoLLM-CC 75.2% vs MAGRPO 74.3% vs CoLLM-DC 59.1% on CoopHE).", 162 "source": "haiku" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "The choice of 5 runs is not justified by power analysis; critically, Minecraft test sets contain only 2 examples each (StrBuild[8:10], HouseBuild[8:10]) with no justification for this size.", 168 "source": "haiku" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": true, 173 "justification": "Figure 2 shows 95% bootstrapped confidence intervals for all learning curves; Table 1 results are described as averaged over 5 runs.", 174 "source": "haiku" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "Three baseline categories are included: single-model (raw + fine-tuned), multi-agent test-time interaction (parallel, pipeline, discussion), and MARL (MAGRPO).", 182 "source": "haiku" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "MAGRPO (2026), current Qwen3 models, and recent frameworks are used; baselines are drawn from the same model families as the proposed methods.", 188 "source": "haiku" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": true, 193 "justification": "Comparison between CoLLM-CC (centralized critic) and CoLLM-DC (decentralized critics) is a direct ablation of the key design choice, and MAGRPO vs MAAC ablates the Monte Carlo vs actor-critic distinction.", 194 "source": "haiku" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 
"answer": true, 199 "justification": "Table 1 reports response time, token cost, and task-specific performance; additional metrics include pass@k, IoU, adjacency rate, health points, and training overhead (Table 3).", 200 "source": "haiku" 201 }, 202 "human_evaluation": { 203 "applies": true, 204 "answer": false, 205 "justification": "No human evaluation of generated outputs is performed; writing quality is assessed entirely via automated metrics without human judgment validation.", 206 "source": "haiku" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": true, 211 "justification": "Appendix C.1 specifies separate training and test set indices for all tasks (e.g., TLDR[0:1000] train vs TLDR[1000:1100] test).", 212 "source": "haiku" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Table 1 provides per-dataset breakdowns across all 5 task variants; Table 2 provides pass@k breakdowns for CoopHE; Figure 2 shows per-task learning curves.", 218 "source": "haiku" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "CoLLM-DC's failure to converge on Minecraft tasks and MAGRPO's sample inefficiency are discussed with mechanistic attribution (non-stationarity accumulation, reward sparsity).", 224 "source": "haiku" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "Negative results reported: CoLLM-DC fails to converge on long-horizon tasks, and prompt-based multi-agent methods underperform single-model baselines without MARL optimization.", 230 "source": "haiku" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": true, 237 "justification": "Exact model names provided: Qwen3-1.7B, Qwen2.5-Coder-3B, Qwen3-4B-Instruct-2507, Qwen2.5-3B-Instruct, Qwen2.5-7B Coder and Instruct variants.", 238 "source": "haiku" 239 }, 240 "prompts_provided": { 241 
"applies": true, 242 "answer": true, 243 "justification": "Appendix E provides complete verbatim prompts for all task types: TLDR summarization, arXiv expansion, coding collaboration, StrBuild, and HouseBuild agents.", 244 "source": "haiku" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": true, 249 "justification": "Appendix C.3 provides comprehensive hyperparameters per task: learning rates, temperature, top-p, buffer sizes, epoch counts, advantage clip, and evaluation sample counts.", 250 "source": "haiku" 251 }, 252 "scaffolding_described": { 253 "applies": true, 254 "answer": true, 255 "justification": "The MAAC framework is described via formal pseudocode (Algorithms 1 and 2) with detailed rollout, replay buffer, and training phase descriptions for both CC and DC variants.", 256 "source": "haiku" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "Dataset splits are documented in Appendix C.1; CoopHE construction from HumanEval/MBPP selection criteria is described in Section 6.1; reward computation pipelines are in Appendix F.", 262 "source": "haiku" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": true, 269 "justification": "Public datasets (TLDR, arXiv) are accessible via standard sources, and CoopHE is available through GitHub repositories listed in Appendix H.", 270 "source": "haiku" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "CoopHE construction is described: problems requiring cooperative decomposition were selected from HumanEval/MBPP with the auxiliary function named 'aux' and the main function signature provided in the prompt.", 276 "source": "haiku" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants; all experiments use automated benchmark evaluation on pre-existing or 
constructed datasets.", 282 "source": "haiku" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": true, 287 "justification": "The full pipeline from dataset splits (Appendix C.1) through reward computation (Appendix F) to evaluation metrics is documented across the appendices.", 288 "source": "haiku" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Training data cutoffs for Qwen3 and Qwen2.5 models are not stated; this is directly relevant since CoopHE derives from HumanEval/MBPP which likely appear in pre-training corpora.", 296 "source": "haiku" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": false, 301 "justification": "No discussion of potential overlap between the pre-trained models' training data and benchmark tasks (HumanEval, MBPP) used in evaluation.", 302 "source": "haiku" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 "answer": false, 307 "justification": "HumanEval (2021) and MBPP (2021) predate Qwen model training cutoffs, making contamination plausible; this is not addressed or acknowledged in the paper.", 308 "source": "haiku" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants in this study.", 316 "source": "haiku" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants in this study.", 322 "source": "haiku" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants in this study.", 328 "source": "haiku" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants in this study.", 334 "source": "haiku" 335 }, 336 "randomization_described": { 337 "applies": false, 338 "answer": false, 339 
"justification": "No human participants in this study.", 340 "source": "haiku" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants in this study.", 346 "source": "haiku" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants in this study.", 352 "source": "haiku" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": true, 359 "justification": "Table 1 reports response time (seconds on RTX 5090) and token cost (tokens/agent/turn) for all methods across all five tasks.", 360 "source": "haiku" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": true, 365 "justification": "Appendix G lists all hardware used for training and inference; Appendix D.2 provides training overhead in H200 hours, VRAM usage in GB, and sample/update counts for all methods.", 366 "source": "haiku" 367 } 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "CoLLM-CC consistently outperforms MAGRPO and CoLLM-DC in long-horizon, sparse-reward tasks.", 374 "evidence": "Table 1 shows CoLLM-CC achieving 75.2% pass rate on CoopHE vs MAGRPO 74.3% and CoLLM-DC 59.1%; 68.5% IoU on StrBuild vs 50.6% and 44.6%; 52.7% on HouseBuild vs 50.9% and 46.8%.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Monte Carlo methods require substantially more samples to converge in sparse-reward and long-horizon settings.", 379 "evidence": "Figure 2c shows MAGRPO reaching stability at ~5000 timesteps vs ~2000 for CoLLM-CC on CoopHE; Proposition 4.3 derives that required inference calls grow as nK(K^H-1)/(K-1).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "CoLLM-DC fails to converge in long-horizon tasks due to non-stationarity from local-only critic conditioning.", 384 "evidence": "Figure 2d/2e show CoLLM-DC underperforming substantially on Minecraft tasks; the paper attributes this to 
non-stationarity accumulating across 4 turns when critics condition only on local history.", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "In dense-reward, short-horizon settings, all MARL methods achieve comparable performance.", 389 "evidence": "Table 1 writing tasks: CoLLM-CC 95.2%/95.0%, CoLLM-DC 95.4%/94.1%, MAGRPO 93.5%/93.1% on TLDR/arXiv respectively.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "MARL fine-tuning achieves comparable or better task performance than a comparable single larger model.", 394 "evidence": "Table 1 shows MARL methods matching/exceeding single-model baselines on writing, but single-model AC achieves highest health points in HouseBuild (55.9% vs CoLLM-CC 52.7%).", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Decentralized MARL reduces inference time and token cost compared to single-model approaches.", 399 "evidence": "Table 1: MAGRPO achieves 1.8s / 178 tokens/agent vs raw model 5.0s / 465 tokens on TLDR; the gap is attributed to shorter, coordination-optimized responses.", 400 "supported": "strong" 401 } 402 ], 403 "methodology_tags": [ 404 "benchmark-eval", 405 "theoretical" 406 ], 407 "key_findings": "Multi-Agent Actor-Critic methods with a centralized critic (CoLLM-CC) outperform Monte Carlo-based MARL (MAGRPO) and decentralized critic approaches (CoLLM-DC) in long-horizon, sparse-reward LLM collaboration tasks, while all methods converge comparably in dense-reward, short-horizon settings. Theoretical analysis (Propositions 4.1–4.3) explains these differences: MC methods suffer exponential sample growth with horizon, and decentralized critics accumulate non-stationarity. Decentralized MARL fine-tuning delivers faster inference and lower token costs than single larger models. 
Code, prompts, datasets, and hyperparameters are fully released.", 408 "red_flags": [ 409 { 410 "flag": "Tiny Minecraft test sets", 411 "detail": "StrBuild and HouseBuild use only 2 test examples each (indices 8:10), making performance estimates on these tasks highly unreliable despite 5-run averaging — 10 total observations per method." 412 }, 413 { 414 "flag": "No significance tests", 415 "detail": "Table 1 performance comparisons lack p-values or hypothesis tests; several differences between methods are small (1–3pp) and may not be statistically meaningful." 416 }, 417 { 418 "flag": "Proxy writing metrics unvalidated", 419 "detail": "Writing quality claims are based on automated proxies (Jaccard similarity, transition word frequency, length ratios) without human evaluation or validation that these correlate with human-judged quality." 420 }, 421 { 422 "flag": "Benchmark contamination unaddressed", 423 "detail": "CoopHE derives from HumanEval (2021) and MBPP (2021), which predate Qwen model training cutoffs; no discussion of whether base models have memorized these problems." 424 }, 425 { 426 "flag": "Proof-of-concept scale only", 427 "detail": "All models are 1.7B–4B parameters and task sets are small (up to 1000 training examples); the main text makes broad performance claims while the limitations section qualifies them as proof-of-concept only." 
428 } 429 ], 430 "cited_papers": [ 431 { 432 "title": "LLM Collaboration with Multi-Agent Reinforcement Learning (Liu et al., 2026a)", 433 "relevance": "Direct predecessor from same group proposing MAGRPO; this paper extends it with actor-critic methods" 434 }, 435 { 436 "title": "Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments (Lowe et al., 2017)", 437 "relevance": "Foundational MADDPG paper introducing centralized-critic training with decentralized execution" 438 }, 439 { 440 "title": "Counterfactual Multi-Agent Policy Gradients (Foerster et al., 2018)", 441 "relevance": "Key MARL paper introducing centralized critics for cooperative settings, directly cited for CTDE" 442 }, 443 { 444 "title": "On Centralized Critics in Multi-Agent Reinforcement Learning (Lyu et al., 2023)", 445 "relevance": "Theoretical analysis of centralized vs decentralized critics that directly informs this work's analysis" 446 }, 447 { 448 "title": "The Surprising Effectiveness of PPO in Cooperative Multi-Agent Games (Yu et al., 2022)", 449 "relevance": "MAPPO: centralized-critic MARL approach forming part of the theoretical backdrop" 450 }, 451 { 452 "title": "DeepSeek-R1: Incentivizes Reasoning in LLMs through Reinforcement Learning (Guo et al., 2025)", 453 "relevance": "Key reference for RLVR training paradigm used in this work" 454 }, 455 { 456 "title": "ChatDev: Communicative Agents for Software Development (Qian et al., 2024)", 457 "relevance": "Representative multi-agent LLM collaboration framework used as comparison baseline" 458 }, 459 { 460 "title": "Evaluating Large Language Models Trained on Code (Chen et al., 2021)", 461 "relevance": "HumanEval benchmark — source material for CoopHE coding collaboration dataset" 462 } 463 ], 464 "engagement_factors": { 465 "practical_relevance": { 466 "score": 2, 467 "justification": "Released code enables practitioners to apply MAAC to multi-LLM collaboration tasks, though implementation requires ML expertise and 
proof-of-concept scale limits confidence." 468 }, 469 "surprise_contrarian": { 470 "score": 1, 471 "justification": "Centralized-critic advantages in MARL are well-established; applying this to LLM fine-tuning confirms expected behavior rather than overturning conventional wisdom." 472 }, 473 "fear_safety": { 474 "score": 0, 475 "justification": "No safety or risk concerns are raised; the paper focuses purely on performance optimization of collaborative agents." 476 }, 477 "drama_conflict": { 478 "score": 1, 479 "justification": "The paper positions against centralized execution protocols dominant in LLM collaboration, arguing for decentralized alternatives, but without major controversy." 480 }, 481 "demo_ability": { 482 "score": 2, 483 "justification": "Code and datasets are publicly released on GitHub with versioned releases; Minecraft building demos are visually compelling and runnable." 484 }, 485 "brand_recognition": { 486 "score": 1, 487 "justification": "Northeastern University is a solid research institution but not a top-tier AI lab; no famous models or products are involved." 488 } 489 }, 490 "hn_data": { 491 "threads": [ 492 { 493 "hn_id": "46822095", 494 "title": "Addressing Asymptomatic AI Harms for Dignified Human-AI Interaction", 495 "points": 1, 496 "comments": 0, 497 "url": "https://news.ycombinator.com/item?id=46822095", 498 "created_at": "2026-01-30T08:59:56Z" 499 } 500 ], 501 "top_points": 1, 502 "total_points": 1, 503 "total_comments": 0 504 } 505 }