scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25408B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Learning Decentralized LLM Collaboration with Multi-Agent Actor Critic",
      6     "authors": [
      7       "Shuo Liu",
      8       "Tianle Chen",
      9       "Ryan Amiri",
     10       "Christopher Amato"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2601.21972",
     15     "doi": "10.48550/arXiv.2601.21972"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Abstract claims about MAGRPO/CoLLM-DC matching CoLLM-CC in dense-reward settings and underperforming in sparse-reward/long-horizon settings are directly supported by Table 1 and Figure 2 results across all five tasks.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Causal claims about CoLLM-CC outperforming alternatives are backed by controlled comparisons with matched hyperparameter budgets across 5 runs; theoretical propositions (4.1–4.3) provide mechanistic justification.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The conclusion broadly claims 'MARL-based methods can achieve equal or better performance than a single larger model' from proof-of-concept experiments with 2–100 test examples per task using 1.7B–4B models; limitations acknowledge this but the main text overstates generalizability.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "CoLLM-CC's advantages are attributed solely to variance reduction and critic stationarity; alternative explanations such as critic model capacity, specific reward shaping choices, or task-specific artifacts are not considered.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper equates high reward with 'high-quality content' (Section 6.3) but writing quality is measured via automated proxies (Jaccard similarity, transition word frequency, length ratios) without validation that these proxies correlate with actual human-judged quality.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "Appendix I contains a dedicated Limitations and Future Work section discussing proof-of-concept scale, compute constraints, CoLLM-DC parameter-sharing trade-offs, and open questions about scaling.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "Limitations reference 'proof-of-concept settings' and compute constraints broadly but do not specifically address the 2-example Minecraft test sets, the validity of automated writing proxies, absence of significance tests, or benchmark contamination.",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "The paper explicitly states that scaling to larger, more heterogeneous multi-agent systems remains open, and that the strictly decentralized no-communication assumption bounds the applicable settings.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "NSF grants (#2044993, #2409351, and multiple others) and computing grants via NCAR and Lambda's Research Grant Program are disclosed in the Acknowledgment section.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "All four authors are affiliated with Northeastern University, Boston, MA, disclosed on the title page.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "NSF is an independent government funder with no commercial stake in the MARL framework being evaluated.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests, patent, or financial interests statement appears anywhere in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Decentralized LLM collaboration (Section 3.1), LLM Dec-POMDP (Section 3.2), and actor-critic methods including DC and CC variants (Section 4.2) are formally defined with mathematical notation.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Three contributions are explicitly enumerated in the introduction: MAAC methods for decentralized LLM collaboration, two concrete algorithms (CoLLM-CC and CoLLM-DC), and empirical validation across three domains.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section 2 situates the work relative to LLM collaboration frameworks with predefined protocols and MARL actor-critic literature, explaining how this work extends beyond both.",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": true,
    123           "justification": "Code released at https://github.com/OpenMLRL/CoMLRL/releases/tag/v1.3.6 and three task-specific repositories (writing, coding, Minecraft) all at version 1.3.6.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": true,
    129           "justification": "Public datasets (TLDR, arXiv, HumanEval, MBPP) are used, and the novel CoopHE dataset is available through the GitHub repositories listed in Appendix H.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": false,
    135           "justification": "Hardware specs and model names are listed in Appendix G but no requirements.txt, Dockerfile, or pip dependency list is provided in the paper or referenced for the repositories.",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": true,
    141           "justification": "GitHub repositories are linked; detailed hyperparameters (Appendix C.3), model architectures (C.2), dataset splits (C.1), prompts (E), and reward functions (F) provide sufficient detail to reconstruct experiments.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": true,
    149           "justification": "Figure 2 explicitly displays 95% bootstrapped confidence intervals as shaded regions around all learning curves for all three methods.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests (t-tests, Mann-Whitney, etc.) are reported for performance comparisons in Table 1; results are averaged over 5 runs without p-values.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Table 1 reports absolute performance percentages for all methods, enabling direct comparison of effect magnitudes with baseline context (e.g., CoLLM-CC 75.2% vs MAGRPO 74.3% vs CoLLM-DC 59.1% on CoopHE).",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The choice of 5 runs is not justified by power analysis; critically, Minecraft test sets contain only 2 examples each (StrBuild[8:10], HouseBuild[8:10]) with no justification for this size.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": true,
    173           "justification": "Figure 2 shows 95% bootstrapped confidence intervals for all learning curves; Table 1 results are described as averaged over 5 runs.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Three baseline categories are included: single-model (raw + fine-tuned), multi-agent test-time interaction (parallel, pipeline, discussion), and MARL (MAGRPO).",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": true,
    187           "justification": "MAGRPO (2026), current Qwen3 models, and recent frameworks are used; baselines are drawn from the same model families as the proposed methods.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Comparison between CoLLM-CC (centralized critic) and CoLLM-DC (decentralized critics) is a direct ablation of the key design choice, and MAGRPO vs MAAC ablates the Monte Carlo vs actor-critic distinction.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": true,
    199           "justification": "Table 1 reports response time, token cost, and task-specific performance; additional metrics include pass@k, IoU, adjacency rate, health points, and training overhead (Table 3).",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "No human evaluation of generated outputs is performed; writing quality is assessed entirely via automated metrics without human judgment validation.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": true,
    210           "answer": true,
    211           "justification": "Appendix C.1 specifies separate training and test set indices for all tasks (e.g., TLDR[0:1000] train vs TLDR[1000:1100] test).",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table 1 provides per-dataset breakdowns across all 5 task variants; Table 2 provides pass@k breakdowns for CoopHE; Figure 2 shows per-task learning curves.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "CoLLM-DC's failure to converge on Minecraft tasks and MAGRPO's sample inefficiency are discussed with mechanistic attribution (non-stationarity accumulation, reward sparsity).",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Negative results reported: CoLLM-DC fails to converge on long-horizon tasks, and prompt-based multi-agent methods underperform single-model baselines without MARL optimization.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "Exact model names provided: Qwen3-1.7B, Qwen2.5-Coder-3B, Qwen3-4B-Instruct-2507, Qwen2.5-3B-Instruct, Qwen2.5-7B Coder and Instruct variants.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "Appendix E provides complete verbatim prompts for all task types: TLDR summarization, arXiv expansion, coding collaboration, StrBuild, and HouseBuild agents.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Appendix C.3 provides comprehensive hyperparameters per task: learning rates, temperature, top-p, buffer sizes, epoch counts, advantage clip, and evaluation sample counts.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The MAAC framework is described via formal pseudocode (Algorithms 1 and 2) with detailed rollout, replay buffer, and training phase descriptions for both CC and DC variants.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Dataset splits are documented in Appendix C.1; CoopHE construction from HumanEval/MBPP selection criteria is described in Section 6.1; reward computation pipelines are in Appendix F.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Public datasets (TLDR, arXiv) are accessible via standard sources, and CoopHE is available through GitHub repositories listed in Appendix H.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "CoopHE construction is described: problems requiring cooperative decomposition were selected from HumanEval/MBPP with the auxiliary function named 'aux' and the main function signature provided in the prompt.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; all experiments use automated benchmark evaluation on pre-existing or constructed datasets.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": true,
    287           "justification": "The full pipeline from dataset splits (Appendix C.1) through reward computation (Appendix F) to evaluation metrics is documented across the appendices.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "Training data cutoffs for Qwen3 and Qwen2.5 models are not stated; this is directly relevant since CoopHE derives from HumanEval/MBPP which likely appear in pre-training corpora.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of potential overlap between the pre-trained models' training data and benchmark tasks (HumanEval, MBPP) used in evaluation.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "HumanEval (2021) and MBPP (2021) predate Qwen model training cutoffs, making contamination plausible; this is not addressed or acknowledged in the paper.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": true,
    359           "justification": "Table 1 reports response time (seconds on RTX 5090) and token cost (tokens/agent/turn) for all methods across all five tasks.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": true,
    365           "justification": "Appendix G lists all hardware used for training and inference; Appendix D.2 provides training overhead in H200 hours, VRAM usage in GB, and sample/update counts for all methods.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "CoLLM-CC consistently outperforms MAGRPO and CoLLM-DC in long-horizon, sparse-reward tasks.",
    374       "evidence": "Table 1 shows CoLLM-CC achieving 75.2% pass rate on CoopHE vs MAGRPO 74.3% and CoLLM-DC 59.1%; 68.5% IoU on StrBuild vs 50.6% and 44.6%; 52.7% on HouseBuild vs 50.9% and 46.8%.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Monte Carlo methods require substantially more samples to converge in sparse-reward and long-horizon settings.",
    379       "evidence": "Figure 2c shows MAGRPO reaching stability at ~5000 timesteps vs ~2000 for CoLLM-CC on CoopHE; Proposition 4.3 derives that required inference calls grow as nK(K^H-1)/(K-1).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "CoLLM-DC fails to converge in long-horizon tasks due to non-stationarity from local-only critic conditioning.",
    384       "evidence": "Figure 2d/2e show CoLLM-DC underperforming substantially on Minecraft tasks; the paper attributes this to non-stationarity accumulating across 4 turns when critics condition only on local history.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "In dense-reward, short-horizon settings, all MARL methods achieve comparable performance.",
    389       "evidence": "Table 1 writing tasks: CoLLM-CC 95.2%/95.0%, CoLLM-DC 95.4%/94.1%, MAGRPO 93.5%/93.1% on TLDR/arXiv respectively.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "MARL fine-tuning achieves comparable or better task performance than a comparable single larger model.",
    394       "evidence": "Table 1 shows MARL methods matching/exceeding single-model baselines on writing, but single-model AC achieves the highest health points in HouseBuild (55.9% vs CoLLM-CC 52.7%).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "Decentralized MARL reduces inference time and token cost compared to single-model approaches.",
    399       "evidence": "Table 1: MAGRPO achieves 1.8s / 178 tokens/agent vs raw model 5.0s / 465 tokens on TLDR; the paper attributes this to shorter, coordination-optimized responses.",
    400       "supported": "strong"
    401     }
    402   ],
    403   "methodology_tags": [
    404     "benchmark-eval",
    405     "theoretical"
    406   ],
    407   "key_findings": "Multi-Agent Actor-Critic methods with a centralized critic (CoLLM-CC) outperform Monte Carlo-based MARL (MAGRPO) and decentralized critic approaches (CoLLM-DC) in long-horizon, sparse-reward LLM collaboration tasks, while all methods converge comparably in dense-reward, short-horizon settings. Theoretical analysis (Propositions 4.1–4.3) explains these differences: MC methods suffer exponential sample growth with horizon, and decentralized critics accumulate non-stationarity. Decentralized MARL fine-tuning delivers faster inference and lower token costs than single larger models. Code, prompts, datasets, and hyperparameters are fully released.",
    408   "red_flags": [
    409     {
    410       "flag": "Tiny Minecraft test sets",
    411       "detail": "StrBuild and HouseBuild use only 2 test examples each (indices 8:10), making performance estimates on these tasks highly unreliable despite 5-run averaging — 10 total observations per method."
    412     },
    413     {
    414       "flag": "No significance tests",
    415       "detail": "Table 1 performance comparisons lack p-values or hypothesis tests; several differences between methods are small (1–3pp) and may not be statistically meaningful."
    416     },
    417     {
    418       "flag": "Proxy writing metrics unvalidated",
    419       "detail": "Writing quality claims are based on automated proxies (Jaccard similarity, transition word frequency, length ratios) without human evaluation or validation that these correlate with human-judged quality."
    420     },
    421     {
    422       "flag": "Benchmark contamination unaddressed",
    423       "detail": "CoopHE derives from HumanEval (2021) and MBPP (2021), which predate Qwen model training cutoffs; no discussion of whether base models have memorized these problems."
    424     },
    425     {
    426       "flag": "Proof-of-concept scale only",
    427       "detail": "All models are 1.7B–4B parameters and task sets are small (up to 1000 training examples); the main text makes broad performance claims while the limitations section hedges to proof-of-concept."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "LLM Collaboration with Multi-Agent Reinforcement Learning (Liu et al., 2026a)",
    433       "relevance": "Direct predecessor from same group proposing MAGRPO; this paper extends it with actor-critic methods"
    434     },
    435     {
    436       "title": "Multi-Agent Actor-Critic for Mixed Cooperative-Competitive Environments (Lowe et al., 2017)",
    437       "relevance": "Foundational MADDPG paper introducing centralized-critic training with decentralized execution"
    438     },
    439     {
    440       "title": "Counterfactual Multi-Agent Policy Gradients (Foerster et al., 2018)",
    441       "relevance": "Key MARL paper introducing centralized critics for cooperative settings, directly cited for CTDE"
    442     },
    443     {
    444       "title": "On Centralized Critics in Multi-Agent Reinforcement Learning (Lyu et al., 2023)",
    445       "relevance": "Theoretical analysis of centralized vs decentralized critics that directly informs this work's analysis"
    446     },
    447     {
    448       "title": "The Surprising Effectiveness of PPO in Cooperative Multi-Agent Games (Yu et al., 2022)",
    449       "relevance": "MAPPO: centralized-critic MARL approach forming part of the theoretical backdrop"
    450     },
    451     {
    452       "title": "DeepSeek-R1: Incentivizes Reasoning in LLMs through Reinforcement Learning (Guo et al., 2025)",
    453       "relevance": "Key reference for RLVR training paradigm used in this work"
    454     },
    455     {
    456       "title": "ChatDev: Communicative Agents for Software Development (Qian et al., 2024)",
    457       "relevance": "Representative multi-agent LLM collaboration framework used as comparison baseline"
    458     },
    459     {
    460       "title": "Evaluating Large Language Models Trained on Code (Chen et al., 2021)",
    461       "relevance": "HumanEval benchmark — source material for CoopHE coding collaboration dataset"
    462     }
    463   ],
    464   "engagement_factors": {
    465     "practical_relevance": {
    466       "score": 2,
    467       "justification": "Released code enables practitioners to apply MAAC to multi-LLM collaboration tasks, though implementation requires ML expertise and proof-of-concept scale limits confidence."
    468     },
    469     "surprise_contrarian": {
    470       "score": 1,
    471       "justification": "Centralized-critic advantages in MARL are well-established; applying this to LLM fine-tuning confirms expected behavior rather than overturning conventional wisdom."
    472     },
    473     "fear_safety": {
    474       "score": 0,
    475       "justification": "No safety or risk concerns are raised; the paper focuses purely on performance optimization of collaborative agents."
    476     },
    477     "drama_conflict": {
    478       "score": 1,
    479       "justification": "The paper positions against centralized execution protocols dominant in LLM collaboration, arguing for decentralized alternatives, but without major controversy."
    480     },
    481     "demo_ability": {
    482       "score": 2,
    483       "justification": "Code and datasets are publicly released on GitHub with versioned releases; Minecraft building demos are visually compelling and runnable."
    484     },
    485     "brand_recognition": {
    486       "score": 1,
    487       "justification": "Northeastern University is a solid research institution but not a top-tier AI lab; no famous models or products are involved."
    488     }
    489   },
    490   "hn_data": {
    491     "threads": [
    492       {
    493         "hn_id": "46822095",
    494         "title": "Addressing Asymptomatic AI Harms for Dignified Human-AI Interaction",
    495         "points": 1,
    496         "comments": 0,
    497         "url": "https://news.ycombinator.com/item?id=46822095",
    498         "created_at": "2026-01-30T08:59:56Z"
    499       }
    500     ],
    501     "top_points": 1,
    502     "total_points": 1,
    503     "total_comments": 0
    504   }
    505 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs