scan-v5.json (26719B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "LLM-Coordination: Evaluating and Analyzing Multi-agent Coordination Abilities in Large Language Models", 6 "authors": [ 7 "Saaket Agashe", 8 "Yue Fan", 9 "Anthony Reyna", 10 "Xin Eric Wang" 11 ], 12 "year": 2023, 13 "venue": "North American Chapter of the Association for Computational Linguistics", 14 "arxiv_id": "2310.03903", 15 "doi": "10.18653/v1/2025.findings-naacl.448" 16 }, 17 "checklist": { 18 "claims_and_evidence": { 19 "abstract_claims_supported": { 20 "applies": true, 21 "answer": true, 22 "justification": "All abstract claims are backed by results: environmental coordination strength (Tables 1–2), ToM deficits (Table 3, Figure 3), joint planning weaknesses (Figure 3), and ZSC robustness (Tables 4–5).", 23 "source": "haiku" 24 }, 25 "causal_claims_justified": { 26 "applies": true, 27 "answer": true, 28 "justification": "Ablation study (Table 6) removes ToM reasoning and verification steps independently, providing adequate design to support the causal claim that each component improves Hanabi performance.", 29 "source": "haiku" 30 }, 31 "generalization_bounded": { 32 "applies": true, 33 "answer": true, 34 "justification": "Claims are scoped to 'pure coordination games' throughout; the conclusion explicitly states findings underscore areas for improvement in pure coordination setups, not general AI coordination.", 35 "source": "haiku" 36 }, 37 "alternative_explanations_discussed": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper attributes Overcooked success to full observability and Hanabi failure to ToM demands, but does not consider alternative explanations such as training data overlap with game rules or prompt sensitivity effects.", 41 "source": "haiku" 42 }, 43 "proxy_outcome_distinction": { 44 "applies": true, 45 "answer": true, 46 "justification": "Game scores (delivery points, card sequences, escape rates) are used directly as coordination performance metrics; the paper treats these as direct measures of coordination ability within well-defined game objectives without conflating them with broader constructs.", 47 "source": "haiku" 48 } 49 }, 50 "limitations_and_scope": { 51 "limitations_section_present": { 52 "applies": true, 53 "answer": true, 54 "justification": "Section 8 is a dedicated Limitations section covering three specific issues: latency/compute, prompt configuration, and manual curation of CoordQA.", 55 "source": "haiku" 56 }, 57 "threats_to_validity_specific": { 58 "applies": true, 59 "answer": true, 60 "justification": "Limitations are specific: e.g., effective reasoning requires GPT-4-turbo scale, prompt quality affects results, and manual edge-case curation limits scalability — not boilerplate disclaimers.", 61 "source": "haiku" 62 }, 63 "scope_boundaries_stated": { 64 "applies": true, 65 "answer": true, 66 "justification": "The paper explicitly restricts scope to pure coordination games (no mixed incentives), distinguishes this from multi-LLM orchestration work, and notes that performance improvements via prompt engineering are left to future work.", 67 "source": "haiku" 68 } 69 }, 70 "conflicts_of_interest": { 71 "funding_disclosed": { 72 "applies": true, 73 "answer": true, 74 "justification": "Section 7 discloses funding from the Microsoft Accelerate Foundation Models Research (AFMR) grant program.", 75 "source": "haiku" 76 }, 77 "affiliations_disclosed": { 78 "applies": true, 79 "answer": true, 80 "justification": "All four authors list University of California, Santa Cruz affiliations with email addresses in the paper header.", 81 "source": "haiku" 82 }, 83 "funder_independent_of_outcome": { 84 "applies": true, 85 "answer": false, 86 "justification": "Microsoft funded the research; Microsoft holds a major investment in OpenAI, whose GPT-4 models are the primary beneficiary of the paper's positive findings, creating a financial interest in the outcome.", 87 "source": "haiku" 88 }, 89 "financial_interests_declared": { 90 "applies": true, 91 "answer": false, 92 "justification": "No competing interests statement or financial disclosure beyond the funding acknowledgment appears in the paper.", 93 "source": "haiku" 94 } 95 }, 96 "scope_and_framing": { 97 "key_terms_defined": { 98 "applies": true, 99 "answer": true, 100 "justification": "Pure Coordination Games, Theory of Mind, Environment Comprehension, Joint Planning, and Zero-Shot Coordination are all explicitly defined in context in Sections 1 and 3.", 101 "source": "haiku" 102 }, 103 "intended_contribution_clear": { 104 "applies": true, 105 "answer": true, 106 "justification": "Three contributions are explicitly enumerated at the end of the introduction: the benchmark, the holistic LLM-vs-RL evaluation, and the component-level CoordQA analysis.", 107 "source": "haiku" 108 }, 109 "engagement_with_prior_work": { 110 "applies": true, 111 "answer": true, 112 "justification": "Related Work (Section 2) differentiates this work from multi-LLM orchestration frameworks (MetaGPT, ChatDev) by focusing on innate coordination ability of individual LLMs in established pure coordination benchmarks.", 113 "source": "haiku" 114 } 115 } 116 }, 117 "type_checklist": { 118 "empirical": { 119 "artifacts": { 120 "code_released": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract explicitly links to https://github.com/eric-ai-lab/llm_coordination, a GitHub repository for the benchmark and agent implementations.", 124 "source": "haiku" 125 }, 126 "data_released": { 127 "applies": true, 128 "answer": true, 129 "justification": "CoordQA (198 manually curated questions) is released with the code; Hanabi and Overcooked-AI are established public benchmarks available independently.", 130 "source": "haiku" 131 }, 132 "environment_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "No requirements.txt, Dockerfile, or dependency specification is mentioned in the paper; only a GitHub link is provided without explicit environment details.", 136 "source": "haiku" 137 }, 138 "reproduction_instructions": { 139 "applies": true, 140 "answer": false, 141 "justification": "No step-by-step reproduction instructions appear in the paper; the appendices detail prompts and game setups but not how to run experiments end-to-end.", 142 "source": "haiku" 143 } 144 }, 145 "statistical_methodology": { 146 "confidence_intervals_or_error_bars": { 147 "applies": true, 148 "answer": true, 149 "justification": "Tables 1, 2, 3, and 5 all report ± values alongside mean scores across trials.", 150 "source": "haiku" 151 }, 152 "significance_tests": { 153 "applies": true, 154 "answer": false, 155 "justification": "No statistical significance tests (t-tests, ANOVA, etc.) are used to support comparative claims between LLMs and RL baselines, despite substantial comparative claims throughout.", 156 "source": "haiku" 157 }, 158 "effect_sizes_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Absolute performance differences are reported (e.g., GPT-4-turbo 13.33 vs GPT-3.5 1.33 in Hanabi; 80% vs 33% escape rates) providing meaningful context for comparison.", 162 "source": "haiku" 163 }, 164 "sample_size_justified": { 165 "applies": true, 166 "answer": false, 167 "justification": "Three trials per model-scenario combination is stated as the protocol but is not justified via power analysis or discussion of why three trials is sufficient for the variance observed.", 168 "source": "haiku" 169 }, 170 "variance_reported": { 171 "applies": true, 172 "answer": true, 173 "justification": "Standard deviations (±) are reported across all main result tables (Tables 1–5), though it is unclear if these are across seeds or trials.", 174 "source": "haiku" 175 } 176 }, 177 "evaluation_design": { 178 "baselines_included": { 179 "applies": true, 180 "answer": true, 181 "justification": "MARL baselines (PPO, PBT for Overcooked; BAD, SAD, OBL for Hanabi) and greedy baselines for Collab games are included throughout.", 182 "source": "haiku" 183 }, 184 "baselines_contemporary": { 185 "applies": true, 186 "answer": true, 187 "justification": "Baselines include state-of-the-art MARL methods for each game (OBL, HSP, PBT) from 2021–2023, appropriate to the paper's timeframe.", 188 "source": "haiku" 189 }, 190 "ablation_study": { 191 "applies": true, 192 "answer": true, 193 "justification": "Table 6 (Appendix C.4) and Table 3 ablate ToM reasoning and answer verification steps independently for the Hanabi LLM agent, showing the contribution of each component.", 194 "source": "haiku" 195 }, 196 "multiple_metrics": { 197 "applies": true, 198 "answer": true, 199 "justification": "Overcooked uses delivery score, Hanabi uses card sequence count, Collab games report success rate and average turns — multiple metrics across evaluation tasks.", 200 "source": "haiku" 201 }, 202 "human_evaluation": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human evaluation of system outputs; the study uses automated game environments and MCQ accuracy with ground-truth labels.", 206 "source": "haiku" 207 }, 208 "held_out_test_set": { 209 "applies": true, 210 "answer": false, 211 "justification": "All 198 CoordQA questions constitute both the benchmark and the evaluation set; there is no held-out portion, and Agentic Coordination uses the full game environments without a separate test split.", 212 "source": "haiku" 213 }, 214 "per_category_breakdown": { 215 "applies": true, 216 "answer": true, 217 "justification": "Figure 3 breaks CoordQA results by EC/ToM/JP category for each model; Tables 1–3 break agentic results by game and layout.", 218 "source": "haiku" 219 }, 220 "failure_cases_discussed": { 221 "applies": true, 222 "answer": true, 223 "justification": "Specific failure modes are discussed: LLMs 'bombing' in Hanabi (losing all lives), open-source LLMs performing below random on joint planning, and GPT-3.5 failing Collab games.", 224 "source": "haiku" 225 }, 226 "negative_results_reported": { 227 "applies": true, 228 "answer": true, 229 "justification": "The paper clearly reports that most LLMs fail badly at Hanabi compared to RL, that open-source models score below random on joint planning, and that ToM reasoning in CollabCapture shows minimal benefit.", 230 "source": "haiku" 231 } 232 }, 233 "setup_transparency": { 234 "model_versions_specified": { 235 "applies": true, 236 "answer": true, 237 "justification": "Specific model versions are named: gpt-4-0125-preview, GPT-3.5-turbo-0125, Mixtral 8x7B, and GPT-4o are all explicitly identified in Section 4.1.1.", 238 "source": "haiku" 239 }, 240 "prompts_provided": { 241 "applies": true, 242 "answer": true, 243 "justification": "Full verbatim prompts for Overcooked (Appendix A), Hanabi (Appendix B), ToM reasoning step, and answer verification step are all provided in the appendices.", 244 "source": "haiku" 245 }, 246 "hyperparameters_reported": { 247 "applies": true, 248 "answer": false, 249 "justification": "Temperature, top-p, max tokens, and other API hyperparameters are not reported anywhere in the paper.", 250 "source": "haiku" 251 }, 252 "scaffolding_described": { 253 "applies": true, 254 "answer": true, 255 "justification": "The agentic framework is described in detail in Section 4.1.1: Memory (long-term, working, episodic), Reasoning (LLM), and Grounding modules with game-specific implementations.", 256 "source": "haiku" 257 }, 258 "data_preprocessing_documented": { 259 "applies": true, 260 "answer": true, 261 "justification": "Appendix A describes distance preprocessing for Overcooked states (shortest-path distances replacing grid coordinates); Appendix B describes Hanabi knowledge state construction.", 262 "source": "haiku" 263 } 264 }, 265 "data_integrity": { 266 "raw_data_available": { 267 "applies": true, 268 "answer": true, 269 "justification": "CoordQA questions and game environments are released via GitHub; game interaction logs are implicitly available through the open-source code and publicly available game simulators.", 270 "source": "haiku" 271 }, 272 "data_collection_described": { 273 "applies": true, 274 "answer": true, 275 "justification": "Section 3.2 and Appendix D describe the process: 66 edge-case scenarios manually sampled from games, 3 question types per scenario, filtered for ambiguity, yielding 198 questions.", 276 "source": "haiku" 277 }, 278 "recruitment_methods_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants were recruited; evaluations use automated game environments and LLM API calls.", 282 "source": "haiku" 283 }, 284 "data_pipeline_documented": { 285 "applies": true, 286 "answer": true, 287 "justification": "The pipeline from game scenario sampling to question construction to MCQ evaluation with fuzzy string matching is described across Sections 3.2, 4.2, and Appendix D.", 288 "source": "haiku" 289 } 290 }, 291 "contamination": { 292 "training_cutoff_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No training data cutoff dates are stated for any of the evaluated models (GPT-4-turbo, GPT-3.5, Mixtral, GPT-4o).", 296 "source": "haiku" 297 }, 298 "train_test_overlap_discussed": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper does not discuss whether game rules, strategies, or CoordQA-style questions could have appeared in LLM training data, despite Hanabi and Overcooked being publicly documented games.", 302 "source": "haiku" 303 }, 304 "benchmark_contamination_addressed": { 305 "applies": true, 306 "answer": false, 307 "justification": "Hanabi and Overcooked rules and strategies are publicly available and likely present in LLM pretraining data; this potential contamination is never addressed.", 308 "source": "haiku" 309 } 310 }, 311 "human_studies": { 312 "pre_registered": { 313 "applies": false, 314 "answer": false, 315 "justification": "No human participants in this study.", 316 "source": "haiku" 317 }, 318 "irb_or_ethics_approval": { 319 "applies": false, 320 "answer": false, 321 "justification": "No human participants in this study.", 322 "source": "haiku" 323 }, 324 "demographics_reported": { 325 "applies": false, 326 "answer": false, 327 "justification": "No human participants in this study.", 328 "source": "haiku" 329 }, 330 "inclusion_exclusion_criteria": { 331 "applies": false, 332 "answer": false, 333 "justification": "No human participants in this study.", 334 "source": "haiku" 335 }, 336 "randomization_described": { 337 "applies": false, 338 "answer": false, 339 "justification": "No human participants in this study.", 340 "source": "haiku" 341 }, 342 "blinding_described": { 343 "applies": false, 344 "answer": false, 345 "justification": "No human participants in this study.", 346 "source": "haiku" 347 }, 348 "attrition_reported": { 349 "applies": false, 350 "answer": false, 351 "justification": "No human participants in this study.", 352 "source": "haiku" 353 } 354 }, 355 "cost_and_practicality": { 356 "inference_cost_reported": { 357 "applies": true, 358 "answer": false, 359 "justification": "The limitations mention 'significant latency and substantial computational resources' qualitatively, and note GPT-4-turbo cross-play was limited to single trials due to cost, but no dollar or token cost figures are provided.", 360 "source": "haiku" 361 }, 362 "compute_budget_stated": { 363 "applies": true, 364 "answer": false, 365 "justification": "No total compute budget, API call counts, or GPU hours are reported anywhere in the paper.", 366 "source": "haiku" 367 } 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "Zero-shot LLM agents (GPT-4-turbo) match or outperform self-play RL baselines (PPO, PBT) in Overcooked-AI coordination tasks.", 374 "evidence": "Table 1 shows GPT-4-turbo outperforming PPO/PBT in 3 of 5 layouts (e.g., Asymmetric Advantages: 260 vs 190.1); Table 4 shows it outperforming BC and PPOBC in zero-shot coordination.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "LLM agents significantly underperform specialized RL methods (BAD, SAD, OBL) on Hanabi due to Theory of Mind reasoning demands.", 379 "evidence": "Table 3: best LLM (GPT-4-turbo) scores 13.33 vs RL methods scoring 23–24; without ToM+Verification, score drops to 4.33 with 100% bomb rate.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "LLMs perform well at Environment Comprehension but fail at Joint Planning, with most models scoring below random on JP questions.", 384 "evidence": "Figure 3: GPT-4-turbo achieves >80% EC accuracy but <40% JP accuracy; open-source models (Vicuna, Mistral) score below random baseline on JP.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Adding explicit ToM reasoning and answer verification steps substantially improves LLM coordination performance in Hanabi.", 389 "evidence": "Table 6: GPT-4-turbo with ToM+Verification scores 13.33 vs 10.33 (Verification only) vs 4.33 (neither); bomb rate goes from 100% to 0% with Verification.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "LLM agents exhibit zero-shot coordination robustness to unseen partners, unlike self-play RL methods.", 394 "evidence": "Table 5: GPT-4-turbo scores 15.00 cross-play with OBL-1 vs 13.66 self-play; SAD degrades from 23.66 self-play to 11.33 cross-play. Table 4 shows LLM outperforming BC/PPOBC with human proxies.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "CoordQA performance correlates with agentic coordination performance, validating the benchmark as a diagnostic tool.", 399 "evidence": "Figure 2 reports high Pearson correlations (0.85–0.98 for EC vs game performance) across 4 LLMs, but only 4 data points makes these correlations statistically unreliable.", 400 "supported": "weak" 401 } 402 ], 403 "methodology_tags": [ 404 "benchmark-eval", 405 "observational" 406 ], 407 "key_findings": "The LLM-Coordination Benchmark reveals a clear capability split: LLM agents (especially GPT-4-turbo) match or exceed RL baselines in fully-observable, environment-focused coordination tasks (Overcooked) without any task-specific training, but substantially underperform in Hanabi where active Theory of Mind reasoning is required. CoordQA dissects why — LLMs are competent at environment comprehension but most models, including GPT-4, score below 40% on joint planning, with open-source models performing worse than random. A critical practical finding is that ToM reasoning and answer verification steps are necessary scaffolding components for LLM coordination: without them, GPT-4-turbo bombs every Hanabi game. Unlike self-play RL methods, LLM agents maintain performance with unseen partners, making them naturally robust for zero-shot coordination settings.", 408 "red_flags": [ 409 { 410 "flag": "4-point correlation", 411 "detail": "Pearson correlations in Figure 2 are computed across only 4 LLM models, making all reported correlation coefficients (r=0.88, 0.95, etc.) statistically meaningless — you cannot establish a correlation pattern with 4 data points." 412 }, 413 { 414 "flag": "Funder conflict", 415 "detail": "Microsoft (AFMR grant) funds research that shows Microsoft-invested OpenAI GPT-4 models outperforming all competitors; no competing interests statement addresses this." 416 }, 417 { 418 "flag": "Inconsistent trial counts", 419 "detail": "GPT-4-turbo cross-play in Overcooked uses only a single trial 'due to cost and time constraints' (footnote 1), while other conditions use 3 trials, undermining the reliability of those specific comparisons." 420 }, 421 { 422 "flag": "No significance tests", 423 "detail": "Comparative claims (LLMs vs RL, model vs model) are made throughout without any statistical significance testing, despite high variance in some conditions (e.g., Mixtral ±14.40 in Overcooked)." 424 }, 425 { 426 "flag": "Contamination unaddressed", 427 "detail": "Hanabi and Overcooked rules, strategies, and game-theoretic analysis are extensively documented online and likely present in all tested LLMs' pretraining data; this potential advantage over RL baselines is never discussed." 428 } 429 ], 430 "cited_papers": [ 431 { 432 "title": "The Hanabi Challenge: A New Frontier for AI Research", 433 "relevance": "Primary benchmark game; provides RL baselines (BAD, SAD, OBL) and evaluation protocol" 434 }, 435 { 436 "title": "On the Utility of Learning about Humans for Human-AI Coordination (Overcooked-AI)", 437 "relevance": "Primary benchmark environment and source of PPO/BC/PPOBC baselines for zero-shot coordination comparison" 438 }, 439 { 440 "title": "Off-Belief Learning", 441 "relevance": "State-of-the-art cross-play method for Hanabi used as unseen partner in ZSC experiments" 442 }, 443 { 444 "title": "Cognitive Architectures for Language Agents", 445 "relevance": "Design framework for the LLM agent scaffolding (Memory, Reasoning, Grounding)" 446 }, 447 { 448 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 449 "relevance": "Reasoning strategy used in agent design; key prior work on LLM agents" 450 }, 451 { 452 "title": "Theory of Mind for Multi-Agent Collaboration via Large Language Models", 453 "relevance": "Direct prior work on ToM in LLM multi-agent settings; differentiates via explicit belief representations" 454 }, 455 { 456 "title": "Theory of Mind Might Have Spontaneously Emerged in Large Language Models", 457 "relevance": "Motivating claim for using LLMs in coordination; Kosinski's ToM emergence hypothesis" 458 }, 459 { 460 "title": "Learning Zero-Shot Cooperation with Humans, Assuming Humans are Biased (HSP)", 461 "relevance": "Competitive baseline for zero-shot coordination in Overcooked; LLM matches HSP performance" 462 } 463 ], 464 "engagement_factors": { 465 "practical_relevance": { 466 "score": 2, 467 "justification": "Practitioners building multi-agent LLM systems get actionable findings: LLMs need explicit ToM scaffolding, excel at observable coordination, and work well with unseen partners." 468 }, 469 "surprise_contrarian": { 470 "score": 2, 471 "justification": "Zero-shot LLMs matching trained RL in Overcooked is surprising; LLMs failing at joint planning worse than random challenges assumptions about emergent reasoning." 472 }, 473 "fear_safety": { 474 "score": 1, 475 "justification": "Mild safety relevance: unreliable coordination agents (100% bomb rate without scaffolding) highlights risks of deploying LLMs in high-stakes coordination without adequate design." 476 }, 477 "drama_conflict": { 478 "score": 1, 479 "justification": "LLMs vs RL framing generates mild interest, but the comparison is mostly constructive rather than adversarial." 480 }, 481 "demo_ability": { 482 "score": 3, 483 "justification": "Code is publicly released on GitHub; Hanabi and Overcooked are well-known games that can be run interactively, making live demos straightforward." 484 }, 485 "brand_recognition": { 486 "score": 2, 487 "justification": "GPT-4 is prominently featured as the best-performing system; NAACL is a well-regarded NLP venue." 488 } 489 }, 490 "hn_data": { 491 "threads": [ 492 { 493 "hn_id": "45492803", 494 "title": "OpenZL: An open source format-aware compression framework", 495 "points": 434, 496 "comments": 107, 497 "url": "https://news.ycombinator.com/item?id=45492803", 498 "created_at": "2025-10-06T16:01:58Z" 499 }, 500 { 501 "hn_id": "40612538", 502 "title": "Benchmarking the Energy Costs of Large Language Model Inference (2023)", 503 "points": 2, 504 "comments": 0, 505 "url": "https://news.ycombinator.com/item?id=40612538", 506 "created_at": "2024-06-07T20:17:48Z" 507 }, 508 { 509 "hn_id": "38186733", 510 "title": "Pipeline Parallelism for DNN Inference with Practical Performance Guarantees", 511 "points": 2, 512 "comments": 0, 513 "url": "https://news.ycombinator.com/item?id=38186733", 514 "created_at": "2023-11-08T04:16:22Z" 515 }, 516 { 517 "hn_id": "37896876", 518 "title": "Sorting It Out in Hardware: A State-of-the-Art Survey", 519 "points": 1, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=37896876", 522 "created_at": "2023-10-16T07:49:46Z" 523 }, 524 { 525 "hn_id": "38698557", 526 "title": "Augmenting LLM with Human-Like Memory for Mobile Task Automation", 527 "points": 1, 528 "comments": 0, 529 "url": "https://news.ycombinator.com/item?id=38698557", 530 "created_at": "2023-12-19T17:29:15Z" 531 }, 532 { 533 "hn_id": "34332441", 534 "title": "Hunter: Using Change Point Detection to Hunt for Performance Regressions", 535 "points": 1, 536 "comments": 0, 537 "url": "https://news.ycombinator.com/item?id=34332441", 538 "created_at": "2023-01-10T22:28:46Z" 539 } 540 ], 541 "top_points": 434, 542 "total_points": 441, 543 "total_comments": 107 544 } 545 }