scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25217B)
      1 {
      2   "paper": {
      3     "title": "LLM-Coordination: Evaluating and Analyzing Multi-agent Coordination Abilities in Large Language Models",
      4     "authors": ["Saaket Agashe", "Yue Fan", "Anthony Reyna", "Xin Eric Wang"],
      5     "year": 2023,
      6     "venue": "North American Chapter of the Association for Computational Linguistics",
      7     "arxiv_id": "2310.03903",
      8     "doi": "10.18653/v1/2025.findings-naacl.448"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "GitHub link provided in abstract: https://github.com/eric-ai-lab/llm_coordination"
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The CoordinationQA suite of 198 questions is described as part of the benchmark contribution and the code repository is provided. The games used (Overcooked-AI, Hanabi) are publicly available benchmarks."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No environment specifications, requirements.txt, or dependency versions mentioned in the paper."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions provided in the paper. The GitHub link is given but no README instructions or reproduction scripts are described."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Tables 1, 3, and 5 report ± values (e.g., '198.8 ± 4.06', '13.33 ± 0.88')."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No statistical significance tests reported. Claims like 'LLM agents match or outperform RL baselines' are based on point comparisons without significance testing."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No formal effect sizes (Cohen's d, etc.) reported. Raw score differences are shown but no effect size measures."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Only 3 trials per condition for most experiments, with no justification for why 3 trials are sufficient. GPT-4-turbo cross-play in Overcooked uses a single trial per position 'due to cost and time constraints' (footnote 1)."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Standard deviations reported via ± notation in Tables 1, 3, and 5 across experimental runs."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Extensive baselines: PPO, PBT for Overcooked; BAD, SAD, OBL for Hanabi; Greedy baseline for Collab games; BC, PPO_BC, HSP for cross-play; Random baseline for CoordinationQA."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include HSP (Yu et al., 2023), OBL (Hu et al., 2021a), and established MARL methods that are standard for these benchmarks."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Ablation on ToM reasoning and verification steps in Table 3/6 and Table 2 (GPT-4-turbo w/out ToM Reasoning)."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple metrics used: total score (Overcooked, Hanabi), capture/escape rate, average turns, accuracy on EC/ToM/JP questions, Pearson correlation."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "No human evaluation of LLM agent outputs. Evaluation is entirely automated through game scores and MCQ accuracy. Human proxy agents are behavior cloning models, not actual humans."
     87       },
     88       "held_out_test_set": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No explicit separation of dev/test sets mentioned. The CoordinationQA questions and game scenarios are used directly without held-out splits."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Results broken down per game (Overcooked, Hanabi, CollabCapture, CollabEscape), per layout (5 Overcooked layouts), and per question type (EC, ToM, JP) in Figure 3."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Discussion of LLM failures in Hanabi (Section 5): 'LLM agents seem to struggle... GPT-4-turbo performs reasonably well, while other LLMs can barely complete the games.' Also discusses failure modes: misplays, bombing out, and open-source LLMs performing worse than random on JP."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Several negative results: LLMs fail at Hanabi compared to RL (Table 3), open-source LLMs perform worse than random on Joint Planning (Figure 3), GPT-3.5 and Mixtral struggle across most games."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims about LLMs excelling in environment-focused games but struggling with ToM are supported by Tables 1 (Overcooked success) and 3 (Hanabi failure). ZSC robustness claim supported by Tables 4 and 5."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Ablation study (Table 3/6) provides controlled single-variable manipulation for causal claims about ToM reasoning and verification steps. 'We attribute this failure to two factors' is supported by correlation analysis (Figure 2)."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "Title claims 'Multi-agent Coordination Abilities in Large Language Models' broadly, but results are limited to 4 specific games and 4 LLMs. The paper does not explicitly bound its claims to these specific settings."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "No substantive discussion of alternative explanations. The claim that failure in Hanabi is due to ToM requirements could also be explained by partial observability, action space complexity, or prompt design. These alternatives are not considered."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "Game scores are used as proxies for 'coordination ability' without discussing whether performance on these specific games generalizes to real coordination. The correlation between CoordinationQA and agentic performance (Figure 2) is presented as validation but the proxy gap is not acknowledged."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Specific model versions provided: 'gpt-4-0125-preview', 'GPT-3.5-turbo-0125', 'Mixtral 8x7B', 'GPT-4o' (Section 4.1.1)."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "Full prompt text provided in Appendices A, B, and C including game descriptions, state description formats, system prompts, ToM reasoning prompts, and verification prompts."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": false,
    150         "justification": "No mention of temperature, top-p, or other API parameters for LLM calls."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Detailed agentic framework described in Section 4.1.1: Memory (long-term, working, episodic), Reasoning (LLM), Grounding (action translation). ToM reasoning and verification steps described. Based on Cognitive Architectures for Language Agents (Sumers et al., 2023)."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "State description conversion process documented in detail (Appendix A.2): how game states are converted to natural language, distance calculations, inventory descriptions, cooker states, etc."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section 8 'Limitations' discusses latency/compute requirements, initial prompt configuration challenges, and manual curation of edge cases."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Specific limitations discussed: 'effective reasoning for coordination is achievable primarily with larger LLMs like GPT-4-turbo' (latency constraint), prompt sensitivity ('careful manual configuration of the initial prompts'), and scalability of CoordinationQA ('can hinder the ability to scale')."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not explicitly state what its results do NOT show. No explicit boundaries on what games, settings, or LLMs the conclusions extend to. Section 8 discusses limitations of the approach but not boundaries of the claims."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw experimental logs, game transcripts, or LLM responses made available. Only aggregated scores reported in tables."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "CoordinationQA creation process described in Section 3.2: 'We manually sampled edge cases from all 4 pure coordination games... generated a total of N=66 scenarios (25 from Overcooked, 28 from Hanabi, and 13 from the two Collab Games) and created 3 questions per scenario, resulting in 198 unique questions.'"
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. This is a benchmark evaluation of LLMs on game environments."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Pipeline from scenario selection to question creation documented: edge case sampling → question creation (3 types per scenario) → ambiguity filtering → final dataset of 198 questions (Section 3.2)."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 7 Acknowledgements: 'This research project has benefitted from the Microsoft Accelerate Foundation Models Research (AFMR) grant program.'"
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "All authors affiliated with University of California, Santa Cruz. No affiliation with the LLM providers being evaluated."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "Microsoft funded the research and Microsoft's products (through OpenAI partnership) are among those evaluated (GPT-4-turbo, GPT-3.5, GPT-4o). This is not acknowledged as a potential conflict."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No training data cutoff dates stated for any of the models used. The games (Overcooked, Hanabi) are well known, so their rules and strategies are likely well represented in the models' training data."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No discussion of whether LLMs have seen Hanabi strategies, Overcooked solutions, or the game rules in their training data. This is particularly relevant since these are well-known research benchmarks."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "Hanabi and Overcooked-AI are well-documented research benchmarks published in 2019-2020. The models tested (GPT-4, etc.) were trained after these benchmarks were published. No contamination discussion."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "Cost mentioned as a limitation ('significant latency and require substantial computational resources') and as reason for limited trials ('due to cost and time constraints' in footnote 1), but no actual cost figures reported."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "No total API spend, GPU hours, or computational budget stated despite using paid API models across many experiments."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Table 5 states 'All agents play three games with different seeds (same seeds across agents).' Variance across seeds reported via ± notation."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Number of trials stated: '3 trials' for CollabCapture/Escape (Table 2), '3 trials per model' for CoordinationQA (Section 4.2), 'three games with different seeds' for Hanabi cross-play (Table 5). Single trial noted for GPT-4-turbo cross-play in Overcooked (footnote 1)."
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No hyperparameter search described. Prompt configurations appear hand-tuned but no search process documented."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No description of how the final prompt configurations or reasoning strategies were selected. The paper states prompt engineering is left to 'future works focused on performance improvement' but doesn't describe how current configs were chosen."
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Multiple comparisons across games, layouts, and models without any correction applied. No significance tests at all, let alone corrected ones."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Authors implemented the LLM agent framework and compare against established RL baselines from prior work. No acknowledgment that their implementation choices could favor their approach."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Section 5 notes LLMs are 'significantly slower and larger than RL models' but no quantitative comparison of compute requirements. The compute asymmetry between LLM and RL agents is acknowledged but not measured."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "Section 3.1 discusses why these specific games were selected: 'carefully selected for their ability to isolate and highlight specific coordination challenges.' Section 5 analyzes what each game tests (environment-focused vs ToM-focused coordination) with correlation analysis (Figure 2)."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Different scaffolding used for different games (ToM reasoning and verification for Hanabi, not for Overcooked). Different LLMs compared using the same scaffold, but the scaffold is designed by the authors and its effect vs model capability is not disentangled."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "No discussion of temporal leakage. Hanabi (2020) and Overcooked-AI (2019) were published well before GPT-4's training cutoff. The models may have seen game strategies and solutions."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": true,
    345         "answer": false,
    346         "justification": "No discussion of whether game descriptions in the prompt leak strategy information that wouldn't be available to RL agents learning from scratch."
    347       },
    348       "non_independence_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of whether the CoordinationQA questions or game scenarios could overlap with training data content."
    352       },
    353       "leakage_detection_method": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No leakage detection methods applied."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "LLM agents (GPT-4-turbo) match or outperform RL baselines in environment-focused coordination games (Overcooked-AI)",
    363       "evidence": "Table 1 shows GPT-4-turbo outperforming or matching PPO and PBT across 3 of 5 Overcooked layouts (Section 5).",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "LLM agents struggle in games requiring active Theory of Mind reasoning (Hanabi)",
    368       "evidence": "Table 3 shows GPT-4-turbo scoring 13.33 vs RL methods at 23-24 in Hanabi. Other LLMs score near zero.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "ToM reasoning and verification steps improve LLM agent performance in Hanabi",
    373       "evidence": "Table 3/6 ablation: removing verification drops from 13.33 to 10.33, removing both drops to 4.33 with 100% bomb rate.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "LLM agents are robust to unseen partners (Zero-Shot Coordination) unlike RL methods",
    378       "evidence": "Table 4: GPT-4-turbo matches/outperforms HSP with human proxies. Table 5: GPT-4-turbo maintains performance with OBL partners (15.0 vs 13.66 self-play) while SAD drops from 23.66 to 11.33/8.00.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "LLMs perform best on Environment Comprehension, worst on Joint Planning in CoordinationQA",
    383       "evidence": "Figure 3 shows GPT-4-turbo >80% on EC but <40% on JP. Open-source LLMs perform worse than random on JP.",
    384       "supported": "strong"
    385     }
    386   ],
    387   "methodology_tags": ["benchmark-eval"],
    388   "key_findings": "LLM agents (particularly GPT-4-turbo) can match or outperform RL baselines in fully observable coordination games (Overcooked-AI) without any training, but significantly underperform in games requiring Theory of Mind reasoning (Hanabi). LLMs show robustness to unseen partners (zero-shot coordination), unlike RL methods that suffer performance degradation. Fine-grained analysis via CoordinationQA reveals LLMs excel at environment comprehension but struggle with joint planning, with open-source models performing worse than random on planning questions.",
    389   "red_flags": [
    390     {
    391       "flag": "Very small trial counts",
    392       "detail": "Only 3 trials per condition for most experiments. GPT-4-turbo Overcooked cross-play uses a single trial per position, acknowledged as a cost constraint. With such small N, reported ± values may not reflect true variance."
    393     },
    394     {
    395       "flag": "No contamination analysis for well-known benchmarks",
    396       "detail": "Overcooked-AI and Hanabi are established benchmarks with extensive online strategy discussions. GPT-4 could have absorbed game strategies during training, which would inflate performance on environment-focused tasks. This is never discussed."
    397     },
    398     {
    399       "flag": "Microsoft funding conflict not acknowledged",
    400       "detail": "Research funded by Microsoft AFMR while evaluating OpenAI models (GPT-4-turbo, GPT-3.5, GPT-4o). Microsoft is a major OpenAI investor. This potential conflict is not acknowledged."
    401     },
    402     {
    403       "flag": "Unfair compute comparison with RL baselines",
    404       "detail": "LLM agents likely require far more compute than RL agents, but this asymmetry is only noted qualitatively ('significantly slower and larger') and never quantified. Claiming LLMs 'match or outperform' RL without accounting for compute disparity is misleading."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "The Hanabi Challenge: A New Frontier for AI Research",
    410       "authors": ["Nolan Bard", "Jakob N. Foerster"],
    411       "year": 2020,
    412       "relevance": "Core benchmark used for evaluating multi-agent coordination requiring Theory of Mind."
    413     },
    414     {
    415       "title": "On the Utility of Learning about Humans for Human-AI Coordination",
    416       "authors": ["Micah Carroll", "Rohin Shah"],
    417       "year": 2019,
    418       "relevance": "Overcooked-AI benchmark for human-AI coordination, core evaluation environment in this paper."
    419     },
    420     {
    421       "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models",
    422       "authors": ["Guanzhi Wang"],
    423       "year": 2023,
    424       "arxiv_id": "2305.16291",
    425       "relevance": "LLM-based agent showing planning abilities in virtual environments."
    426     },
    427     {
    428       "title": "MetaGPT: Meta Programming for a Multi-Agent Collaborative Framework",
    429       "authors": ["Sirui Hong"],
    430       "year": 2023,
    431       "arxiv_id": "2308.00352",
    432       "relevance": "Multi-LLM framework for collaborative task completion."
    433     },
    434     {
    435       "title": "ChatDev: Communicative Agents for Software Development",
    436       "authors": ["Chen Qian"],
    437       "year": 2024,
    438       "arxiv_id": "2307.07924",
    439       "relevance": "Multi-agent LLM framework for software development coordination."
    440     },
    441     {
    442       "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Language Model Society",
    443       "authors": ["Guohao Li"],
    444       "year": 2023,
    445       "arxiv_id": "2303.17760",
    446       "relevance": "Multi-agent LLM communication and collaboration framework."
    447     },
    448     {
    449       "title": "Theory of Mind for Multi-Agent Collaboration via Large Language Models",
    450       "authors": ["Huao Li"],
    451       "year": 2023,
    452       "relevance": "Directly relevant — investigates ToM in LLM multi-agent collaboration."
    453     },
    454     {
    455       "title": "ProAgent: Building Proactive Cooperative AI with Large Language Models",
    456       "authors": ["Ceyao Zhang"],
    457       "year": 2023,
    458       "arxiv_id": "2308.11339",
    459       "relevance": "LLM-based cooperative agent architecture for Overcooked-AI."
    460     },
    461     {
    462       "title": "Cognitive Architectures for Language Agents",
    463       "authors": ["Theodore R. Sumers", "Shunyu Yao"],
    464       "year": 2023,
    465       "arxiv_id": "2309.02427",
    466       "relevance": "Design framework for LLM agent architectures, used as basis for this paper's agent design."
    467     },
    468     {
    469       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    470       "authors": ["Shunyu Yao"],
    471       "year": 2023,
    472       "arxiv_id": "2210.03629",
    473       "relevance": "Reasoning strategy used as baseline for LLM agent implementations."
    474     },
    475     {
    476       "title": "Exploring Large Language Models for Communication Games: An Empirical Study on Werewolf",
    477       "authors": ["Yuzhuang Xu"],
    478       "year": 2024,
    479       "arxiv_id": "2309.04658",
    480       "relevance": "Empirical study of LLMs in multi-agent game settings."
    481     }
    482   ]
    483 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs