scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26405B)
      1 {
      2   "paper": {
      3     "title": "BattleAgentBench: A Benchmark for Evaluating Cooperation and Competition Capabilities of Language Models in Multi-Agent Systems",
      4     "authors": ["Wei Wang", "Dan Zhang", "Tao Feng", "Boyan Wang", "Jie Tang"],
      5     "year": 2024,
      6     "venue": "arXiv",
      7     "arxiv_id": "2408.15971",
      8     "doi": "10.48550/arXiv.2408.15971"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The abstract states 'The code for BattleAgentBench is available at https://github.com/THUDM/BattleAgentBench.' A working GitHub URL is provided."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The benchmark is the artifact itself — the game environment and stage configurations are released via the GitHub repository. The benchmark does not rely on a separate static dataset; the evaluation data is generated through game interactions using the released code."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No mention of a requirements.txt, Dockerfile, conda environment, or detailed environment setup with library versions found in the paper. The paper does not describe what software dependencies are needed beyond the LLM APIs."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper does not include step-by-step reproduction instructions. While the code is released on GitHub, the paper itself does not describe how to run the experiments, what commands to execute, or how to replicate the results."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "All results in Tables 2-5 are reported as point estimates (e.g., scores, accuracy) with no confidence intervals, error bars, or ± notation. The appendix (Section 7.1) mentions running multiple times (3 for API-based, 5 for open-source), but no uncertainty measures are reported."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes comparative claims (e.g., 'claude3.5-sonnet and gpt-4o-mini achieved relatively high scores, while other models scored comparatively lower') but provides no statistical significance tests to support these claims."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper reports raw scores and accuracy values but does not provide effect sizes. The ablation study (Table 5) reports ΔScore but without baseline context or standardized effect size measures."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The appendix states API-based models run each stage 3 times and open-source models 5 times, but there is no justification for why these numbers were chosen and no power analysis. The asymmetric run counts (3 vs 5) are not explained."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "Despite running each stage multiple times (3 or 5 runs), the paper reports only averaged scores in the tables. No standard deviations, ranges, or other variance measures are reported across runs."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper includes a random baseline in all evaluation tables (Tables 2-4), showing the expected performance of random actions. This provides a lower-bound comparison for all models."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The evaluated models include contemporary models at the time of writing (2024): Claude 3.5 Sonnet (June 2024), GPT-4o-mini, Gemma2-9b, Qwen2-7b, and Yi-1.5-9b. These represent recent API-based and open-source models."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Section 4.3 presents an ablation study that removes the cooperation interface between agents in Level 3, comparing Table 4 (with cooperation) to Table 5 (without cooperation) to assess the effectiveness of the collaboration component."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper uses multiple metrics: Forward Distance (F Dis), Format Accuracy (F Acc), Move Accuracy (M Acc) for Level 1, and Score, F Acc, M Acc for Levels 2-3. Goal completion rate is also computed from forward distance."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "The paper makes claims about cooperation quality and strategic behavior (Section 4.4 case study describes cooperation strategies qualitatively), but all evaluation is automated through game metrics (scores, accuracy). No human evaluation of cooperation quality, strategy sophistication, or agent decision-making is included."
     85       },
     86       "held_out_test_set": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "The benchmark evaluates LLMs as game agents in a live interactive environment. There is no static train/test split — each evaluation generates new game states through agent-environment interaction with randomly sampled initial positions."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down by stage (7 stages across 3 levels) and by model category (API-based vs open-source), with separate tables for each level (Tables 2, 3, 4). Per-model results are shown for all 11 models plus a random baseline."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 4.3 discusses that cooperation among weaker models actually lowers their scores, indicating 'cooperation between these models is ineffective or even harmful.' The case study (Section 4.4, Figure 6) shows qualitative examples of agent behavior, though failure cases could be more detailed."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The ablation study in Section 4.3 reports that removing cooperation actually improves scores for all models except claude3.5-sonnet and gpt-4o-mini, demonstrating that cooperation is counterproductive for weaker models. Some open-source models (e.g., llama3-8b-instruct) perform worse than random."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims are supported: (1) API-based models perform excellently on simple tasks — confirmed in Table 2; (2) open-source small models struggle with simple tasks — confirmed in Table 2 and Figure 5; (3) API-based models have demonstrated some collaborative capabilities but with room for improvement — confirmed in the ablation study (Section 4.3)."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The main causal claim is that cooperation improves performance for some models (Section 4.3). This is supported by a controlled ablation study comparing performance with and without the cooperation interface (Table 4 vs Table 5), which is an adequate single-variable manipulation."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The title claims to evaluate 'Cooperation and Competition Capabilities of Language Models in Multi-Agent Systems' generally, but the evaluation is specific to a single game environment (Battle City tank game). The paper does not discuss whether findings generalize to other cooperative/competitive multi-agent settings beyond this one game."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper does not discuss alternative explanations for the observed results. For example, the performance gap between API and open-source models could be due to model size, instruction tuning quality, spatial reasoning ability, or format following capability, but these are not disentangled or discussed."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "Most model versions are not fully specified. 'claude3.5-sonnet-0620' is specified with a date. 'gpt-3.5-turbo-0125' has a version. However, 'gpt-4o-mini' lacks a snapshot date. Open-source models like 'internlm2.5-7b-chat', 'mistral-7b-instruct', 'qwen2-7b-Instruct' lack specific version identifiers or checkpoint dates. The specification is inconsistent across models."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Full prompt templates for all 7 stages are provided in the appendix (Figures 1-7 in Section 7.2), including game state data format, goals, game instructions, operation options, and output format. The templates contain placeholders (%s, %d) for dynamic game state data, but the fill values are the actual game state which varies per turn — the templates themselves are complete and reproducible."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "No mention of temperature, top-p, max tokens, or other sampling parameters used when calling the LLM APIs or running open-source models. These settings can significantly affect agent behavior."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The agent scaffolding is described in detail: Section 3.1 describes the agent interaction framework (game server sends observations, agents return actions), the communication interface for cooperation (Sections 3.3-3.5), the primary/secondary agent setup (Section 4.1), and the MDP formulation (Section 2). The prompts include thought process templates for agent reasoning."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3.1 describes how 'game observation data is converted into text format based on predefined templates.' The appendix (Section 7.1) documents the map generation process (random sampling of agent positions within a small range) and stage settings (Table 6: turns, agents, teams, NPCs per stage)."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The conclusion (Section 6) is a brief summary without any discussion of limitations."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "No threats to validity are discussed anywhere in the paper. There is no discussion of specific methodological concerns such as the limited number of runs, the choice of reference model (yi-1.5-9b), or the single-game-environment limitation."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "The paper does not explicitly state what the results do NOT show. There is no discussion of scope boundaries — e.g., that findings are specific to the Battle City game environment, that the cooperation metric depends on the reference model choice, or that results may not transfer to other multi-agent settings."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper does not release raw game logs, model outputs, or individual run data. Only averaged results are reported in the tables. The code is released on GitHub, but the actual experimental data (game trajectories, model responses) is not."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 4.1 and the appendix (Section 7.1) describe how data is collected: each stage runs for a specified number of turns (Table 6), with random map generation, multiple runs per model (3 for API, 5 for open-source), and averaged scores. The evaluation metrics and their computation are formally defined (Equations 2-4)."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants are involved. The data source is LLM game agent interactions, and the models evaluated are publicly available API-based and open-source LLMs."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The pipeline is documented: game server loads stage settings → sends observations to agents → agents (LLMs) return actions → server updates state and calculates rewards → process repeats for specified turns → metrics computed from final states. The evaluation setup (Section 7.1) describes the run procedure and averaging."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "No funding source or acknowledgments section is present in the paper. The authors are from Tsinghua University's KEG group, but no grants or sponsorship are disclosed."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly stated: all five authors are from 'The Knowledge Engineering Group (KEG), Tsinghua University.' The authors are affiliated with an academic institution, not with any of the evaluated model providers."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "Since funding is not disclosed at all, independence of the funder cannot be assessed. The absence of funding disclosure does not satisfy this criterion."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement or financial interest declarations are present in the paper."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper evaluates LLM capabilities on game reasoning tasks. While the benchmark is novel, the training cutoffs of the evaluated models are not stated. This matters because models may have been trained on similar game reasoning tasks or the Battle City game itself."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "There is no discussion of whether any evaluated models may have been exposed to Battle City game data, game strategy data, or similar game reasoning benchmarks during training."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "While BattleAgentBench is a novel benchmark with dynamically generated maps, the underlying game is Battle City (a classic 1985 NES game). Models may have been trained on descriptions of Battle City rules, strategies, and gameplay, potentially advantaging some models. The paper does not discuss this contamination risk."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants are involved in this benchmark evaluation study."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this benchmark evaluation study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved in this benchmark evaluation study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved in this benchmark evaluation study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved in this benchmark evaluation study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved in this benchmark evaluation study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved in this benchmark evaluation study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": false,
    275         "justification": "The appendix (Section 7.1) mentions that 'the setting of the number of turns takes into account the cost factor of calling LLM' but does not report actual inference costs, API costs, tokens consumed, or wall-clock time for the evaluations."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No computational budget is stated. The paper does not report total API costs, GPU hours for running open-source models, or overall evaluation time."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "API-based models perform excellently on simple tasks but open-source small models struggle with simple tasks.",
    287       "evidence": "Table 2 shows API-based models achieve substantially higher Forward Distance scores (e.g., claude3.5-sonnet: 12.8 avg, gpt-4o-mini: 11.5 avg) compared to open-source models (best: internlm2.5-7b at 5.7 avg). Figure 5 shows goal completion rates above 50% only for API-based models.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "In difficult tasks requiring collaborative and competitive abilities, API-based models have demonstrated some collaborative capabilities but there is still enormous room for improvement.",
    292       "evidence": "Tables 3-4 show that only claude3.5-sonnet and gpt-4o-mini achieve notable scores in Levels 2-3. The ablation study (Table 5, Section 4.3) shows cooperation only improves scores for these two models. Other API models (gpt-3.5, glm-4-flash) show minimal advantage over open-source models in complex stages.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Only claude3.5-sonnet and gpt-4o-mini benefit from cooperation; cooperation is ineffective or harmful for other models.",
    297       "evidence": "Section 4.3 and Table 5: After removing the cooperation interface, only claude3.5-sonnet and gpt-4o-mini show decreased scores (ΔScore: -2.3 and -0.3 respectively), while all other models show increased scores, indicating cooperation hurts their performance.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Claude3.5-sonnet maintains stable spatial awareness ability across increasingly complex stages.",
    302       "evidence": "Tables 2-4 consistently show claude3.5-sonnet maintaining high Move Accuracy (M Acc ≥ 0.87) across all stages, while other models show declining M Acc as complexity increases.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Open-source small models have poor two-dimensional spatial perception.",
    307       "evidence": "Table 2 shows open-source models have Move Accuracy around 0.41-0.65, compared to 0.83-0.98 for API-based models. llama3-8b-instruct actually moves backward (negative forward distance: -1.3 avg), performing worse than random.",
    308       "supported": "strong"
    309     }
    310   ],
    311   "methodology_tags": ["benchmark-eval"],
    312   "key_findings": "BattleAgentBench evaluates 11 LLMs across 7 stages of increasing difficulty in a Battle City tank game, testing navigation, cooperation, and competition capabilities. API-based models (especially Claude 3.5 Sonnet and GPT-4o-mini) substantially outperform open-source 7-9B models on both simple navigation and complex multi-agent tasks. An ablation study removing the cooperation interface shows that only Claude 3.5 Sonnet and GPT-4o-mini genuinely benefit from inter-agent cooperation, while cooperation actually hurts performance for all other models tested.",
    313   "red_flags": [
    314     {
    315       "flag": "No uncertainty quantification despite multiple runs",
    316       "detail": "The paper runs each stage 3 times (API models) or 5 times (open-source models) and reports averages, but never reports standard deviations, confidence intervals, or any variance measure. This makes it impossible to assess whether observed differences are meaningful or within noise."
    317     },
    318     {
    319       "flag": "Asymmetric evaluation conditions",
    320       "detail": "API-based models run each stage 3 times while open-source models run 5 times. This asymmetry is not justified, and it means the averaged results for the two groups rest on different sample sizes and are not equally reliable."
    321     },
    322     {
    323       "flag": "Reference model choice may bias multi-agent results",
    324       "detail": "In Levels 2-3, secondary agents use yi-1.5-9b as a fixed reference model. Since yi-1.5-9b is one of the weakest models tested, the evaluated primary agent is always paired with weak teammates or competing against weak opponents. This may inflate scores for stronger models and depress scores for weaker models, so absolute scores and possibly model rankings could differ if a stronger reference model were used."
    325     },
    326     {
    327       "flag": "No limitations section",
    328       "detail": "The paper contains no discussion of limitations, threats to validity, or scope boundaries. Major concerns like single-game generalizability, reference model dependence, and small run counts are not acknowledged."
    329     },
    330     {
    331       "flag": "Broad claims from narrow evaluation",
    332       "detail": "The paper claims to evaluate 'Cooperation and Competition Capabilities of Language Models in Multi-Agent Systems' but tests only a single game environment (Battle City). The cooperative behavior tested (sending text messages to coordinate tank movements) is a very specific form of cooperation that may not generalize."
    333     },
    334     {
    335       "flag": "Missing LLM hyperparameters",
    336       "detail": "No temperature, top-p, max tokens, or other sampling parameters are reported for any of the 11 models. These settings significantly affect LLM output behavior, and their omission makes the results difficult to reproduce even with the released code."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "AgentBench: Evaluating LLMs as Agents",
    342       "authors": ["Xiao Liu", "Hao Yu", "Hanchen Zhang"],
    343       "year": 2023,
    344       "arxiv_id": "2308.03688",
    345       "relevance": "Major benchmark for evaluating LLMs as agents across diverse interactive environments, directly relevant to the survey's scope of LLM capability evaluation."
    346     },
    347     {
    348       "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework",
    349       "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang"],
    350       "year": 2023,
    351       "arxiv_id": "2308.08155",
    352       "relevance": "Foundational multi-agent LLM framework relevant to agentic AI architectures and workflows."
    353     },
    354     {
    355       "title": "Large Language Model based Multi-Agents: A Survey of Progress and Challenges",
    356       "authors": ["Taicheng Guo", "Xiuying Chen", "Yaqi Wang"],
    357       "year": 2024,
    358       "arxiv_id": "2402.01680",
    359       "relevance": "Survey of LLM-based multi-agent systems covering progress and challenges, directly relevant to the survey's coverage of multi-agent AI research."
    360     },
    361     {
    362       "title": "A Survey on Large Language Model-based Game Agents",
    363       "authors": ["Sihao Hu", "Tiansheng Huang", "Fatih Ilhan"],
    364       "year": 2024,
    365       "arxiv_id": "2404.02039",
    366       "relevance": "Survey covering LLM-based game agents across multiple game genres, relevant to benchmark evaluation methodology for agent capabilities."
    367     },
    368     {
    369       "title": "AvalonBench: Evaluating LLMs Playing the Game of Avalon",
    370       "authors": ["Jonathan Light", "Min Cai", "Sheng Shen"],
    371       "year": 2023,
    372       "relevance": "Game-based benchmark for evaluating LLM social deduction and cooperation capabilities, directly comparable to BattleAgentBench."
    373     },
    374     {
    375       "title": "Building Cooperative Embodied Agents Modularly with Large Language Models",
    376       "authors": ["Hongxin Zhang", "Weihua Du", "Jiaming Shan"],
    377       "year": 2024,
    378       "relevance": "ICLR paper on modular cooperative embodied agents using LLMs, relevant to multi-agent cooperation evaluation methodology."
    379     },
    380     {
    381       "title": "GPT-4 Technical Report",
    382       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    383       "year": 2023,
    384       "arxiv_id": "2303.08774",
    385       "relevance": "Technical report for one of the major LLM families evaluated in agent benchmarks, relevant to understanding model capabilities."
    386     },
    387     {
    388       "title": "Suspicion-Agent: Playing Imperfect Information Games with Theory of Mind Aware GPT-4",
    389       "authors": ["Jiaxian Guo", "Bo Yang", "Paul Yoo"],
    390       "year": 2023,
    391       "arxiv_id": "2309.17277",
    392       "relevance": "Evaluates LLM agent capabilities in game-theoretic settings, relevant to competitive multi-agent evaluation."
    393     },
    394     {
    395       "title": "MindAgent: Emergent Gaming Interaction",
    396       "authors": ["Ran Gong", "Qiuyuan Huang", "Xiaojian Ma"],
    397       "year": 2023,
    398       "arxiv_id": "2309.09971",
    399       "relevance": "Assesses multi-agent collaboration efficiency in gaming environments, directly comparable benchmark evaluation methodology."
    400     },
    401     {
    402       "title": "Agent-Pro: Learning to Evolve via Policy-Level Reflection and Optimization",
    403       "authors": ["Wenqi Zhang", "Ke Tang", "Hai Wu"],
    404       "year": 2024,
    405       "arxiv_id": "2402.17574",
    406       "relevance": "Evaluates LLM agent strategy learning in competitive settings, relevant to agentic AI capability evaluation."
    407     }
    408   ]
    409 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs