scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24383B)
      1 {
      2   "paper": {
      3     "title": "Harnessing Language for Coordination: A Framework and Benchmark for LLM-Driven Multi-Agent Control",
      4     "authors": ["Timothée Anne", "Noah Syrkis", "Meriem Elhosni", "Florian Turati", "Franck Legendre", "Alain Jaquier", "Sebastian Risi"],
      5     "year": 2024,
      6     "venue": "IEEE Transactions on Games",
      7     "arxiv_id": "2412.11761",
      8     "doi": "10.1109/TG.2025.3564042"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "HIVE enables a single human to coordinate up to 2,000 agents via natural language with an LLM. Claude 3.5 Sonnet performed best across five ability tests (19/50 wins), followed by Gemini 2 (18/50). Human-machine collaboration outperforms LLM-alone planning, and LLMs struggle with visual/spatial map understanding, performing better with textual map descriptions than images. Even state-of-the-art LLMs are sensitive to minor prompt wording variations.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper references a project page at hive.syrkis.com with videos, and the game/framework is described as implemented in JAX. While no explicit GitHub URL is given in the paper text, the project page serves as a public artifact."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset download link is provided. The benchmark scenarios, maps, and prompts are described in the appendix but no downloadable data archive is referenced."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions JAX and Python but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The paper describes the system and prompts in detail in the appendix but does not include a reproducibility guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results are reported as counts of wins/losses/ties out of 10 prompts and box plots (Fig. 5), but no confidence intervals or error bars on win rates are provided."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper compares 9 LLMs and claims Sonnet performs best, but no statistical significance tests are used. Comparisons are based on raw counts and visual inspection of box plots."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No effect sizes reported. Results are presented as win/loss/tie counts and percentage metrics in box plots without formal effect size measures."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "10 prompt variations per ability test per model is used without justification for why 10 was chosen or whether this is sufficient to distinguish model performance."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Box plots in Fig. 5 show the distribution (quartiles and spread) of continuous metrics across the 10 prompt variations for each model and ability test."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Nine LLMs are compared including Llama3-8B as a small open-source baseline (Sec. IV-B). The paper also compares HIVE with vs. without human help (Sec. V-B)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Models tested include GPT-4o, Claude 3.5 Sonnet, Gemini 2.0, and o1-mini, which were contemporary state-of-the-art at time of writing."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Multiple ablation studies: temperature effect (Appendix B/Fig. 11), scaling with number of units (Sec. V-A), with vs. without human help (Sec. V-B), and text vs. image map descriptions (Sec. V-C)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper uses win/loss/tie counts, percentage of enemies eliminated, covered distance to objective, and plan validity as evaluation metrics across different ability tests."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation of system outputs is not applicable here — the benchmark has deterministic game outcomes (win/loss/tie) that serve as ground truth evaluation."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is not a train/test setup. The benchmark evaluates LLMs zero-shot on game scenarios without any training or tuning phase."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down across all five ability tests (Coordinate, Exploit weakness, Follow markers, Exploit terrain, Strategize points) per model in Table I and Figs. 5-6."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses failure modes: LLMs' inability to extract positions from raw data (Sec. VI), visual-spatial reasoning failures (Sec. V-C), sensitivity to prompt wording, and Llama3-8B's inability to follow structured output."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results: HIVE without human help performs worse (Sec. V-B), image-based map input degrades performance vs. text (Sec. V-C), Llama3-8B completely fails, most models have <50% success rates on most tasks."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'promising results' on multi-agent tasks and 'critical limitations' — both are supported by Table I showing partial success and the discussion of spatial/strategic planning failures."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims are modest and supported by controlled ablations: human help improves performance (Sec. V-B, controlled comparison), text descriptions outperform images (Sec. V-C, same prompts/models). The ablation designs are single-variable manipulations."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper bounds claims to the specific game benchmark and tested models. The title references 'A Framework and Benchmark' rather than making general claims. Discussion acknowledges limitations of the specific setting."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section VI discusses alternative explanations: poor Exploit weakness performance may be due to LLMs' inability to process numerical data (not strategic reasoning per se), and visual failures may be due to out-of-distribution map images."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures game win/loss outcomes directly and does not overclaim beyond them. The five ability tests are clearly defined operational measures, and the paper does not frame them as proxies for broader capabilities."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Exact model versions are specified in Sec. IV-B: 'gpt-4o-2024-11-20', 'claude-3-5-sonnet-20241022', 'gpt-4o-mini-2024-07-18', 'o1-mini-2024-09-12', 'claude-3-5-haiku-20241022', 'gemini-2.0-flash-exp', 'gemini-1.5-pro', 'gemini-1.5-flash'."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "All 50 player prompts are provided in Appendix Sec. I, and the full system prompt instruction is provided in Appendix Sec. E1."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Temperature is explicitly discussed with an ablation study (Appendix B), and the paper states temperature was set to 0 for all models except o1-mini (fixed at 1)."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The HIVE framework is described in detail in Section III including the plan structured output, behavior tree assignment, objective checking, and game-to-text conversion pipeline (Fig. 2)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "The paper describes how game state is converted to text for the LLM (Sec. III-E), how maps are described textually (Sec. III-E2), and how plans are parsed and validated (Sec. III-D1)."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section VI (Discussion) serves as a substantial limitations discussion, covering LLM inference time, visual reasoning challenges, structured output limitations, and numerical processing issues."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed: LLMs' sensitivity to prompt wording variations (Sec. IV-C), inability to process mathematical/positional data (Sec. VI), non-determinism even at temperature 0 (Sec. VI citing [38])."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper states scope boundaries: single plan inference per game (not real-time replanning), specific game mechanics (4 terrain types, 3 unit types), and notes that results may not transfer to more complex games."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Raw experimental data (individual game logs, plan outputs per run) is not made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The experimental procedure is well-described: 10 prompt variations per ability test per model, deterministic replay with saved plans and random seeds (Sec. III-C), API calls to each LLM."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants in the study. The benchmark uses automated LLM evaluations on predefined scenarios."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: player prompt → LLM API call → structured output parsing → plan validation → game execution → outcome recording (Sec. III-C, III-D)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funding is disclosed in Acknowledgments: 'Funded by the armasuisse S+T project F00-007.'"
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: IT University of Copenhagen and armasuisse Science+Technology. No authors are affiliated with the LLM providers being evaluated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "armasuisse (Swiss defense procurement agency) funds the research but has no financial stake in which LLM performs best on the benchmark."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff dates are stated for any of the 9 models tested."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether any game scenarios, behavior tree formats, or similar strategy game tasks appeared in LLM training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "The benchmark is novel (reducing contamination risk), but the paper does not discuss this advantage or address contamination at all."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Inference wall-clock times are reported in Fig. 10 and Sec. IV-C with medians and quartiles per model (e.g., '4o with 6.4 s [3.3 s, 12.2 s]', 'Sonnet with 12.6 s [9.9 s, 14.2 s]')."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No total compute budget is stated (total API costs, total number of API calls, GPU hours for game simulation, etc.)."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The paper uses deterministic game replay with saved random seeds but does not report sensitivity to different random seeds. Variation comes from prompt wording, not seed variation."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Clearly stated: 10 prompt variations per ability test per model, for a total of 50 evaluations per model (Sec. IV-C)."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget reported. Temperature was set to 0 based on an ablation, but no other hyperparameter tuning is discussed."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "Temperature selection is justified via the ablation study in Appendix B showing that higher temperatures increase invalid plans. The choice of temperature 0 is data-driven."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "Nine models compared across five tasks with no statistical tests at all, let alone multiple comparison correction."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors built the HIVE framework and benchmark, which may favor certain LLM behaviors. This author-evaluation bias is not discussed."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Models have very different inference times (3.9 s to 12.6 s median) and capabilities, but performance is not analyzed as a function of compute. The paper notes inference times but does not formally relate them to performance."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper does not discuss whether the five ability tests actually measure the claimed coordination capabilities or whether game outcomes could be achieved through simpler strategies."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "All models use the same HIVE scaffold, so the scaffold is not a confound in model comparisons."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether similar strategy game tasks or the structured output format appeared in model training data before the benchmark was created."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the game state information provided to models leaks solution information."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The 10 prompt variations per task are intentionally similar — the paper acknowledges sensitivity to wording but does not analyze independence of these samples."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "HIVE can coordinate swarms of up to 2,000 agents through natural language dialog with an LLM",
    365       "evidence": "Scaling study in Sec. V-A/Fig. 7 tests up to 4,000 units (2,000 per side). Sonnet produces valid plans at all tested scales.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Claude 3.5 Sonnet is the best-performing LLM on the HIVE benchmark",
    370       "evidence": "Table I shows Sonnet achieves 19/50 total wins, the highest among all models. Sonnet is the only model to succeed on all five ability tests.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "HIVE performs better with human help than without",
    375       "evidence": "Sec. V-B/Fig. 8 compares with and without human prompts. Win rates decrease and median performance drops without human guidance.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "LLMs perform significantly worse with visual map input than textual descriptions",
    380       "evidence": "Sec. V-C/Fig. 9 shows win rates and median performance drop when replacing text descriptions with raw images, grid images, or scaffolding images.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "LLMs are sensitive to minor prompt wording variations",
    385       "evidence": "Sec. IV-C notes 'slight changes in the prompt result in drastic changes in the plan and the execution.' Fig. 6 shows variable outcomes across prompt variations.",
    386       "supported": "strong"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "No statistical tests for model comparisons",
    392       "detail": "Nine LLMs are compared and ranked with only 10 prompt variations per ability test (50 evaluations per model), using raw win counts without any statistical significance testing. The difference between Sonnet (19/50) and Gemini 2 (18/50) may not be meaningful."
    393     },
    394     {
    395       "flag": "Small sample size for strong claims",
    396       "detail": "10 prompt variations per ability test per model is a small sample. Given the acknowledged sensitivity to prompt wording, the results may not be stable."
    397     },
    398     {
    399       "flag": "No contamination discussion for a novel benchmark",
    400       "detail": "While the benchmark is novel (reducing contamination risk), the structured output format and strategy game concepts may appear in training data. This is not discussed."
    401     },
    402     {
    403       "flag": "Potential author-benchmark bias",
    404       "detail": "The authors designed both the benchmark and the HIVE system. The structured output format, behavior trees, and scenario design may inadvertently favor certain model behaviors without independent validation."
    405     }
    406   ],
    407   "cited_papers": [
    408     {
    409       "title": "Human-level play in the game of Diplomacy by combining language models with strategic reasoning",
    410       "authors": ["E. Dinan", "G. Farina", "C. Flaherty"],
    411       "year": 2022,
    412       "relevance": "Seminal work on LLMs achieving human-level strategic game play through language coordination."
    413     },
    414     {
    415       "title": "SwarmBrain: Embodied agent for real-time strategy game StarCraft II via large language models",
    416       "authors": ["X. Shao", "W. Jiang", "F. Zuo", "M. Liu"],
    417       "year": 2024,
    418       "relevance": "Hybrid LLM approach combining high-level strategic planning with low-level tactical execution in RTS games."
    419     },
    420     {
    421       "title": "Large Language Model based Multi-Agents: A Survey of Progress and Challenges",
    422       "authors": ["T. Guo", "X. Chen", "Y. Wang"],
    423       "year": 2024,
    424       "relevance": "Comprehensive survey of LLM-based multi-agent systems covering agent profiling, communication, and environment interaction."
    425     },
    426     {
    427       "title": "LLM as a Mastermind: A Survey of Strategic Reasoning with Large Language Models",
    428       "authors": ["Y. Zhang", "S. Mao", "T. Ge"],
    429       "year": 2024,
    430       "relevance": "Survey of LLM strategic reasoning capabilities relevant to evaluating LLM coordination abilities."
    431     },
    432     {
    433       "title": "BALROG: Benchmarking Agentic LLM and VLM Reasoning On Games",
    434       "authors": ["D. Paglieri"],
    435       "year": 2024,
    436       "relevance": "Benchmark for evaluating LLM/VLM reasoning in game environments, showing limitations in complex dynamic settings."
    437     },
    438     {
    439       "title": "Voyager: An open-ended embodied agent with large language models",
    440       "authors": ["G. Wang", "Y. Xie", "Y. Jiang"],
    441       "year": 2023,
    442       "arxiv_id": "2305.16291",
    443       "relevance": "LLM-based agent generating code for game control in Minecraft, an alternative approach to HIVE's structured output."
    444     },
    445     {
    446       "title": "JaxMARL: Multi-Agent RL Environments in JAX",
    447       "authors": ["A. Rutherford", "B. Ellis", "M. Gallici"],
    448       "year": 2023,
    449       "relevance": "Foundation environment for multi-agent RL research using JAX parallelization, directly informs HIVE's implementation."
    450     },
    451     {
    452       "title": "Vision language models are blind",
    453       "authors": ["P. Rahmanzadehgervi", "L. Bolton", "M. R. Taesiri"],
    454       "year": 2025,
    455       "relevance": "Documents VLM failures on trivial spatial reasoning tasks, supporting HIVE's finding that image-based map input fails."
    456     },
    457     {
    458       "title": "An empirical study of the non-determinism of chatgpt in code generation",
    459       "authors": ["S. Ouyang", "J. M. Zhang", "M. Harman"],
    460       "year": 2025,
    461       "relevance": "Studies LLM non-determinism at temperature 0, directly relevant to HIVE's methodology choices."
    462     },
    463     {
    464       "title": "AgentCoord: Visually Exploring Coordination Strategy for LLM-based Multi-Agent Collaboration",
    465       "authors": ["B. Pan", "J. Lu", "K. Wang"],
    466       "year": 2024,
    467       "relevance": "Visual interface for LLM multi-agent coordination strategies, complementary approach to HIVE."
    468     }
    469   ]
    470 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs