scan.json (24221B)
1 { 2 "paper": { 3 "title": "TeamCraft: A Benchmark for Multi-Modal Multi-Agent Systems in Minecraft", 4 "authors": ["Qian Long", "Zhi Li", "Ran Gong", "Ying Nian Wu", "Demetri Terzopoulos", "Xiaofeng Gao"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2412.05255", 8 "doi": "10.48550/arXiv.2412.05255" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "GitHub repository URL provided: https://github.com/teamcraft-bench/teamcraft. Paper states 'we open source the entire platform, its training and evaluation code, and release the model checkpoints and training data.'" 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "Paper states dataset will be available on Huggingface (Appendix K) and training data and model checkpoints are released via GitHub." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Table 5 lists hyperparameters and model architecture details (CLIP ViT-L/14, Vicuna-v1.5), and training used 8 A100 GPUs. However, no requirements.txt or Dockerfile is mentioned explicitly in the paper text; the released code repository presumably contains one, but this was not verified. Borderline — the paper itself specifies key dependencies (CLIP, Vicuna, Mineflayer) but not a full environment spec." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "No step-by-step reproduction instructions in the paper itself. The paper refers to the GitHub repo but does not include a 'Reproducing Results' section with commands." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": false, 39 "justification": "All results in Tables 6-8 and Figure 5 report single point estimates with no confidence intervals or error bars." 
40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper makes comparative claims (e.g., centralized outperforms decentralized, grid-world outperforms VLA) but no statistical significance tests are reported." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "Results are reported as absolute success rates with baseline context (e.g., Table 6 shows 0.42 vs 0.00 task success rate for centralized vs decentralized building), allowing readers to assess magnitude of differences." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "Test set has 50 samples per task and 50 per generalization condition (950 total). No justification for why these sizes were chosen or whether they are sufficient for statistical power." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "No variance, standard deviation, or spread measures reported across experimental runs. Results appear to be single-run numbers." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "Paper compares TeamCraft-VLA (7B and 13B), GPT-4o, and a grid-world text-based LLM baseline across centralized and decentralized settings." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "GPT-4o (gpt-4o-2024-08-06) is a contemporary proprietary model. The fine-tuned baselines use Vicuna-v1.5 and CLIP ViT-L/14, which are reasonable for 2024." 72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": true, 76 "justification": "Paper performs 15 ablation studies varying dataset size (10%, 50%, 100%), control settings (centralized/decentralized), modalities (multi-modal vs grid-world), and model sizes (7B, 13B)." 
77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "Three metrics used: task success rate, subgoal success rate, and redundancy rate (Equations 2-4)." 82 }, 83 "human_evaluation": { 84 "applies": false, 85 "answer": false, 86 "justification": "This is a benchmark paper evaluating automated agent performance in a simulated environment. Human evaluation of agent outputs is not clearly relevant to the claims about agent generalization capabilities." 87 }, 88 "held_out_test_set": { 89 "applies": true, 90 "answer": true, 91 "justification": "Paper explicitly separates training, test, and generalization sets. The generalization set uses held-out elements (novel shapes, materials, scenes, 4 agents) excluded from training data (Section 3.7)." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Tables 6-8 provide detailed per-task (building, clearing, farming, smelting) and per-generalization-condition (test, shape, material, scene, agents) breakdowns." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 4.4 provides qualitative failure analysis with specific examples: object mismatching, task allocation failure, and object state recognition failure. Appendix G.4 provides additional detailed failure case studies with figures." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "Several negative findings reported: GPT-4o fails almost all tasks, farming crop generalization is 0% across all models, more training data can decrease performance in grid-world setting (overfitting), decentralized agents have high redundancy rates." 
107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Abstract claims that 'existing models continue to face significant challenges in generalizing to novel goals, scenes, and unseen numbers of agents' are well-supported by the experimental results showing low success rates, especially in generalization splits." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper's causal claims are primarily from ablation studies (e.g., removing visual modality in grid-world, varying dataset size, comparing centralized vs decentralized). These are controlled single-variable manipulations and the ablation design is adequate." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": true, 123 "justification": "Claims are generally bounded to the Minecraft TeamCraft environment. The paper frames findings as challenges within the benchmark rather than making broad claims about all multi-agent systems." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper discusses alternative explanations: grid-world outperformance suggests visual processing is the bottleneck (not planning); more data lowering performance in grid-world suggests overfitting; centralized vs decentralized gap is attributed to information availability." 129 }, 130 "proxy_outcome_distinction": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper measures task success rate, subgoal success rate, and redundancy rate directly — these are well-defined metrics that match the claims about agent performance. No proxy gap exists between measurement and claims." 134 } 135 }, 136 "setup_transparency": { 137 "model_versions_specified": { 138 "applies": true, 139 "answer": true, 140 "justification": "GPT-4o version specified as 'gpt-4o-2024-08-06' (Appendix H). 
VLA models use CLIP ViT-L/14 and Vicuna-v1.5 with specific sizes (7B, 13B)." 141 }, 142 "prompts_provided": { 143 "applies": true, 144 "answer": true, 145 "justification": "Full prompt text provided for GPT-4o in Appendix H (Figures 17, 19, 21) including system prompts and user prompts. Grid-world prompts in Appendix E (Figures 7-10)." 146 }, 147 "hyperparameters_reported": { 148 "applies": true, 149 "answer": true, 150 "justification": "Table 5 reports hyperparameters: lr=2e-5, model max length=4096, patch size=14, resolution=336x336, optimizer=AdamW, lr scheduler=constant_with_warmup, warmup ratio=0.03." 151 }, 152 "scaffolding_described": { 153 "applies": true, 154 "answer": true, 155 "justification": "The platform architecture is described in Figure 1 and Section 3.2. High-level skills are translated to low-level Mineflayer API calls. Table 4 documents all 8 skill types. Appendix A provides detailed skill descriptions." 156 }, 157 "data_preprocessing_documented": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 3.6 describes task generation via rejection sampling and planner-based demonstration generation with the cost function (Equation 1). Appendix D provides planner weight details per task." 161 } 162 }, 163 "limitations_and_scope": { 164 "limitations_section_present": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 5.1 'Limitations and Future Work' lists four specific limitations." 168 }, 169 "threats_to_validity_specific": { 170 "applies": true, 171 "answer": true, 172 "justification": "Limitations are specific to this work: reliance on Mineflayer oracle controller, training only on procedural demonstrations (not human), decentralized agents lack explicit communication, no human-AI collaboration tested." 
173 }, 174 "scope_boundaries_stated": { 175 "applies": true, 176 "answer": true, 177 "justification": "Limitations section explicitly states what was not tested: low-level control (uses oracle controller), human demonstrations, explicit communication between agents, human-AI collaboration." 178 } 179 }, 180 "data_integrity": { 181 "raw_data_available": { 182 "applies": true, 183 "answer": true, 184 "justification": "Dataset released on Huggingface with 55,000+ demonstrations. Model checkpoints also released." 185 }, 186 "data_collection_described": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 3.6 describes task generation (rejection sampling from diversity pools) and demonstration generation (planner-based with cost function optimization). Appendix D provides detailed planner parameters." 190 }, 191 "recruitment_methods_described": { 192 "applies": false, 193 "answer": false, 194 "justification": "No human participants. Data is procedurally generated in a simulated environment." 195 }, 196 "data_pipeline_documented": { 197 "applies": true, 198 "answer": true, 199 "justification": "Pipeline documented: variables sampled → rejection sampling for solvable tasks → planner assigns actions → demonstrations recorded with prompts, observations, and actions. Statistics in Table 2 and Appendix J." 200 } 201 }, 202 "conflicts_of_interest": { 203 "funding_disclosed": { 204 "applies": true, 205 "answer": false, 206 "justification": "No funding information or acknowledgments section found in the paper." 207 }, 208 "affiliations_disclosed": { 209 "applies": true, 210 "answer": true, 211 "justification": "Author affiliations listed: UCLA and Amazon AGI. The Amazon author includes a footnote: 'This work does not relate to the author's position at Amazon.'" 212 }, 213 "funder_independent_of_outcome": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding disclosed, so independence cannot be assessed. 
One author is from Amazon AGI, which has a financial interest in multi-agent AI systems, though the footnote disclaims connection." 217 }, 218 "financial_interests_declared": { 219 "applies": true, 220 "answer": false, 221 "justification": "No competing interests statement found in the paper." 222 } 223 }, 224 "contamination": { 225 "training_cutoff_stated": { 226 "applies": true, 227 "answer": false, 228 "justification": "GPT-4o is used for evaluation but its training data cutoff is not stated. The fine-tuned models use only the TeamCraft data, but GPT-4o's training data is relevant." 229 }, 230 "train_test_overlap_discussed": { 231 "applies": true, 232 "answer": false, 233 "justification": "For GPT-4o, no discussion of whether Minecraft-related task solutions might appear in training data. The custom benchmark mitigates this risk but it is not discussed." 234 }, 235 "benchmark_contamination_addressed": { 236 "applies": true, 237 "answer": false, 238 "justification": "TeamCraft is a new benchmark, which inherently reduces contamination risk for GPT-4o, but this advantage is not explicitly discussed or claimed." 239 } 240 }, 241 "human_studies": { 242 "pre_registered": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants in this study." 246 }, 247 "irb_or_ethics_approval": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants in this study." 251 }, 252 "demographics_reported": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "inclusion_exclusion_criteria": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "randomization_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 
266 }, 267 "blinding_described": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "attrition_reported": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 } 277 }, 278 "cost_and_practicality": { 279 "inference_cost_reported": { 280 "applies": true, 281 "answer": false, 282 "justification": "No inference costs reported for GPT-4o API calls or VLA model inference time." 283 }, 284 "compute_budget_stated": { 285 "applies": true, 286 "answer": true, 287 "justification": "Training compute stated: 8 A100 GPUs, 7B centralized takes 36 hours, 13B takes 72 hours, decentralized doubles training time, grid-world 7B takes 20 hours (Appendix F)." 288 } 289 }, 290 "experimental_rigor": { 291 "seed_sensitivity_reported": { 292 "applies": true, 293 "answer": false, 294 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single training runs." 295 }, 296 "number_of_runs_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "The number of experimental runs is not stated. It appears each model configuration was trained once for 3 epochs." 300 }, 301 "hyperparameter_search_budget": { 302 "applies": true, 303 "answer": false, 304 "justification": "No hyperparameter search budget reported. Hyperparameters in Table 5 appear chosen without documented search." 305 }, 306 "best_config_selection_justified": { 307 "applies": true, 308 "answer": false, 309 "justification": "Models trained for 3 epochs 'before convergence' but no validation-based selection criterion or early stopping details provided." 310 }, 311 "multiple_comparison_correction": { 312 "applies": false, 313 "answer": false, 314 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 
315 }, 316 "self_comparison_bias_addressed": { 317 "applies": true, 318 "answer": false, 319 "justification": "The authors evaluate their own TeamCraft-VLA model against GPT-4o and grid-world baselines without acknowledging self-comparison bias." 320 }, 321 "compute_budget_vs_performance": { 322 "applies": true, 323 "answer": false, 324 "justification": "GPT-4o uses one-shot prompting vs fine-tuned VLA models trained on 55K demonstrations. This massive compute/data asymmetry is not discussed as a confound when comparing performance." 325 }, 326 "benchmark_construct_validity": { 327 "applies": true, 328 "answer": false, 329 "justification": "The paper does not discuss whether the Minecraft-based tasks actually measure the collaborative multi-agent capabilities claimed. No analysis of construct validity or comparison with alternative benchmarks." 330 }, 331 "scaffold_confound_addressed": { 332 "applies": true, 333 "answer": false, 334 "justification": "TeamCraft-VLA uses a fine-tuned architecture while GPT-4o uses one-shot prompting with different prompt structures. This scaffolding difference is not addressed when comparing their performance." 335 } 336 }, 337 "data_leakage": { 338 "temporal_leakage_addressed": { 339 "applies": true, 340 "answer": false, 341 "justification": "No discussion of whether GPT-4o's training data could include Minecraft task solutions or related demonstrations." 342 }, 343 "feature_leakage_addressed": { 344 "applies": true, 345 "answer": false, 346 "justification": "No discussion of whether the evaluation setup leaks information (e.g., inventory contents revealing optimal strategies)." 347 }, 348 "non_independence_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "The procedurally generated tasks share structural templates. No discussion of whether train/test tasks are sufficiently independent despite using the same generation process." 
352 }, 353 "leakage_detection_method": { 354 "applies": true, 355 "answer": false, 356 "justification": "No concrete leakage detection or prevention method applied." 357 } 358 } 359 }, 360 "claims": [ 361 { 362 "claim": "Existing models face significant challenges in generalizing to novel goals, scenes, and unseen numbers of agents.", 363 "evidence": "Tables 6-8 and Figure 5 show performance drops across generalization splits, particularly for novel goals (farming crop generalization is 0% for all models) and 4-agent scenarios.", 364 "supported": "strong" 365 }, 366 { 367 "claim": "Centralized control significantly outperforms decentralized control across nearly all task variants.", 368 "evidence": "Figure 6 and Tables 6-7 show centralized models consistently achieving higher task success rates. Redundancy rates in Table 3 show 0.01 for centralized vs 0.15 for decentralized.", 369 "supported": "strong" 370 }, 371 { 372 "claim": "Grid-world (text-based) models significantly outperform VLA models in multi-modal settings.", 373 "evidence": "Table 8 shows grid-world 7B achieving 1.00 clearing success rate vs 0.64 for VLA-7B, and 0.78 farming test vs 0.36 for VLA-7B. Figure 5 confirms the pattern.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "GPT-4o fails on almost all test cases due to lack of 3D spatial reasoning.", 378 "evidence": "Figure 6 shows GPT-4o near-zero task success rates. Section 4.3 notes GPT-4o struggles with mapping block coordinates from visual inputs.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Scaling model size from 7B to 13B does not guarantee improved performance, especially with sufficient data.", 383 "evidence": "Figure 5 shows 7B model approaching 13B performance as data increases. 
Tables 6-7 confirm similar patterns.", 384 "supported": "moderate" 385 } 386 ], 387 "methodology_tags": ["benchmark-eval"], 388 "key_findings": "TeamCraft introduces a 55,000-variant multi-modal multi-agent benchmark in Minecraft with four task types. Experiments reveal that current VLA models struggle significantly with generalization to novel goals (0% on unseen crops) and unseen numbers of agents. Text-based grid-world representations dramatically outperform visual inputs, exposing a major gap in VLA models' visual understanding. Centralized control vastly outperforms decentralized settings, highlighting the challenge of implicit coordination without shared information.", 389 "red_flags": [ 390 { 391 "flag": "Unfair GPT-4o comparison", 392 "detail": "GPT-4o uses one-shot prompting while TeamCraft-VLA is fine-tuned on 55K demonstrations. This is not an apples-to-apples comparison, yet the paper presents them side by side in Figure 6 without adequately discussing this asymmetry." 393 }, 394 { 395 "flag": "No error bars or multiple runs", 396 "detail": "All results appear to be from single training runs with no variance reported. Given that VLA training can be sensitive to initialization, single-run results may not be reliable." 397 }, 398 { 399 "flag": "No statistical tests for comparative claims", 400 "detail": "Multiple comparative claims (centralized vs decentralized, 7B vs 13B, modality comparisons) are made without any statistical significance testing." 401 } 402 ], 403 "cited_papers": [ 404 { 405 "title": "VIMA: General Robot Manipulation with Multimodal Prompts", 406 "authors": ["Y. Jiang", "A. Gupta", "Z. Zhang", "G. Wang"], 407 "year": 2022, 408 "arxiv_id": "2210.03094", 409 "relevance": "Multi-modal prompt-based task specification for embodied agents, which TeamCraft extends to multi-agent settings." 410 }, 411 { 412 "title": "MindAgent: Emergent Gaming Interaction", 413 "authors": ["R. Gong", "Q. Huang", "X. 
Ma"], 414 "year": 2023, 415 "relevance": "Multi-agent collaboration benchmark in Minecraft using LLMs, a direct predecessor to TeamCraft." 416 }, 417 { 418 "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models", 419 "authors": ["G. Wang", "Y. Xie", "Y. Jiang"], 420 "year": 2023, 421 "relevance": "LLM-based agent for open-ended Minecraft tasks, relevant to agentic AI capabilities." 422 }, 423 { 424 "title": "MineDojo: Building Open-Ended Embodied Agents with Internet-Scale Knowledge", 425 "authors": ["L. Fan", "G. Wang", "Y. Jiang"], 426 "year": 2022, 427 "relevance": "Large-scale Minecraft benchmark for embodied agents with internet-scale knowledge." 428 }, 429 { 430 "title": "Visual Instruction Tuning", 431 "authors": ["H. Liu", "C. Li", "Q. Wu", "Y. J. Lee"], 432 "year": 2024, 433 "relevance": "Foundation for vision-language model architecture used in TeamCraft-VLA." 434 }, 435 { 436 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 437 "authors": ["J. S. Park", "J. O'Brien", "C. J. Cai"], 438 "year": 2023, 439 "relevance": "Multi-agent simulation with LLM-based agents, relevant to agentic AI collaboration." 440 }, 441 { 442 "title": "PARTNR: A Benchmark for Planning and Reasoning in Embodied Multi-Agent Tasks", 443 "authors": ["M. Chang", "G. Chhablani", "A. Clegg"], 444 "year": 2024, 445 "arxiv_id": "2411.00081", 446 "relevance": "Contemporary multi-agent embodied benchmark for planning and reasoning." 447 }, 448 { 449 "title": "RoCo: Dialectic Multi-Robot Collaboration with Large Language Models", 450 "authors": ["Z. Mandi", "S. Jain", "S. Song"], 451 "year": 2024, 452 "relevance": "LLM-based multi-robot collaboration benchmark relevant to multi-agent coordination." 453 }, 454 { 455 "title": "CAMEL: Communicative Agents for Mind Exploration of Large Language Model Society", 456 "authors": ["G. Li", "H. Hammoud", "H. 
Itani"], 457 "year": 2023, 458 "relevance": "Multi-agent LLM communication framework relevant to agentic AI collaboration." 459 }, 460 { 461 "title": "ProAgent: Building Proactive Cooperative Agents with Large Language Models", 462 "authors": ["C. Zhang", "K. Yang", "S. Hu"], 463 "year": 2024, 464 "relevance": "LLM-based proactive cooperative agents, relevant to multi-agent AI systems." 465 } 466 ] 467 }