scan.json (25475B)
1 { 2 "paper": { 3 "title": "Building Cooperative Embodied Agents Modularly with Large Language Models", 4 "authors": ["Hongxin Zhang", "Weihua Du", "Jiaming Shan", "Qinhong Zhou", "Yilun Du", "Joshua B. Tenenbaum", "Tianmin Shu", "Chuang Gan"], 5 "year": 2023, 6 "venue": "ICLR 2024", 7 "arxiv_id": "2307.02485", 8 "doi": "10.48550/arXiv.2307.02485" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "The paper provides a project website (https://vis-www.cs.umass.edu/Co-LLM-Agents/) mentioned in the abstract. This serves as a public resource for the project." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The environments used (TDW-MAT and C-WAH) are extensions of publicly available platforms (ThreeDWorld and VirtualHome-Social/Watch-And-Help), and the paper describes the test sets in detail (24 episodes for TDW-MAT, 10 for C-WAH). The environments are publicly available." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The paper mentions training a Mask-RCNN, using GPT-4 API, LLAMA-2-13b-chat, and LoRA fine-tuning, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "While the paper provides detailed descriptions of the framework, modules, and prompts in the appendix, there are no explicit step-by-step reproduction instructions (e.g., a README with commands to run)." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "Results in Tables 1 and 2 report only point estimates (e.g., Transport Rate of 0.69, Average Steps of 59) with no confidence intervals or error bars." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": true, 42 "justification": "The paper reports a t-test p-value (p=0.0003) for the trust score comparison between CoELA and CoELA w/o communication in the user study (Section 5.3.2). However, no significance tests are reported for the main AI-AI cooperation results." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports Efficiency Improvement percentages with baseline context throughout (e.g., '45% efficiency improvement' for MHP+CoELA vs MHP alone, trust score of '6.3 vs. 4.7'), providing magnitude context for the improvements." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The test set sizes (24 episodes for TDW-MAT, 10 for C-WAH) and user study size (8 subjects, 80 trials) are not justified. No power analysis is provided. The user study with 8 subjects is particularly small." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "Table 1 reports results 'over 5 runs for RHP and 1 run for CoELA due to cost constraints.' CoELA results are single-run with no variance reported. The baselines report averages over 5 runs but without standard deviation." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper compares against three baselines: RHP (Rule-based Hierarchical Planner), MHP (MCTS-based Hierarchical Planner), and MAT (Multi-Agent Transformer) across both environments." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The baselines are adopted from the original challenges (ThreeDWorld Transport Challenge and Watch-And-Help Challenge), representing the strongest known methods for these specific tasks. MAT (2022) is contemporary." 
70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "Figure 4c shows ablation results: removing communication, removing memory module, replacing GPT-4 with GPT-3.5, and removing the execution module. These test the contribution of individual components." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper uses Transport Rate and Efficiency Improvement for TDW-MAT, Average Steps and Efficiency Improvement for C-WAH, and three subjective rating criteria (effectiveness, helpfulness, trust) for the user study." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 5.3.2 describes a user study with 8 human subjects who cooperated with CoELA and rated the agents on 7-point Likert scales for communication effectiveness, helpfulness, and trust." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "The paper uses separate training and test sets: '2 floorplans are for the training set and another two are for the test set' (Appendix B.1). CoLLAMA is trained on training set data and evaluated on the test set." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Table 1 provides per-category breakdown for TDW-MAT (Food vs. Stuff subtasks), and results are reported separately for each environment. The user study reports three separate rating dimensions." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 5.5 'Failure Cases and Limitations of LLM' discusses three types of failures: limited 3D spatial reasoning, lack of low-level action reasoning, and unstable complex reasoning, with specific examples in Figure 5." 
100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "Several negative results are reported: LLAMA-2 'still underperform' significantly (0.15 TR vs 0.85 for GPT-4 with oracle perception); communication did not significantly improve AI-AI cooperation ('We did not observe a significant performance drop when disabling communication among AI agents'); removing execution module caused all trials to fail." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims CoELA 'can surpass strong planning-based methods' (supported by Tables 1-2), 'exhibit emergent effective communication' (supported by qualitative analysis in Figure 3), and CoLLAMA achieves 'promising performance' (supported by Table 1). The trust claim from user study is supported by Figure 4b." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper makes causal claims through ablation studies (removing memory module doubles steps, removing communication has minimal effect on AI-AI cooperation). These are controlled single-variable manipulations within the framework, which is adequate for the claims made." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The title 'Building Cooperative Embodied Agents Modularly with Large Language Models' is broad, but results are on two specific simulated environments (TDW-MAT and C-WAH) with household rearrangement tasks only. The abstract claims about 'multi-agent cooperation' are not bounded to these specific domains." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper does not discuss alternative explanations for the improvements. 
For example, the performance gains could partly come from the handcrafted prompts, the action list design, or the specific task structure rather than LLM reasoning. No threats-to-validity section is present." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper says 'GPT-4 from the OpenAI API' with a footnote 'Our main experiments are done between 2023.9.1-2023.9.28 and 2023.5.1-2023.5.16' but does not specify the exact model version/snapshot (e.g., gpt-4-0613). It also uses 'GPT-3.5' without a version string, and 'LLAMA-2-13b-chat' which is more specific." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "Full example prompts are provided in Appendix E (Tables 4 and 6) for both C-WAH and TDW-MAT environments, including the instruction head, goal description, state description, action/dialogue history, and action list with actual fill values." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 5.2 states 'temperature 0.7, top-p 1, and max tokens 256.' Appendix C.3 reports CoLLAMA training hyperparameters: 'batch size of 384, maximal sequence length of 2048, and a max learning rate of 4e-4 for 30 epochs.' MAT training details are in Appendix C.1: 'hidden layer dim 64, learning rate 7e-4, ppo epoch 10.'" 144 }, 145 "scaffolding_described": { 146 "applies": true, 147 "answer": true, 148 "justification": "The entire paper describes the agentic scaffolding in detail: the five-module framework (Perception, Memory, Communication, Planning, Execution) is thoroughly described in Sections 4.1-4.6 with workflow diagrams (Figure 2), a detailed working example (Figure 7), and prompt design." 
149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "The perception pipeline (Mask-RCNN training, 3D point cloud construction, semantic map building) is described in detail in Section 4.2 and Appendix A.1. For CoLLAMA, Appendix C.3 documents: '2k trajectories from 10 episodes' filtered to '572 high-quality data with effective communication behavior.'" 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 5.5 'Failure Cases and Limitations of LLM' provides a dedicated discussion of three specific limitations: limited 3D spatial information usage, lack of low-level action reasoning, and unstable complex reasoning." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 5.5 discusses specific threats: the inability to incorporate spatial information causing time-consuming plans, the abstraction of high-level plans making the system unaware of low-level execution states (illustrated in Figure 5a), and LLM counting errors (Figure 5b)." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound its claims to simulated environments, the specific task types tested, or acknowledge that the results may not transfer to real-world physical embodied cooperation." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": false, 177 "justification": "No raw experimental data (trajectories, logs, LLM outputs) is made available. Only aggregated results are presented in tables." 
178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "The test set construction is described: '6 scenes from the TDW-House dataset and sampled 2 out of the two types of tasks in each of the scenes, making a test set of 24 episodes' (Section 5.1). C-WAH: '2 tasks from each of the five types of activities to construct a test set of 10 episodes.' User study: 8 subjects, 80 trials." 183 }, 184 "recruitment_methods_described": { 185 "applies": true, 186 "answer": false, 187 "justification": "For the user study with 8 human subjects, the paper states 'We recruited 8 human subjects' but does not describe recruitment methods, channels, or whether recruitment could introduce bias." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The pipeline from observation to action is well documented through the modular framework description. For CoLLAMA: '2k trajectories → manually filtered 572 high-quality data' (Appendix C.3). The Mask-RCNN training pipeline is documented: '53K 512×512 RGB images' collected by random sampling." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "The acknowledgement section thanks individuals for discussions and reviewers for suggestions, but no funding sources are disclosed." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly listed: UMass Amherst, Tsinghua University, Shanghai Jiao Tong University, MIT, and MIT-IBM Watson AI Lab." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding information is disclosed. Authors are affiliated with MIT-IBM Watson AI Lab, and the paper uses OpenAI's GPT-4 API, but no funding disclosure or independence statement is made." 
210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper uses GPT-4 and GPT-3.5 but does not state their training data cutoff dates. This is relevant because the benchmark tasks could potentially be similar to patterns in training data." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "No discussion of whether GPT-4's training data could include information about the ThreeDWorld or VirtualHome environments or similar household task-planning scenarios." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "The environments are custom extensions, which reduces contamination risk, but this is not explicitly discussed. The paper does not address whether the LLMs' commonsense knowledge about household tasks constitutes a form of contamination advantage." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": true, 237 "answer": false, 238 "justification": "No pre-registration is mentioned for the user study with 8 human subjects." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": true, 242 "answer": false, 243 "justification": "No IRB or ethics board approval is mentioned for the user study involving 8 human participants." 244 }, 245 "demographics_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "No demographics are reported for the 8 human subjects. Experience level, background, age, gender, etc. are not described." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": true, 252 "answer": false, 253 "justification": "No inclusion or exclusion criteria are stated for participant recruitment. 
The paper only says 'We recruited 8 human subjects.'" 254 }, 255 "randomization_described": { 256 "applies": true, 257 "answer": false, 258 "justification": "The paper describes that 'We made sure each subject do 10 trials with at least two trials under each scenario' but does not describe how trials were assigned or randomized across conditions." 259 }, 260 "blinding_described": { 261 "applies": true, 262 "answer": false, 263 "justification": "No blinding is described. Participants likely knew which agent they were cooperating with since the communication style would differ (natural language vs. template language), but this is not discussed." 264 }, 265 "attrition_reported": { 266 "applies": true, 267 "answer": false, 268 "justification": "No information on whether any participants dropped out or failed to complete all trials. The paper states 80 trials total from 8 subjects but does not report attrition." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "The paper acknowledges cost constraints ('1 run for CoELA due to cost constraints') but does not report actual API costs, tokens consumed, or inference time per episode." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "No total computational budget is stated. Training details for Mask-RCNN, MAT, and CoLLAMA are given, but GPU hours, total API spend, or hardware specifications are not reported." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "CoELA driven by GPT-4 surpasses strong planning-based methods, achieving 36% efficiency improvement over RHP+RHP (29%) on TDW-MAT and 45% over MHP+MHP (33%) on C-WAH.", 287 "evidence": "Tables 1 and 2 show Transport Rate and Average Steps results across both environments. RHP+CoELA achieves 0.69 TR (36% EI) vs RHP+RHP 0.61 (29%) on TDW-MAT. 
MHP+CoELA achieves 59 steps (45% EI) vs MHP+MHP 75 steps (33%) on C-WAH.", 288 "supported": "moderate" 289 }, 290 { 291 "claim": "CoLLAMA (fine-tuned LLAMA-2) achieves competitive performance with GPT-4, reaching 0.70 TR on TDW-MAT compared to GPT-4's 0.71.", 292 "evidence": "Table 1 shows CoLLAMA achieving 0.70 total TR vs GPT-4's 0.71, and even surpassing GPT-4 on the Stuff subtask (0.66 vs 0.61). Fine-tuned on only 572 examples (Appendix C.3).", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "Humans trust CoELA communicating in natural language more than agents without communication (trust score 6.3 vs 4.7, p=0.0003).", 297 "evidence": "Section 5.3.2 and Figure 4b report user study results from 8 subjects across 80 trials, with a t-test yielding p=0.0003 for the trust score difference.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "Communication among AI agents does not significantly improve cooperation efficiency.", 302 "evidence": "Section 5.4 states 'We did not observe a significant performance drop when disabling communication among AI agents' and Figure 4c shows the ablation results.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "The Memory Module is essential, with removal nearly doubling the steps needed to finish tasks.", 307 "evidence": "Section 5.4 and Figure 4c show the ablation result: 'the steps needed to finish the task for the agent with no Memory Module nearly double.'", 308 "supported": "moderate" 309 } 310 ], 311 "methodology_tags": ["benchmark-eval", "case-study"], 312 "key_findings": "CoELA, a modular framework using LLMs for cooperative embodied agents, outperforms strong planning-based baselines (RHP, MHP) on two multi-agent household task environments, reaching 36% efficiency improvement on TDW-MAT (vs. 29% for RHP+RHP) and 45% on C-WAH (vs. 33% for MHP+MHP). Communication between AI agents does not significantly help, though human-agent communication improves both task performance and human trust. 
Fine-tuned CoLLAMA (LLAMA-2 with LoRA on 572 examples) achieves performance competitive with GPT-4. Ablations show the Memory Module is critical (removing it nearly doubles the steps needed to finish tasks), and removing the Execution Module causes all trials to fail.", 313 "red_flags": [ 314 { 315 "flag": "Single-run results for main system", 316 "detail": "CoELA (GPT-4) results are from a single run 'due to cost constraints,' while baselines are averaged over 5 runs. This makes it impossible to assess the variance of the main results and raises concerns about result reliability." 317 }, 318 { 319 "flag": "Very small user study", 320 "detail": "The user study has only 8 participants with no reported demographics, IRB approval, randomization procedure, or blinding. With 4 conditions, each subject did only 2-3 trials per condition, making per-condition sample sizes extremely small." 321 }, 322 { 323 "flag": "No variance or uncertainty quantification on main results", 324 "detail": "Despite the stochastic nature of both LLM outputs and environment dynamics, no standard deviations, confidence intervals, or error bars are reported for the primary CoELA results." 325 }, 326 { 327 "flag": "Communication ineffective for AI-AI but highlighted as contribution", 328 "detail": "The paper frames natural language communication as a key contribution, but the ablation shows it does not significantly help AI-AI cooperation. The benefit appears mainly for human-AI interaction, which is tested with only 8 subjects." 329 }, 330 { 331 "flag": "Cost transparency missing despite cost-constrained evaluation", 332 "detail": "The paper acknowledges that cost constraints limited them to single runs, but never reports the actual API costs or computational budget, making it impossible to assess practicality." 
333 } 334 ], 335 "cited_papers": [ 336 { 337 "title": "GPT-4 Technical Report", 338 "authors": ["OpenAI"], 339 "year": 2023, 340 "relevance": "Core LLM evaluated in this paper; relevant to understanding GPT-4 capabilities for agentic tasks." 341 }, 342 { 343 "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models", 344 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 345 "year": 2023, 346 "arxiv_id": "2307.09288", 347 "relevance": "Open-source LLM used as alternative to GPT-4 and fine-tuned into CoLLAMA for embodied cooperation." 348 }, 349 { 350 "title": "Generative Agents: Interactive Simulacra of Human Behavior", 351 "authors": ["Joon Sung Park", "Joseph C. O'Brien", "Carrie J. Cai"], 352 "year": 2023, 353 "arxiv_id": "2304.03442", 354 "relevance": "LLM-based agent society with memory augmentation; directly related to multi-agent LLM cooperation." 355 }, 356 { 357 "title": "Voyager: An Open-Ended Embodied Agent with Large Language Models", 358 "authors": ["Guanzhi Wang", "Yuqi Xie", "Yunfan Jiang"], 359 "year": 2023, 360 "arxiv_id": "2305.16291", 361 "relevance": "LLM-driven embodied agent with skill library; related approach to using LLMs for embodied planning." 362 }, 363 { 364 "title": "Language Models as Zero-Shot Planners: Extracting Actionable Knowledge for Embodied Agents", 365 "authors": ["Wenlong Huang", "Pieter Abbeel", "Deepak Pathak", "Igor Mordatch"], 366 "year": 2022, 367 "relevance": "Foundational work on using LLMs for embodied planning without fine-tuning." 368 }, 369 { 370 "title": "Code as Policies: Language Model Programs for Embodied Control", 371 "authors": ["Jacky Liang", "Wenlong Huang", "Fei Xia"], 372 "year": 2022, 373 "arxiv_id": "2209.07753", 374 "relevance": "Uses code generation from LLMs for embodied control, an alternative approach to the prompting-based planning in this paper." 
375 }, 376 { 377 "title": "Improving Factuality and Reasoning in Language Models through Multiagent Debate", 378 "authors": ["Yilun Du", "Shuang Li", "Antonio Torralba"], 379 "year": 2023, 380 "arxiv_id": "2305.14325", 381 "relevance": "Multi-LLM debate approach to improve reasoning; related multi-agent LLM cooperation paradigm." 382 }, 383 { 384 "title": "CAMEL: Communicative Agents for 'Mind' Exploration of Large Scale Language Model Society", 385 "authors": ["Guohao Li", "Hasan Abed Al Kader Hammoud"], 386 "year": 2023, 387 "arxiv_id": "2303.17760", 388 "relevance": "Multi-agent LLM communication framework for collaborative task solving." 389 }, 390 { 391 "title": "Sparks of Artificial General Intelligence: Early Experiments with GPT-4", 392 "authors": ["Sébastien Bubeck", "Varun Chandrasekaran", "Ronen Eldan"], 393 "year": 2023, 394 "relevance": "Evaluates GPT-4 capabilities including reasoning and Theory of Mind, which this paper relies on for cooperative planning." 395 }, 396 { 397 "title": "A Survey on Large Language Model Based Autonomous Agents", 398 "authors": ["Lei Wang", "Chen Ma", "Xueyang Feng"], 399 "year": 2023, 400 "arxiv_id": "2308.11432", 401 "relevance": "Comprehensive survey of LLM-based agents, covering planning, memory, and tool use capabilities." 402 }, 403 { 404 "title": "Cognitive Architectures for Language Agents", 405 "authors": ["Theodore Sumers", "Shunyu Yao", "Karthik Narasimhan", "Thomas L. Griffiths"], 406 "year": 2023, 407 "arxiv_id": "2309.02427", 408 "relevance": "Theoretical framework for language agent architectures, directly relevant to the cognitive-inspired modular design in this paper." 409 }, 410 { 411 "title": "The Rise and Potential of Large Language Model Based Agents: A Survey", 412 "authors": ["Zhiheng Xi", "Wenxiang Chen", "Xin Guo"], 413 "year": 2023, 414 "arxiv_id": "2309.07864", 415 "relevance": "Survey of LLM-based agents covering capabilities and limitations relevant to the survey scope." 416 } 417 ] 418 }