scan.json (30032B)
1 { 2 "paper": { 3 "title": "WALL-E: World Alignment by Rule Learning Improves World Model-based LLM Agents", 4 "authors": [ 5 "Siyu Zhou", 6 "Tianyi Zhou", 7 "Yijun Yang", 8 "Guodong Long", 9 "Deheng Ye", 10 "Jing Jiang", 11 "Chengqi Zhang" 12 ], 13 "year": 2024, 14 "venue": "arXiv preprint", 15 "arxiv_id": "2410.07484", 16 "doi": "10.48550/arXiv.2410.07484" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "WALL-E proposes a neurosymbolic world model combining LLMs with automatically learned rules for model-based agents using model-predictive control (MPC). On Minecraft TechTree tasks, WALL-E achieves 69% average success rate, surpassing baselines by 15-30% while using 8-20 fewer replanning rounds and 60-80% of the tokens. On ALFWorld, it reaches 95% success rate after only 6 iterations of rule learning. The rule learning process converges quickly (by iteration 4) and the learned rules are compact (6 rules for Minecraft, 11 for ALFWorld).", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper provides a GitHub link at the top: 'Project: https://github.com/elated-sawyer/WALL-E'." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper uses publicly available benchmarks: MineDojo TechTree tasks (Fan et al., 2022) and ALFWorld (Shridhar et al., 2020b), both standard public benchmarks." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper specifies the backend models (GPT-4o, GPT-3.5-Instruct) and benchmarks used, but provides no requirements.txt, Dockerfile, or detailed environment setup with library versions." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions are included in the paper. 
The method is described in detail, but there are no specific commands or scripts to replicate results." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "Tables 1-4 report only point estimates for success rate, replanning rounds, and token costs with no confidence intervals, error bars, or ± notation." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims WALL-E 'outperforms' and 'surpasses' baselines based solely on comparing raw numbers without any statistical significance tests." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "The paper reports effect sizes in context: '15-30% in success rate,' '8-20 fewer replanning rounds,' and '60-80% of tokens' compared to baselines, with per-task breakdowns in Tables 1-2 showing both WALL-E and baseline values." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "No justification for why 4 tasks per level were selected as test set, or why 30 training and 24 testing tasks are sufficient. No power analysis." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No variance, standard deviation, or any spread measure is reported across experimental runs. ALFWorld mentions 'averaged success rate over several trials' but does not report spread." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Extensive baselines are compared: DEPS, GITM, GPT-4V, Jarvis-1, Optimus-1, ReAct, Reflexion, AdaPlanner, RAFA, AutoGen, BUTLER, EMMA, and others across Tables 1 and 3." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Baselines include recent methods: Optimus-1 (2024), EMMA (2024), RAFA (2023), AdaPlanner (2024), Reflexion (2024). 
These represent state-of-the-art at time of writing." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Table 4 presents a systematic ablation study examining five configurations: LLM alone, LLM+WM, LLM+rules (no WM), LLM+WM+rules (in WM), and LLM+rules+WM+rules (in both). This isolates the contribution of rules and world model." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Three metrics are used: success rate, replanning rounds, and token cost (Tables 1 and 2). ALFWorld additionally uses per-task-type success rates." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "No human evaluation of the system's outputs or plans. All evaluation is automated through environment success/failure signals." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 4.1: 'We select four tasks from each level to serve as the testing set and the remaining tasks to construct the training set.' ALFWorld uses 'a set of 134 predefined testing tasks.'" 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Results are broken down by task difficulty level (Wood through Redstone) in Table 1, by action type (craft, mine, gather, fight) in Figure 5, and by ALFWorld task type (Pick, Clean, Heat, Cool, Examine, Picktwo) in Table 3." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": false, 111 "justification": "While Figure 1 illustrates a failure-recovery example and Appendix H mentions stochastic dynamics as a limitation, there is no systematic error analysis of where the system fails on test tasks or why certain task categories remain difficult." 
112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Table 4 shows that adding a world model without rules does not significantly improve performance (38% vs 37%). Section 4.4 explicitly notes: 'MPC using a world model without applying any rules cannot significantly improve WALL-E's performance.'" 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims of 15-30% improvement (supported by Table 1: 69% vs 37-54%), 8-20 fewer replanning rounds (Table 1), 60-80% of tokens (Table 2), and 95% ALFWorld success after 6 iterations (Figure 4, Table 3) are all supported by the experimental results." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "Causal claims about rule learning improving performance are supported by controlled ablation study in Table 4, which systematically adds/removes components (rules, world model) while holding other factors constant." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "Section 4.2 claims WALL-E is 'a general and environment-agnostic method' but tests only on two game environments (Minecraft and ALFWorld). Section 1 mentions applicability to 'medical care, education, autonomous driving' without any evidence. The title 'World Alignment' implies broad generality." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "No discussion of alternative explanations for the results. For example, the paper does not consider whether GPT-4o's strong prior knowledge of Minecraft mechanics (a heavily documented game) could explain the performance, or whether results would hold for environments less represented in LLM training data." 
139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper measures success rate, replanning rounds, and token cost, and claims these reflect planning ability and sample efficiency. The metrics directly correspond to the claims being made — no significant proxy gap exists." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "Appendix E.1 states 'We utilize GPT-4o as the backend' and E.2 states 'GPT-3.5-Instruct as our backbone model.' These are marketing names without snapshot dates or API versions." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "Full prompt text is provided in Appendix B.1 (learning new rules), B.2 (refining rules), and B.3 (translating rules to code), with complete formatting and instructions." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "The paper mentions the λ parameter for rule pruning is 'very small to prioritize coverage maximum' and describes the MPC framework, but does not report LLM-specific hyperparameters (temperature, top-p, max tokens) for GPT-4o or GPT-3.5-Instruct calls." 161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The MPC framework, rule learning pipeline (5 steps in Figure 3), agent-world model interaction loop (Figure 2), state/action spaces (Appendix C), and code-based rule execution are described in extensive detail." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 4.1 and Appendices E.1-E.2 describe the data pipeline: MineDojo's standard evaluation pipeline, task selection for train/test splits, ALFWorld's conversion of dialogue history to structured JSON format. State and action spaces are fully documented in Appendix C." 
171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Appendix H is titled 'Limitation and Future Work' and discusses two specific limitations: the simplicity of learned rules (transition-level only) and inability to handle stochastic dynamics." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Appendix H identifies study-specific threats: (1) rules are limited to transition-level assessment and cannot capture abstract planning rules; (2) stochastic environments where 'resource gathering at night in Minecraft often fails due to hostile creatures' are mishandled by the deterministic rule learning process." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "The paper does not explicitly state what the results do NOT show. There is no statement bounding results to deterministic game environments, or acknowledging that the two test environments both have discrete, well-defined action spaces. The generality claims in Section 4.2 go unbounded." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "No raw trajectory data, detailed per-task results, or experimental logs are made available. Only aggregated results in tables and figures." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 4.1 and Appendix E describe data collection: MineDojo TechTree tasks with specific task levels (Table 5), 30 training/24 testing tasks for Minecraft, 134 predefined testing tasks for ALFWorld, and the iterative rule learning process (3 tasks per iteration, 10 iterations)." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. 
Data sources are standard benchmarks (MineDojo, ALFWorld)." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The full pipeline is documented: environment interaction → trajectory collection → comparison with predictions (Eq. 4) → rule learning (Eqs. 5-7) → rule pruning (Eq. 8) → evaluation. Train/test split criteria are described." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding or acknowledgments section is present in the paper. Author affiliations include Tencent (a commercial entity) but no funding disclosure." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: University of Technology Sydney, University of Maryland College Park, and Tencent." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "Cannot be assessed as no funding is disclosed. Three authors are from Tencent, which has commercial interest in LLM agent capabilities, but no funding relationship is described." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "The paper uses GPT-4o and GPT-3.5-Instruct but does not state the training data cutoff date for either model." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of whether GPT-4o's training data includes descriptions of MineDojo tasks, Minecraft crafting recipes, or ALFWorld task solutions, despite the LLM's prior knowledge being central to the method." 
244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "MineDojo (2022) and ALFWorld (2020) were published before GPT-4o's training cutoff. The LLM's knowledge of Minecraft mechanics is explicitly leveraged (LLM_request functions), but the paper does not discuss whether this constitutes benchmark contamination." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": true, 292 "justification": "Table 2 reports inference token counts and API costs in USD for all methods across all task levels. For example, WALL-E averages 60,348 tokens ($0.41) per task." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "While per-task token costs are reported, the total computational budget for the full rule learning process (10 iterations × 3 tasks) is not stated. No GPU hours, total API spend, or training time is reported." 
298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "Appendix E.1 mentions 'randomized seeds for both the environment and its starting position' but does not report results across multiple seeds or analyze seed sensitivity." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "For ALFWorld, the paper mentions 'averaged success rate over several trials' without specifying the exact number. For Minecraft, the paper does not state how many runs produced the reported results." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No mention of hyperparameter search budget. The λ parameter is described as 'very small' and the MPC horizon H is not specified with search details." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "Table 4 presents all five configurations tested in the ablation study, showing the full landscape of results. The selected configuration (LLM agent + LLM+rules world model) is justified by the highest average success rate." 320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The paper re-implements DEPS and uses its own experimental setup for comparisons. Some baseline results are marked '*-reported in previous work' (different conditions). No acknowledgment of self-implementation bias." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": true, 334 "justification": "Table 2 directly compares token usage and API costs across all methods alongside success rates in Table 1, allowing performance-per-cost assessment." 
335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "No discussion of whether MineDojo TechTree tasks and ALFWorld actually measure the capabilities the paper claims to evaluate (planning, world modeling, generalization to complex environments)." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": false, 344 "justification": "WALL-E uses MPC with a world model while baselines use different scaffolding approaches (e.g., GITM uses buffered trajectories, ReAct uses reasoning+acting). Cross-method comparisons conflate the scaffold with the model. The ablation study partially addresses this within WALL-E's framework but not across baselines." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of temporal leakage. MineDojo (2022) and ALFWorld (2020) were published well before GPT-4o's training data collection. The LLM explicitly uses knowledge of Minecraft mechanics via LLM_request calls." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of feature leakage. The LLM_request function calls deliberately use GPT-4o's world knowledge to make judgments (e.g., whether a tool is appropriate for mining), which is a design choice but could leak evaluation-relevant information." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether training and testing tasks share structural similarities (e.g., same crafting tree logic, similar action sequences across difficulty levels)." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No leakage detection or prevention method is used or discussed." 
367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "WALL-E surpasses existing LLM/VLM agents by 15-30% in success rate on Minecraft tasks.", 373 "evidence": "Table 1 shows WALL-E achieves 69% average success rate vs. GITM (54%), Jarvis-1 (42%), DEPS (37%). The margin ranges from 15 percentage points (vs GITM) to 32 percentage points (vs DEPS).", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "WALL-E requires 8-20 fewer replanning rounds than baselines in Minecraft.", 378 "evidence": "Table 1: WALL-E averages 15.77 replanning rounds vs. GITM (25.49) and DEPS (35.36), a reduction of ~10 and ~20 rounds respectively.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "WALL-E uses only 60-80% of the tokens compared to baselines in Minecraft.", 383 "evidence": "Table 2: WALL-E averages 60,349 tokens vs. GITM (74,639) and DEPS (93,561), i.e., about 81% and 65% of their token usage respectively.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "WALL-E achieves 95% success rate on ALFWorld after only 6 iterations.", 388 "evidence": "Figure 4 shows the learning curve reaching 95% at iteration 6. Table 3 confirms 95% overall success rate matching RAFA.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "Rule learning converges by the 4th iteration with a compact rule set.", 393 "evidence": "Figure 5 shows cover rates plateauing by iteration 4. Figure 6 shows success rate hitting upper bound after 4 iterations. Final rule sets contain 6 rules for Minecraft and 11 for ALFWorld (Appendix D).", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "WALL-E is the only method that performs better than human players in the MineDojo benchmark.", 398 "evidence": "Table 1: WALL-E achieves 69% average success rate vs. 59% for human performance. 
However, humans outperform WALL-E on Iron (86% vs 63%) and perform comparably or better on Wood (100% vs 98%) and Stone (100% vs 91%).", 399 "supported": "weak" 400 }, 401 { 402 "claim": "Applying learned rules within the world model contributes ~30% improvement in success rate.", 403 "evidence": "Table 4: LLM+WM without rules achieves 38%, while LLM+(LLM+rules)WM achieves 69%, a 31-percentage-point improvement.", 404 "supported": "moderate" 405 } 406 ], 407 "red_flags": [ 408 { 409 "flag": "No error bars or uncertainty quantification", 410 "detail": "All results in Tables 1-4 are point estimates without confidence intervals, standard deviations, or any measure of variability. The number of runs is not specified for Minecraft. This makes it impossible to assess whether observed differences are statistically meaningful." 411 }, 412 { 413 "flag": "No statistical significance tests", 414 "detail": "Claims of 'outperformance' and 'superiority' are made by comparing raw numbers without any significance testing. Given the stochasticity of both environments (randomized seeds, spawns, mob encounters), the differences could partly be due to chance." 415 }, 416 { 417 "flag": "Mixed baseline conditions", 418 "detail": "Some baseline results in Tables 1 and 3 are marked '*-reported in previous work' while others appear to be re-run. Results from different papers may use different seeds, hardware, or model versions, making direct comparison unreliable." 419 }, 420 { 421 "flag": "Overclaimed generality", 422 "detail": "The paper claims WALL-E is 'general and environment-agnostic' based on two game-like environments (Minecraft and ALFWorld) that both have discrete, well-defined action spaces. Section 1 mentions applicability to 'medical care, education, autonomous driving' without any supporting evidence." 
423 }, 424 { 425 "flag": "Contamination concern deliberately unexplored", 426 "detail": "The method explicitly leverages GPT-4o's prior knowledge of Minecraft via LLM_request calls (e.g., asking whether a tool is appropriate for mining). This is a design feature, but the paper never discusses whether the LLM's extensive knowledge of these well-documented games inflates the results and would not transfer to novel environments." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 432 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik Narasimhan", "Yuan Cao"], 433 "year": 2023, 434 "relevance": "Foundational LLM agent framework combining reasoning and acting that WALL-E is compared against as a baseline." 435 }, 436 { 437 "title": "Reflexion: Language Agents with Verbal Reinforcement Learning", 438 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath", "Karthik Narasimhan", "Shunyu Yao"], 439 "year": 2024, 440 "relevance": "Key LLM agent baseline using verbal self-reflection for learning from mistakes, directly compared in both benchmarks." 441 }, 442 { 443 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation Framework", 444 "authors": ["Qingyun Wu", "Gagan Bansal", "Jieyu Zhang", "Yiran Wu", "Shaokun Zhang"], 445 "year": 2023, 446 "arxiv_id": "2308.08155", 447 "relevance": "Multi-agent conversation framework compared as a baseline on ALFWorld, relevant to agentic LLM workflows." 448 }, 449 { 450 "title": "Ghost in the Minecraft: Generally Capable Agents for Open-World Environments via Large Language Models with Text-based Knowledge and Memory", 451 "authors": ["Xizhou Zhu", "Yuntao Chen", "Hao Tian"], 452 "year": 2023, 453 "arxiv_id": "2305.17144", 454 "relevance": "Primary LLM agent baseline for Minecraft using buffered trajectories, directly compared against WALL-E's rule learning approach." 
455 }, 456 { 457 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 458 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 459 "year": 2022, 460 "relevance": "Foundational prompting technique for LLM reasoning that underlies much of the agent planning work in this space." 461 }, 462 { 463 "title": "EmbodiedGPT: Vision-Language Pre-Training via Embodied Chain of Thought", 464 "authors": ["Yao Mu", "Qinglong Zhang", "Mengkang Hu"], 465 "year": 2023, 466 "arxiv_id": "2305.15021", 467 "relevance": "Represents the approach of fine-tuning LLMs for embodied agent tasks, contrasted with WALL-E's gradient-free rule learning approach." 468 }, 469 { 470 "title": "GPT-4 Technical Report", 471 "authors": ["OpenAI"], 472 "year": 2023, 473 "relevance": "Foundation model whose capabilities are leveraged as the backbone for WALL-E's world model and agent." 474 }, 475 { 476 "title": "Rt-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control", 477 "authors": ["Brianna Zitkovich", "Tianhe Yu", "Sichun Xu"], 478 "year": 2023, 479 "relevance": "VLM-based robotic control that represents an alternative paradigm to rule-based world modeling for embodied agents." 480 }, 481 { 482 "title": "Reason for Future, Act for Now: A Principled Framework for Autonomous LLM Agents with Provable Sample Efficiency", 483 "authors": ["Zhihan Liu", "Hao Hu", "Shenao Zhang"], 484 "year": 2023, 485 "arxiv_id": "2309.17382", 486 "relevance": "RAFA framework that achieves competitive 95% success on ALFWorld, directly compared as a planning-based agent baseline." 487 }, 488 { 489 "title": "Optimus-1: Hybrid Multimodal Memory Empowered Agents Excel in Long-Horizon Tasks", 490 "authors": ["Zaijing Li", "Yuquan Xie", "Rui Shao"], 491 "year": 2024, 492 "arxiv_id": "2408.03615", 493 "relevance": "State-of-the-art VLM agent for Minecraft that WALL-E outperforms, demonstrating the advantage of rule-based world alignment." 
494 }, 495 { 496 "title": "Describe, Explain, Plan and Select: Interactive Planning with Large Language Models Enables Open-World Multi-Task Agents", 497 "authors": ["Zihao Wang", "Shaofei Cai", "Guanzhou Chen"], 498 "year": 2023, 499 "relevance": "DEPS method for LLM-based planning in open-world environments, used as a primary baseline in Minecraft experiments." 500 }, 501 { 502 "title": "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", 503 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"], 504 "year": 2024, 505 "relevance": "Key alignment technique that WALL-E's 'world alignment' concept parallels, extending alignment from human preferences to environment dynamics." 506 } 507 ], 508 "engagement_factors": { 509 "practical_relevance": { 510 "score": 2, 511 "justification": "The rule learning approach for aligning LLM world models is applicable to other agent scenarios, and code is released on GitHub." 512 }, 513 "surprise_contrarian": { 514 "score": 1, 515 "justification": "Shows that a few learned rules can bridge the gap between LLM prior knowledge and environment dynamics, somewhat surprising but builds on intuitive foundations." 516 }, 517 "fear_safety": { 518 "score": 0, 519 "justification": "No safety or security concerns raised; focuses on improving agent task completion in game environments." 520 }, 521 "drama_conflict": { 522 "score": 0, 523 "justification": "No controversy or provocative claims; straightforward benchmark improvement paper." 524 }, 525 "demo_ability": { 526 "score": 2, 527 "justification": "Code released on GitHub with Minecraft and ALFWorld implementations; users could try the approach in these environments." 528 }, 529 "brand_recognition": { 530 "score": 1, 531 "justification": "Authors from UTS, UMD, and Tencent — Tencent is recognizable but not a top-tier AI research brand for this audience." 532 } 533 } 534 }