scan.json (25772B)
1 { 2 "paper": { 3 "title": "PLAN-AND-ACT: Improving Planning of Agents for Long-Horizon Tasks", 4 "authors": ["Lutfi Eren Erdogan", "Nicholas Lee", "Sehoon Kim", "Suhong Moon", "Hiroki Furuta", "Gopala Anumanchipalli", "Kurt Keutzer", "Amir Gholami"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2503.09572" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "PLAN-AND-ACT separates high-level planning from low-level execution for LLM-based agents, achieving 57.58% on WebArena-Lite and 81.36% text-only on WebVoyager. A synthetic data generation pipeline produces 15,000 training examples in under an hour, with dynamic replanning providing the largest single improvement (+10.31%). Even an untrained executor benefits substantially from a well-trained planner (+34.39% over baseline).", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No repository URL or code release link is provided in the paper." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "The synthetic training data (15,000 plans, trajectories) is not released. The benchmarks used (WebArena-Lite, WebVoyager) are public, but the paper's generated training data is not made available." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "Table 5/Figure 5 lists training hyperparameters and mentions '8×A100' and 'torchtune' framework, but no requirements.txt, Dockerfile, or detailed dependency listing is provided." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions or README are provided." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results in Tables 1-4 are reported as point estimates (e.g., 57.58%, 81.36%) with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims SOTA and outperformance over baselines based solely on comparing numbers (e.g., 57.58% vs 49.1%) without any statistical significance tests." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports improvements with baseline context throughout, e.g., '10.31% over the static PLANNER', 'from 9.85% to 29.63%', 'from 45% to 57%' style comparisons in Section 5.2-5.3 and Table 1." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "WebArena-Lite has 165 test cases and WebVoyager's size is not stated. No justification for why these sample sizes are adequate for the claims made." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No standard deviations, variance, or multi-run statistics are reported. Results appear to be single-run numbers." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Table 1 includes multiple baselines: zero-shot LLaMA-3.3-70B, finetuned LLaMA, WebRL-3.1-70B, GPT-4-Turbo, GPT-4o, AWM, WebPilot. Tables 3-4 include additional baselines." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines include contemporary methods like WebRL (2024), AgentOccam (2024), WebPilot (2024), and Agent-E (2024)." 
73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Table 1 is essentially an extensive ablation showing incremental contributions: base planner, finetuning, synthetic trajectories, plan expansion, targeted augmentation, dynamic replanning, and CoT. Each row adds one component." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": false, 82 "justification": "Only a single metric is used: binary success rate. No other metrics (partial completion, efficiency, step count as metric) are used for evaluation." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation of plan quality or agent outputs is conducted. The WebArena-Lite benchmark is described as 'human-verified' but that refers to the benchmark construction, not evaluation of the system's outputs." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "WebArena-Lite provides separate training data (1,113 examples) and 165 test cases. The paper trains on the training split and evaluates on the test set." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Figure 4 and the appendix table (A.11) provide per-website breakdowns across GitLab, Reddit, Shopping Admin, Shopping, Map, and Multiple Websites." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 5.3 discusses failure patterns (dynamic content analysis, query refinement failures). Appendix A.2 provides detailed examples of replanning after failures. Section 4.3 discusses targeted augmentation based on failure analysis." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Table 1 shows that a base (unfinetuned) planner actually hurts finetuned executors (row 7 vs row 1). 
The paper honestly reports that naively finetuning the planner did not improve performance for the finetuned executor (row 8)." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims 57.58% on WebArena-Lite and 81.36% text-only on WebVoyager, both supported by Tables 1 and 4 respectively." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper makes causal claims via ablation ('adding dynamic replanning improves by 10.31%'). The ablation design in Table 1 uses controlled single-variable manipulation, adding one component at a time while holding others constant." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims 'Long-Horizon Tasks' generally but evaluation is limited to web navigation (WebArena-Lite, WebArena, WebVoyager). The conclusion mentions 'broader applications in various digital environments' without bounding. The paper acknowledges this somewhat in the Limitations section but the title and framing overgeneralize." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper does not consider alternative explanations for the improvements. For example, the synthetic data pipeline uses GPT-4o as a teacher — improvements could partly reflect knowledge distillation from a stronger model rather than the planning architecture itself. This is not discussed." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper uses binary task success rate and does not overframe it — it claims success on specific benchmarks rather than broader 'intelligence' or 'planning capability' in general. The measurements match the claims' granularity." 
135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Specific model versions are stated: LLaMA-3.3-70B-Instruct, LLaMA-3.1-8B-Instruct, QWQ-32B, GPT-4o (as teacher), WebRL-Llama-3.1-70B, ORM-Llama-3.1-8B, DeepSeek-R1-Distill-Llama-70B. These include size and variant specifications." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full prompt texts are provided in Appendices A.3-A.10, covering the Planner system prompt, Executor prompt, Plan Data Annotator, Synthetic Plan Generator, Replanner, and failure classification prompts." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Figure 5 (Table 5) reports training hyperparameters (learning rate 2e-5, AdamW, cosine scheduler, batch size 32, 1 epoch) and inference hyperparameters (temperature 0, max tokens 4196, max sequence length 32000)." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The two-component architecture (Planner + Executor) is described in detail in Section 3, including dynamic replanning (Section 3.3), information flow between components, and garbage collection by the Executor (Section 3.2)." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "The synthetic data generation pipeline is documented in detail in Section 4, including trajectory generation with ORM filtering (Section 4.1), grounded plan generation (Section 4.2), and plan expansion with counts (10,000 + 5,000 synthetic plans). Section 4.1 describes the filtering process using an ORM." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "There is a dedicated 'Limitations' section after the Conclusion discussing dependency on a baseline model for trajectory generation and the inefficiency of replanning after every action." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The Limitations section raises specific concerns: dependency on a base model that can successfully complete web tasks (limiting applicability to new domains), and computational inefficiency of replanning after every action step." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what the results do NOT show. While the Limitations section mentions some constraints, there is no explicit bounding of scope — e.g., no statement that results are limited to web navigation and may not transfer to other long-horizon task domains." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw trajectory data, synthetic training data, or per-example results are made available." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 4 describes the synthetic data collection in detail: Alpaca-style query generation from seed data, teacher LLM trajectory collection, ORM filtering, grounded plan annotation, and plan expansion. Specific counts are provided (923 trajectories, 1,113 WebArena-Lite examples, 10,000 + 5,000 synthetic plans)." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data comes from standard benchmarks and synthetic generation." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Figure 3 and Sections 4.1-4.3 document the full pipeline from seed queries through trajectory generation, ORM filtering, plan annotation, and plan expansion, with counts at each stage." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "The Acknowledgments section lists support from Apple, Nvidia, Microsoft Accelerating Foundation Model Research, Google Cloud/TRC, Intel, BDD, BAIR, Furiosa, KFAS, and JSPS KAKENHI." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed: UC Berkeley, University of Tokyo, ICSI. No authors are affiliated with companies whose products are being evaluated." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": true, 217 "justification": "The funders (Intel, Apple, Nvidia, Microsoft, Google) are hardware/platform providers, not producers of the specific models or benchmarks being evaluated. The paper evaluates open-source LLaMA models on academic benchmarks." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is provided." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper does not state the training data cutoff for LLaMA-3.3-70B-Instruct or GPT-4o used as the teacher model." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether WebArena-Lite test cases or their solutions appeared in LLaMA or GPT-4o training data." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "WebArena was published in 2023. 
The models used (LLaMA-3.3, GPT-4o) were trained after this. No contamination analysis is provided." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference cost, latency, or per-example API costs reported. The system calls the planner after every executor action, which could be expensive, but costs are not quantified." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "Training hardware is mentioned (8×A100) but total GPU hours, training time, and API costs for generating 15,000 synthetic examples with GPT-4o are not quantified. The paper mentions 'under an hour' for plan expansion but not the full compute budget." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No multi-seed results are reported. All results appear to be single-run." 
296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is never stated. Results are presented without indicating how many runs produced them." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search budget is reported. The paper lists chosen hyperparameters (Figure 5) but does not describe how they were selected or how many configurations were tried." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "The paper mentions using a 'held-out validation set' for failure analysis (Section 4.3) but does not describe how the best model configuration was selected or whether final results were on a separate test set from the validation set used for targeted augmentation." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical tests are performed, so multiple comparison correction is not applicable." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors compare their system against baselines without acknowledging potential bias from implementing/running those baselines themselves. Some baseline numbers are cited from papers (marked with *) but others may be their own runs." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "No comparison at matched compute budgets. PLAN-AND-ACT uses GPT-4o for synthetic data generation and requires finetuning a 70B model, while some baselines are zero-shot. This compute difference is not discussed." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "The paper does not discuss whether WebArena-Lite's 165 tasks adequately represent 'long-horizon planning' capability. 
No discussion of construct validity or limitations of the benchmark as a measure of planning ability." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": false, 335 "justification": "Different baselines use different scaffolds (WebPilot has 6 agents, AgentOccam uses tree planning, WebRL uses RL), yet comparisons attribute differences to the method rather than the scaffold. The paper does not control for scaffolding differences." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "WebArena was published in 2023 and models trained after could have seen solutions. This is not discussed." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup leaks information not available in real usage scenarios." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The synthetic training data is generated from WebArena-Lite training examples. No discussion of whether the generated examples overlap with or are structurally similar to test cases." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention methods are applied." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "PLAN-AND-ACT achieves 57.58% success rate on WebArena-Lite, a new state-of-the-art.", 364 "evidence": "Table 1, last row. Previous SOTA was WebRL-3.1-70B at 49.1%.", 365 "supported": "moderate" 366 }, 367 { 368 "claim": "PLAN-AND-ACT achieves 81.36% text-only SOTA on WebVoyager with QWQ-32B.", 369 "evidence": "Table 4. 
Previous text-only results were WebVoyager with GPT-4-Turbo at 44.3% (text) and Agent-E at 73.1%.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "Dynamic replanning improves performance by 10.31% over static planning.", 374 "evidence": "Table 1, row 12 vs row 11: 53.94% vs 43.63% with full executor.", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "A well-trained planner improves even an untrained executor by 34.39% (from 9.85% to 44.24%).", 379 "evidence": "Table 1, comparing row 1 col 1 (9.85%) to row 12 col 1 (44.24%).", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "15,000 synthetic training examples can be generated in under an hour using GPT-4o.", 384 "evidence": "Section 4.3 and Section 6 state this claim. No independent verification provided.", 385 "supported": "weak" 386 }, 387 { 388 "claim": "CoT reasoning adds 3.64% improvement to the framework.", 389 "evidence": "Table 1, row 13 vs row 12: 57.58% vs 53.94%. Table 2 shows 8B model with CoT matches 70B without.", 390 "supported": "moderate" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "No variance or multi-run reporting", 396 "detail": "All results are single point estimates on benchmarks with 165 (WebArena-Lite) test cases. With only 165 binary outcomes, success rate has substantial sampling variance (standard error of ~3.9 percentage points, i.e. roughly ±7.5 points for a 95% CI), making the claimed improvements (e.g., 3.64% for CoT) potentially within noise." 397 }, 398 { 399 "flag": "Teacher model knowledge distillation conflated with architecture", 400 "detail": "GPT-4o is used as the teacher for synthetic data generation. Performance improvements may partly reflect distillation of GPT-4o's knowledge rather than the planning architecture itself. This is not disentangled." 401 }, 402 { 403 "flag": "No code or data release", 404 "detail": "Despite being a systems paper with a synthetic data pipeline, neither the code, the 15,000 synthetic training examples, nor the trained model weights are released." 
405 }, 406 { 407 "flag": "Scaffold confound in cross-method comparisons", 408 "detail": "Baselines use very different architectures (WebRL uses RL, WebPilot uses 6 agents, AgentOccam uses tree planning), making it unclear whether improvements come from the planning architecture or other factors." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "WebRL: Training LLM Web Agents via Self-Evolving Online Curriculum Reinforcement Learning", 414 "authors": ["Qi, Z.", "Liu, X."], 415 "year": 2024, 416 "arxiv_id": "2411.02337", 417 "relevance": "Main baseline and SOTA on WebArena-Lite; RL-based approach to training web agents." 418 }, 419 { 420 "title": "AgentOccam: A Simple Yet Strong Baseline for LLM-Based Web Agents", 421 "authors": ["Yang, K.", "Liu, Y."], 422 "year": 2024, 423 "arxiv_id": "2410.13825", 424 "relevance": "Hierarchical planning baseline using GPT-4-Turbo for web navigation." 425 }, 426 { 427 "title": "WebPilot: A Versatile and Autonomous Multi-Agent System for Web Task Execution with Strategic Exploration", 428 "authors": ["Zhang, Y.", "Ma, Z."], 429 "year": 2024, 430 "arxiv_id": "2408.15978", 431 "relevance": "Multi-agent web navigation system with 6 agents; comparative baseline." 432 }, 433 { 434 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 435 "authors": ["Yao, S.", "Zhao, J."], 436 "year": 2022, 437 "arxiv_id": "2210.03629", 438 "relevance": "Foundational agent framework combining reasoning and action; baseline prompting approach." 439 }, 440 { 441 "title": "An LLM Compiler for Parallel Function Calling", 442 "authors": ["Kim, S.", "Moon, S."], 443 "year": 2023, 444 "arxiv_id": "2312.04511", 445 "relevance": "LLMCompiler architecture that inspired the planner-executor separation in PLAN-AND-ACT." 
446 }, 447 { 448 "title": "Self-Instruct: Aligning Language Models with Self-Generated Instructions", 449 "authors": ["Wang, Y.", "Kordi, Y."], 450 "year": 2022, 451 "arxiv_id": "2212.10560", 452 "relevance": "Pioneering synthetic data generation method used as basis for the data pipeline." 453 }, 454 { 455 "title": "DigiRL: Training In-the-Wild Device-Control Agents with Autonomous Reinforcement Learning", 456 "authors": ["Bai, H.", "Zhou, Y."], 457 "year": 2024, 458 "arxiv_id": "2406.11896", 459 "relevance": "RL-based approach for training device-control agents with autonomous data collection." 460 }, 461 { 462 "title": "NNetscape Navigator: Complex Demonstrations for Web Agents without a Demonstrator", 463 "authors": ["Murty, S.", "Bahdanau, D.", "Manning, C. D."], 464 "year": 2024, 465 "arxiv_id": "2410.02907", 466 "relevance": "Interaction-first trajectory collection with retroactive labeling for web agent training data." 467 }, 468 { 469 "title": "AdaPlanner: Adaptive Planning from Feedback with Language Models", 470 "authors": ["Sun, H.", "Zhuang, Y."], 471 "year": 2023, 472 "relevance": "Adaptive planning baseline with in-plan and out-of-plan refinement for LLM agents." 473 }, 474 { 475 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 476 "authors": ["Guo, D.", "Yang, D."], 477 "year": 2025, 478 "arxiv_id": "2501.12948", 479 "relevance": "Source of the distilled reasoning model used for CoT trace generation in the pipeline." 480 }, 481 { 482 "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents", 483 "authors": ["Zhou, S.", "Xu, F. F."], 484 "year": 2023, 485 "arxiv_id": "2307.13854", 486 "relevance": "Primary evaluation benchmark environment for web agent research." 
487 }, 488 { 489 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 490 "authors": ["Wei, J.", "Wang, X."], 491 "year": 2022, 492 "relevance": "Foundational work on CoT reasoning applied in the PLAN-AND-ACT framework." 493 } 494 ] 495 }