scan.json (31218B)
1 { 2 "paper": { 3 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 4 "authors": [ 5 "Shunyu Yao", 6 "Jeffrey Zhao", 7 "Dian Yu", 8 "Nan Du", 9 "Izhak Shafran", 10 "Karthik Narasimhan", 11 "Yuan Cao" 12 ], 13 "year": 2022, 14 "venue": "ICLR 2023", 15 "arxiv_id": "2210.03629", 16 "doi": "10.48550/arXiv.2210.03629" 17 }, 18 "scan_version": 2, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "Project page with code linked in footnote: 'https://react-lm.github.io/'. Reproducibility Statement also provides 'associated GPT-3 ReAct prompting code at https://anonymous.4open.science/r/ReAct-2268/'." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "All four benchmarks used (HotpotQA, FEVER, ALFWorld, WebShop) are publicly available standard datasets. The paper does not modify them." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. The paper mentions PaLM-540B and GPT-3 (text-davinci-002) but does not specify library versions or dependencies." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are included in the paper. Full prompts are provided in Appendix C and code is linked, but there are no specific commands or README-style instructions to replicate experiments." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Tables 1, 3, and 4 are reported as point estimates without confidence intervals or error bars." 
48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims 'ReAct outperforms Act consistently' and makes other comparative claims based solely on comparing point estimates without any statistical significance tests." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "The paper reports absolute improvements with baseline context: 'absolute success rate of 34% and 10% respectively' (Abstract). Tables 1, 3, 4 provide both the proposed method and baseline numbers, enabling effect size calculation (e.g., ReAct 71% vs BUTLER 37% on ALFWorld)." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "No justification for sample sizes. Uses full evaluation splits for some benchmarks (134 ALFWorld games, 500 WebShop instructions) and a 500-sample subset for GPT-3 HotpotQA, but no power analysis or justification is given." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "For ALFWorld, 'avg' and 'best of 6' results across prompt permutations are reported in Table 3, but no standard deviation or spread measure is provided. HotpotQA and FEVER results are single-run point estimates with greedy decoding." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Multiple baselines are included: Standard prompting, CoT, CoT-SC, Act-only, BUTLER (imitation learning), IL, and IL+RL. Tables 1, 3, and 4 present systematic comparisons." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Baselines include CoT (Wei et al., 2022) and CoT-SC (Wang et al., 2022a) which were state-of-the-art prompting methods at the time, plus BUTLER (2020) and IL/IL+RL methods. Supervised SOTA numbers are also cited." 
80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Systematic ablations are performed: ReAct vs Act (removes thoughts), ReAct vs CoT (removes actions/observations), ReAct vs ReAct-IM (different thought styles). Section 3.2 describes how baselines are constructed by 'systematically ablating ReAct trajectories.' Table 3 includes ReAct-IM ablation." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Multiple metrics used across tasks: exact match (HotpotQA), accuracy (FEVER), task-specific success rates broken down by 6 task types (ALFWorld), score and success rate (WebShop, Table 4)." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": true, 94 "justification": "Table 2 presents human analysis: 'we randomly sampled 50 trajectories with correct and incorrect answers from ReAct and CoT respectively (thus 200 examples in total), and manually labeled their success and failure modes.'" 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Exemplars are selected from the training set ('randomly select 6 and 3 cases from the training set'). ALFWorld uses '134 unseen evaluation games.' WebShop uses '500 test instructions.' Evaluation is on held-out data separate from prompt exemplars." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Table 3 provides per-task-type breakdown for ALFWorld across 6 categories (Pick, Clean, Heat, Cool, Look, Pick 2). Table 2 provides per-category breakdown of success and failure modes." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Table 2 presents a detailed failure analysis with 4 failure modes; as shares of ReAct's failures these are reasoning error (47%), search result error (23%), hallucination (0% for ReAct, 56% for CoT), and label ambiguity (29%). Appendix E.1 provides concrete examples of each mode." 
110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "ReAct underperforms CoT on HotpotQA (27.4 vs 29.4, Table 1). PaLM-8B/62B prompting with ReAct 'performs worst among four methods due to the difficulty to learn both reasoning and acting from in-context examples' (Section 3.3, Figure 3). ReAct's structural constraints reduce flexibility (Table 2 analysis)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims match results: ReAct 'overcomes prevalent issues of hallucination' (Table 2: 0% vs 56% hallucination), outperforms on FEVER (Table 1: 60.9 vs 56.3), outperforms IL/RL on ALFWorld and WebShop by '34% and 10%' (Tables 3, 4). Abstract appropriately hedges that ReAct is 'competitive with' CoT on QA." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper's causal claims ('reasoning traces help the model...', 'actions allow it to interface with...') are supported by controlled ablations: ReAct vs Act removes thoughts while keeping everything else constant, ReAct vs CoT removes actions. This single-variable manipulation design is adequate for the causal claims made." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The paper claims 'a general paradigm to combine reasoning and acting with language models for solving diverse language reasoning and decision making tasks' based on 4 benchmarks tested primarily on PaLM-540B. The title 'Synergizing Reasoning and Acting in Language Models' implies generality beyond the tested settings. GPT-3 experiments (Appendix A.1) partially address model generality but only on 2 of 4 tasks." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "Limited discussion of alternative explanations. 
Footnote 4 suggests repetitive thought loops 'could be due to the sub-optimal greedy decoding procedure.' Section 3.3 discusses trade-offs between factuality and flexibility. But no systematic consideration of confounds or alternative explanations for the observed improvements." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper's claims match the granularity of measurements. It measures exact match on HotpotQA, accuracy on FEVER, success rate on ALFWorld, and score/SR on WebShop. Claims about 'interpretability and trustworthiness' are supported by the concrete human analysis in Table 2 showing lower hallucination rates, not vague proxy assertions." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "PaLM-540B is specified by name and parameter count (Chowdhery et al., 2022). GPT-3 is specified as 'text-davinci-002' (Appendix A.1). PaLM-8B and PaLM-62B are specified for finetuning experiments." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Full prompts are provided in Appendix C (Sections C.1-C.4) for all four tasks and all prompt formats (Standard, CoT, Act, ReAct). These are actual prompt texts, not descriptions. WebShop prompt in Table 6, ALFWorld prompts in Tables 7-9." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Greedy decoding stated for main experiments. Temperature 0.7 stated for CoT-SC sampling with 21 trajectories. Finetuning details in Appendix B.1: batch size 64, training steps (4000 for ReAct/Act, 2000/1000 for Standard/CoT on 8B/62B)." 
159 }, 160 "scaffolding_described": { 161 "applies": true, 162 "answer": true, 163 "justification": "The ReAct scaffolding is described in detail: interleaved thought-action-observation format, Wikipedia API with three action types (search[entity], lookup[string], finish[answer]), ALFWorld text action space, WebShop action space (search, click, buy). Section 2 formally defines the augmented action space." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "The paper describes the 'question-only setup' for HotpotQA and FEVER (no support paragraphs). Exemplar selection: 'randomly select 6 and 3 cases from the training set.' Finetuning data: '3,000 trajectories with correct answers generated by ReAct.' ALFWorld evaluation setup follows Shridhar et al. (2020b)." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": false, 175 "justification": "No dedicated limitations section. The conclusion contains one sentence: 'complex tasks with large action spaces require more demonstrations to learn well, which unfortunately can easily go beyond the input length limit of in-context learning.' This is insufficient per the criterion requiring 'substantive discussion.'" 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": false, 180 "justification": "No threats-to-validity discussion. The paper does not discuss specific threats like prompt sensitivity, benchmark representativeness, or generalizability limits beyond the brief conclusion mention." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "No explicit statements about what the results do NOT show. The paper does not bound its claims to specific model families, task types, or settings. 
No equivalent of 'what the evidence does not show.'" 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw data (model output trajectories, intermediate results) is released. Code is linked but generated trajectories used for finetuning or the 200 manually analyzed examples are not made available." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Data collection is described: benchmarks are publicly available, exemplars 'randomly selected from the training set,' finetuning data is '3,000 trajectories with correct answers generated by ReAct,' human analysis sampled '50 trajectories with correct and incorrect answers from ReAct and CoT respectively.'" 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants were recruited. The paper uses standard public benchmarks and author-conducted manual analysis." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline is straightforward and documented: select exemplars from training set → compose prompts → run inference with greedy decoding → extract answers → evaluate against gold labels. For finetuning: generate trajectories → filter for correct answers (3,000) → finetune." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Acknowledgments section: 'This work was supported in part by the National Science Foundation under Grant No. 2107048.'" 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations clearly listed: Princeton University (Yao, Narasimhan) and Google Research, Brain team (Zhao, Yu, Du, Shafran, Cao). The first author's Google internship is also noted." 
220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "While NSF funding is independent, 5 of 7 authors are from Google Research, and the primary model evaluated (PaLM-540B) is Google's proprietary model. Google has a direct interest in demonstrating the capabilities of its LLMs. This conflict is not explicitly acknowledged." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is included in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "PaLM-540B's training data cutoff is not stated in the paper. The Chowdhery et al. (2022) reference describes PaLM but the cutoff date is not reproduced here." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether PaLM-540B's training data may contain examples from HotpotQA (2018), FEVER (2018), or ALFWorld (2020), all of which were publicly available before PaLM training. WebShop (2022) was released after PaLM (arXiv 2207.01206 vs. 2204.02311), so pretraining overlap is unlikely there, but the paper does not discuss this either." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "HotpotQA and FEVER were published in 2018, well before PaLM's training. The benchmarks and their solutions could be in the training data. No contamination analysis is performed." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants. The paper evaluates LLM prompting methods on automated benchmarks." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants. An Ethics Statement is included but addresses potential harms of LLM-environment interaction, not human subjects." 
259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in the study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in the study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants. This is a benchmark evaluation study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants. This is a benchmark evaluation study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in the study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "No inference cost, latency, or tokens consumed per query is reported. The paper uses PaLM-540B extensively but never quantifies the computational cost of ReAct vs baselines." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "No total computational budget stated. Finetuning uses batch size 64 for 1000-4000 steps but total GPU hours, hardware, or API costs are not reported." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No random seed sensitivity analysis. Main experiments use greedy decoding (deterministic). ALFWorld runs 6 prompt permutations but these test prompt sensitivity, not seed sensitivity." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": true, 307 "justification": "Greedy decoding is stated for main experiments (implying single deterministic run). ALFWorld explicitly uses '6 prompts for each task type through each permutation of 2 annotated trajectories from the 3.' 
CoT-SC uses '21 CoT trajectories' with temperature 0.7." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "No formal hyperparameter search budget reported. The paper notes 'We find more examples do not improve performance' (footnote 2) suggesting some search, but the budget and configurations tried are not documented." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "For ALFWorld, both 'avg' and 'best of 6' results are reported transparently in Table 3. For ReAct+CoT-SC combinations, the selection heuristics are explicitly described (Section 3.2). Step limits (7 for HotpotQA, 5 for FEVER) are justified in footnote 3." 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite numerous comparisons across methods and tasks." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors implement all baselines themselves (Standard, CoT, Act are constructed by 'systematically ablating ReAct trajectories'). No acknowledgment of self-comparison bias per Lucic et al. (2018)." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "ReAct generates more tokens per query than Act or CoT (thoughts + actions + observations), but computational cost differences are never discussed. CoT-SC requires 21 samples vs ReAct's single pass, but compute is not compared." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "No discussion of whether HotpotQA, FEVER, ALFWorld, or WebShop actually measure the claimed capabilities of 'reasoning and acting synergy.' The paper assumes benchmark validity without questioning it." 
338 }, 339 "scaffold_confound_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "The scaffold is the independent variable being studied. All methods (Standard, CoT, Act, ReAct) are evaluated on the same model (PaLM-540B) with the same benchmarks, isolating the effect of the prompting approach. The comparison is deliberately about scaffolding strategies." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "HotpotQA (2018) and FEVER (2018) benchmark problems existed years before PaLM's training. The paper does not discuss whether solutions could be in the training data." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether evaluation setups leak answer information. For example, CoT baselines use internal knowledge that may come from memorized benchmark answers." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of whether training exemplars and evaluation examples share structural similarities or come from the same data distribution." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No concrete leakage detection or prevention method used. No canary strings, membership inference, or decontamination analysis." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "ReAct outperforms Act-only prompting on both HotpotQA (27.4 vs 25.7 EM) and FEVER (60.9 vs 58.9 accuracy), demonstrating the value of reasoning to guide acting.", 371 "evidence": "Table 1, Section 3.3. Consistent across both knowledge-intensive tasks with PaLM-540B.", 372 "supported": "moderate" 373 }, 374 { 375 "claim": "ReAct reduces hallucination compared to CoT: 6% vs 14% false positive rate in successes, and 0% vs 56% hallucination as failure mode.", 376 "evidence": "Table 2, Section 3.3. 
Based on manual analysis of 200 randomly sampled trajectories (50 correct + 50 incorrect from each method).", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "ReAct outperforms imitation and reinforcement learning methods on ALFWorld by 34% absolute success rate (71% vs 37% best BUTLER).", 381 "evidence": "Table 3. Best-of-6 ReAct (71%) vs best-of-8 BUTLER (37%). Average ReAct (57%) also exceeds best BUTLER.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "ReAct outperforms IL and IL+RL methods on WebShop by 10% absolute success rate (40.0% vs 29.1%/28.7%).", 386 "evidence": "Table 4. One-shot ReAct prompting outperforms methods trained on 1,012-10,587 examples.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Combining ReAct with CoT-SC achieves the best prompting results, reaching CoT-SC performance with 21 samples using only 3-5 samples.", 391 "evidence": "Table 1 and Figure 2. ReAct→CoT-SC achieves 35.1 EM on HotpotQA; CoT-SC→ReAct achieves 64.6 on FEVER.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "Finetuned ReAct is the best method: PaLM-8B finetuned ReAct outperforms all PaLM-62B prompting methods.", 396 "evidence": "Figure 3, Section 3.3. Finetuning with 3,000 ReAct trajectories shows strong scaling behavior.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "Internal reasoning in ReAct substantially outperforms Inner Monologue-style dense external feedback (71% vs 53% overall success rate on ALFWorld).", 401 "evidence": "Table 3, Section 4. ReAct-IM ablation with consistent advantages on 5 of 6 task types.", 402 "supported": "strong" 403 } 404 ], 405 "methodology_tags": ["benchmark-eval"], 406 "key_findings": "ReAct introduces interleaved reasoning traces and actions for LLM task solving, demonstrating that reasoning helps guide acting (reducing hallucination from 56% to 0% vs CoT on HotpotQA) while actions ground reasoning in external knowledge (outperforming CoT on FEVER 60.9 vs 56.3). 
On interactive decision making, few-shot ReAct prompting outperforms trained IL/RL methods by 34% on ALFWorld and 10% on WebShop. The combination of ReAct with CoT self-consistency achieves the best prompting results on knowledge-intensive tasks, and finetuned ReAct at 8B scale outperforms all 62B prompting methods.", 407 "red_flags": [ 408 { 409 "flag": "No statistical tests or uncertainty quantification", 410 "detail": "All comparisons across 4 benchmarks rely on point estimates without statistical significance tests, confidence intervals, or error bars. Claims of 'outperforming' are based on raw number comparisons (e.g., 27.4 vs 25.7 on HotpotQA — a 1.7 point difference without any indication of whether this is statistically meaningful)." 411 }, 412 { 413 "flag": "Company evaluating its own model", 414 "detail": "Five of seven authors are from Google Research, Brain team, and the primary model evaluated is Google's PaLM-540B. The paper demonstrates PaLM's capabilities without acknowledging this conflict. Results on GPT-3 in Appendix A.1 partially mitigate this." 415 }, 416 { 417 "flag": "No contamination analysis", 418 "detail": "PaLM-540B was trained on massive web data, and HotpotQA (2018) and FEVER (2018) have been publicly available for years. The model may have memorized benchmark answers, which would inflate results for Standard and CoT baselines and potentially affect ReAct's knowledge retrieval patterns." 419 }, 420 { 421 "flag": "Selective reporting of best-of-K results", 422 "detail": "ALFWorld results prominently feature 'best of 6' trials (71% for ReAct vs 37% for BUTLER 'best of 8'). While averages are also reported for ReAct (57%), the headline claim of 34% improvement uses best-of-K comparisons, which overestimates expected performance." 
423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "Chain of thought prompting elicits reasoning in large language models", 428 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Ed Chi", "Quoc Le", "Denny Zhou"], 429 "year": 2022, 430 "arxiv_id": "2201.11903", 431 "relevance": "Foundational prompting method for LLM reasoning; primary baseline and comparator for ReAct's reasoning component." 432 }, 433 { 434 "title": "Self-consistency improves chain of thought reasoning in language models", 435 "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans", "Quoc Le", "Ed Chi", "Sharan Narang", "Aakanksha Chowdhery", "Denny Zhou"], 436 "year": 2022, 437 "arxiv_id": "2203.11171", 438 "relevance": "Sampling-based improvement to CoT; combined with ReAct in the best-performing prompting configurations." 439 }, 440 { 441 "title": "WebGPT: Browser-assisted question-answering with human feedback", 442 "authors": ["Reiichiro Nakano", "Jacob Hilton", "Suchir Balaji", "Jeff Wu"], 443 "year": 2021, 444 "arxiv_id": "2112.09332", 445 "relevance": "Prior work on LLM-web interaction for QA without explicit reasoning; uses human feedback and RL rather than prompting." 446 }, 447 { 448 "title": "Do as I can, not as I say: Grounding language in robotic affordances", 449 "authors": ["Michael Ahn", "Anthony Brohan", "Noah Brown"], 450 "year": 2022, 451 "arxiv_id": "2204.01691", 452 "relevance": "SayCan: LLM-based robotic planning grounded by affordance models; key prior work on LLMs for decision making." 453 }, 454 { 455 "title": "Inner monologue: Embodied reasoning through planning with language models", 456 "authors": ["Wenlong Huang", "Fei Xia", "Ted Xiao"], 457 "year": 2022, 458 "arxiv_id": "2207.05608", 459 "relevance": "Closest prior work to ReAct for closed-loop LLM reasoning in interactive environments; ReAct explicitly compares against IM-style prompting." 
460 }, 461 { 462 "title": "PaLM: Scaling language modeling with pathways", 463 "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"], 464 "year": 2022, 465 "arxiv_id": "2204.02311", 466 "relevance": "Primary model used in ReAct experiments (PaLM-540B); demonstrates scale effects on reasoning and acting capabilities." 467 }, 468 { 469 "title": "STaR: Bootstrapping reasoning with reasoning", 470 "authors": ["Eric Zelikman", "Yuhuai Wu", "Jesse Mu", "Noah D. Goodman"], 471 "year": 2022, 472 "arxiv_id": "2203.14465", 473 "relevance": "Self-bootstrapping approach for reasoning; ReAct's finetuning approach draws on this methodology." 474 }, 475 { 476 "title": "A generalist agent", 477 "authors": ["Scott Reed", "Konrad Zolna", "Emilio Parisotto"], 478 "year": 2022, 479 "arxiv_id": "2205.06175", 480 "relevance": "Gato: multi-modal generalist agent; represents the trend toward versatile LLM-based agents that ReAct contributes to." 481 }, 482 { 483 "title": "Large language models are zero-shot reasoners", 484 "authors": ["Takeshi Kojima", "Shixiang Shane Gu", "Machel Reid", "Yutaka Matsuo", "Yusuke Iwasawa"], 485 "year": 2022, 486 "arxiv_id": "2205.11916", 487 "relevance": "Zero-shot CoT; demonstrates emergent reasoning in LLMs without exemplars, complementary to ReAct's few-shot approach." 488 }, 489 { 490 "title": "Language models are few-shot learners", 491 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 492 "year": 2020, 493 "relevance": "GPT-3 paper; secondary model used in ReAct experiments (text-davinci-002) and foundation for in-context learning paradigm." 
494 }, 495 { 496 "title": "ALFWorld: Aligning text and embodied environments for interactive learning", 497 "authors": ["Mohit Shridhar", "Xingdi Yuan", "Marc-Alexandre Côté", "Yonatan Bisk", "Adam Trischler", "Matthew Hausknecht"], 498 "year": 2020, 499 "arxiv_id": "2010.03768", 500 "relevance": "Text-based embodied benchmark used in ReAct evaluation; demonstrates LLM decision-making in simulated household environments." 501 }, 502 { 503 "title": "WebShop: Towards scalable real-world web interaction with grounded language agents", 504 "authors": ["Shunyu Yao", "Howard Chen", "John Yang", "Karthik Narasimhan"], 505 "year": 2022, 506 "arxiv_id": "2207.01206", 507 "relevance": "Real-world web interaction benchmark used in ReAct evaluation; tests practical applicability of LLM agents in noisy environments." 508 }, 509 { 510 "title": "Improving alignment of dialogue agents via targeted human judgements", 511 "authors": ["Amelia Glaese", "Nat McAleese", "Maja Trebacz"], 512 "year": 2022, 513 "relevance": "Sparrow: dialogue agent with API-calling capability trained via human feedback; contrasts with ReAct's cheaper prompting-based approach." 514 }, 515 { 516 "title": "Language models as zero-shot planners: Extracting actionable knowledge for embodied agents", 517 "authors": ["Wenlong Huang", "Pieter Abbeel", "Deepak Pathak", "Igor Mordatch"], 518 "year": 2022, 519 "arxiv_id": "2201.07207", 520 "relevance": "LLMs for action planning in embodied environments; prior work on using language priors for decision making without explicit reasoning." 521 } 522 ] 523 }