scan.json (30139B)
1 { 2 "paper": { 3 "title": "PlanRAG: A Plan-then-Retrieval Augmented Generation for Generative Large Language Models as Decision Makers", 4 "authors": [ 5 "Myeonghwa Lee", 6 "Seonho An", 7 "Min-Soo Kim" 8 ], 9 "year": 2024, 10 "venue": "North American Chapter of the Association for Computational Linguistics", 11 "arxiv_id": "2406.12430", 12 "doi": "10.48550/arXiv.2406.12430" 13 }, 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The abstract states 'We release our code and benchmark at https://github.com/myeon9h/PlanRAG' and provides a working URL." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The DQA benchmark (301 question-database pairs with both RDB and GDB versions) is released at the same GitHub repository. Game simulators for annotation are also released under the MIT license (Section 9)." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or environment specification is provided in the paper. They mention using LangChain, MySQL, Neo4j, and vLLM but do not specify versions or provide a reproducible environment file." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are included in the paper. While code is released, the paper itself does not contain a 'Reproducing Results' section or describe how to run experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results are reported as point estimate accuracies (e.g., '64.3%' in Table 4) with no confidence intervals, error bars, or uncertainty quantification." 
42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims PlanRAG 'outperforms' and 'significantly improves' over IterRAG based solely on comparing accuracy numbers (e.g., 64.3% vs 48.5%) without any statistical significance test." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Improvements are reported with baseline context: '15.8% in the Locating scenario and by 7.4% in the Building scenario' over IterRAG (Table 4). Per-category breakdowns (Tables 5, 6, Figure 5) provide baseline-relative context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The benchmark has 200 Locating and 101 Building pairs (Table 3) with no justification for why these sizes were chosen. No power analysis or discussion of whether these sizes are sufficient for the claims made." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Section 5.1 explicitly states 'All experiments are conducted in a zero-shot and single run setting.' No variance, standard deviation, or spread measure is reported across runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Table 4 compares four methods: SingleRAG-LM, IterRAG-LM, PlanRAG-LM, and PlanRAG-LM w/o RP. IterRAG-LM represents the state-of-the-art iterative RAG technique." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The iterative RAG baseline is based on ReAct (Yao et al., 2023) and references Trivedi et al. (2023) and Jiang et al. (2023b), all contemporary methods. All baselines use GPT-4." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "PlanRAG-LM w/o RP (without re-planning) in Table 4 ablates the re-planning component, showing 10.8% drop in Locating and 0.9% in Building. 
Appendix A.6.2 (Table 10) ablates prompt structure variations." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": false, 83 "justification": "Only accuracy (exact match with ground-truth decision) is used as an evaluation metric. No alternative metrics such as latency, cost per question, partial credit, or ranking quality are reported." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "Evaluation is entirely automated via semantic matching to ground-truth answers computed by game simulators. No human evaluation of answer quality, reasoning quality, or plan quality is included." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "The prompt structure was selected using 10% of questions sampled from each scenario (Table 10, Appendix A.6.2), but the main results in Table 4 are on the full benchmark including that 10%. No held-out test set is maintained." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by scenario (Locating vs Building, Table 4), database type (RDB vs GDB, Table 5), question difficulty (SR vs MR, Figure 5), error category (Figure 6), and re-planning frequency (Table 7)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 5.2 provides detailed failure case analysis with five error categories (CAN, MIS, DEEP, QUR, OTH) and Figure 6 shows the distribution. Specific examples are given in Figure 4(a)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "PlanRAG-LM has slightly more DEEP errors than IterRAG-LM (Figure 6). Re-planning accuracy degrades with more re-planning iterations (Table 7). PlanRAG with GPT-3.5 performs worse than IterRAG (Table 11). Llama 2 and Phi-2 achieve 0% on all conditions." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims '15.8% improvement in Locating and 7.4% in Building over state-of-the-art iterative RAG' which matches Table 4 (64.3-48.5=15.8 for Locating, 45.0-37.6=7.4 for Building)." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims PlanRAG 'significantly enhances the capability of decision making' (causal). The ablation (w/o RP) partially supports re-planning's effect, but the main comparison between PlanRAG and IterRAG confounds planning with prompt structure changes — Table 10 shows prompt structure alone affects accuracy substantially. The compute/token difference between methods is also uncontrolled." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title says 'Generative Large Language Models as Decision Makers' broadly, but results are on only two video game scenarios using only GPT-4 (in Table 11, Llama 2 and Phi-2 achieve 0% and GPT-3.5 performs poorly). The paper motivates with Pfizer business examples but tests only on game-derived data." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for PlanRAG's improvement: whether it's the planning structure, the additional prompt tokens, the extra retrieval iterations, or simply having a more verbose reasoning format. The prompt structure comparison in Table 10 hints at this but is not discussed as an alternative explanation." 
131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper frames video game decision accuracy as evidence that LLMs can serve as business decision makers (Introduction motivates with Pfizer examples), but does not acknowledge the gap between game-derived benchmarks and actual business decision making. The Limitations section discusses database type scope but not the game-to-business proxy gap." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The main experiments use 'GPT-4 (OpenAI, 2023)' without a version specifier or snapshot date. Appendix A.7 footnote 9 specifies 'gpt-3.5-turbo-0125' for the GPT-3.5 variant, but the primary GPT-4 model lacks version specification." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt texts are provided in Appendix A.6: Figures 8-11 show complete prompts for SingleRAG, IterRAG, and PlanRAG on the GDB Locating scenario, including format instructions and tool descriptions. Business rules are provided in Figure 7." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 5.1 states 'GPT-4 with a zero temperature.' Appendix A.7 states 'temperature to zero and 0.1 for GPT-3.5-turbo and other open models, respectively.'" 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The PlanRAG architecture is described in detail in Section 4 with Figure 3: planning, retrieving & answering, and re-planning steps. The LangChain interface, SQL/Cypher query execution, and ReAct-based prompt structure are all documented." 
158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix A.1 describes data collection from game savefiles via parser, storage according to DB schema, question generation via templates, and quality control (omitting low-TPcountry countries, filtering multiple-answer Building problems)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 8 'Limitations' contains three substantive paragraphs discussing database type scope, low-level method scope, and single-LM architecture limitation." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "The Limitations section discusses what wasn't covered (other database types, fine-tuned models, multi-LM frameworks) but does not address specific threats to the validity of the reported results, such as single-run reliability, game-to-business generalization, or GPT-4-specific effects." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 8 explicitly states: focused on graph/relational databases only, does not focus on low-level methods like fine-tuned Cypher generators, and PlanRAG was implemented using a single LM only. These are specific boundaries." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The benchmark databases, questions, and game simulators are released at the GitHub repository. The simulators allow deterministic reproduction of ground-truth annotations (Appendix A.3-A.4 provide full algorithms)." 
187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Appendix A.1 describes selecting savefiles, extracting data via game savefile parser, storing in DB schema, generating questions from templates, and quality control criteria for both scenarios." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data comes from video game savefiles and deterministic simulators. This is a standard benchmark construction, not a human subjects study." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from game savefiles → parser → DB schema → question templates → quality filtering is documented in Appendix A.1. Simulator algorithms are provided in Appendix A.4. Statistics of the resulting data are in Table 3." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "The Acknowledgements section lists NRF grant No. 2018R1A5A1060031 and RS-2023-00281635, and IITP grant No. 2019-0-01267, all funded by the Korean government (MSIT)." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All three authors are affiliated with School of Computing, KAIST. As an academic institution, there is no product conflict with the evaluated techniques." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "Funding comes from the Korean National Research Foundation and IITP, government agencies with no financial interest in PlanRAG's performance outcomes." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is included in the paper." 
224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses GPT-4 to answer questions about Europa Universalis IV and Victoria 3 game mechanics but does not state GPT-4's training data cutoff date." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "Europa Universalis IV and Victoria 3 have extensive online documentation (wikis, forums, strategy guides) that GPT-4 was likely trained on. The paper does not discuss whether this prior knowledge could inflate performance." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "While the specific DQA database instances are new, the game mechanics, business rules, and strategies are extensively documented online. GPT-4 likely has knowledge of EU4/V3 trade and building mechanics. This contamination vector is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. All evaluation is automated via game simulators." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The Ethical Considerations section (Section 9) discusses game license compliance and historical sensitivity, not human subjects." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No API costs, tokens consumed, or per-question latency are reported despite using the GPT-4 API for 602 evaluation instances (the 301 question-database pairs in both RDB and GDB versions) across multiple methods with iterative retrieval." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Appendix A.7 mentions 'eight Nvidia A100 (80GB) GPUs' for open model inference, but no total GPU hours, API spend, or wall-clock time for the main GPT-4 experiments is stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Section 5.1 states 'All experiments are conducted in a zero-shot and single run setting.' No seed sensitivity analysis is performed despite using temperature 0.1 for open models." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 5.1 explicitly states 'single run setting,' making it clear that each result comes from exactly one run." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Table 10 shows prompt structure was explored with four variants on 10% of questions, but no formal search budget, number of total configurations tried, or compute spent on this search is reported." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The prompt structure was selected based on Table 10 results on 10% of questions, but this selection subset is included in the final evaluation (Table 4 reports on the full benchmark). The selection is not on a separate validation set." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement both their own PlanRAG and the baseline IterRAG. They do not acknowledge the author-evaluation bias where authors' implementations of baselines may systematically underperform (Lucic et al., 2018)." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "PlanRAG involves additional planning and re-planning iterations compared to IterRAG, consuming more tokens and API calls, but this compute difference is not quantified or discussed as a confound." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The DQA benchmark is derived from Europa Universalis IV and Victoria 3 video games as a proxy for real business decision making. The paper claims these games 'well imitate real business situations' but provides no evidence for this construct validity claim." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "PlanRAG uses a fundamentally different prompt structure (Plan-Thought-Action-Observation-Replan) compared to IterRAG (Thought-Action-Observation). Table 10 shows prompt structure alone significantly affects accuracy, but this confound is not addressed when interpreting the main results." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "GPT-4 was likely trained on extensive EU4 and Victoria 3 game mechanics documentation, strategy guides, and wiki pages. The paper does not discuss whether this temporal overlap could help the model on DQA questions." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The business rules R are provided as input to the model. Whether the model also benefits from pre-existing knowledge of these game mechanics (feature leakage from training data) is not discussed." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Multiple questions share the same database D (Appendix A.2: 'Some instances share the same database D, but not all use the same one'). Non-independence between questions sharing a database is not discussed." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are applied. No canary strings, membership inference, or decontamination analysis is performed." 359 } 360 } 361 }, 362 "scan_version": 3, 363 "active_modules": [ 364 "experimental_rigor", 365 "data_leakage" 366 ], 367 "claims": [ 368 { 369 "claim": "PlanRAG outperforms the state-of-the-art iterative RAG by 15.8% on Locating and 7.4% on Building scenarios.", 370 "evidence": "Table 4 shows PlanRAG-LM achieves 64.3% on Locating (vs 48.5% for IterRAG-LM) and 45.0% on Building (vs 37.6% for IterRAG-LM). Single run, zero-shot, GPT-4.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Re-planning is important for decision making, improving accuracy by 10.8% on Locating and 0.9% on Building.", 375 "evidence": "Table 4 ablation: PlanRAG-LM w/o RP achieves 53.5% vs 64.3% on Locating and 44.1% vs 45.0% on Building. 
Table 7 shows re-planning statistics.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "PlanRAG reduces the rate of missed data analysis compared to IterRAG (1.3% vs 3.3% on Locating, 21.8% vs 33.2% on Building).", 380 "evidence": "Table 6 directly measures missed analysis rates for critical values (IV, TPtotal for Locating; CO, PD for Building).", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "PlanRAG significantly reduces CAN (improper candidate) and MIS (missed data) errors for both scenarios.", 385 "evidence": "Figure 6 shows error category distribution for IterRAG-LM and PlanRAG-LM, with visible reductions in CAN and MIS categories.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "PlanRAG is more effective for Single Retrieval (SR) questions than Multiple Retrieval (MR) questions relative to IterRAG.", 390 "evidence": "Figure 5 shows larger accuracy gaps for SR questions (e.g., 65.6% vs 43.9% in Locating SR) than MR questions (61.9% vs 49.4% in Locating MR).", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "LLMs smaller than GPT-4 (Llama 2, Phi-2) cannot solve Decision QA problems at all, and GPT-3.5 performs poorly with PlanRAG.", 395 "evidence": "Table 11: Llama 2 (70B, 13B) and Phi-2 achieve 0% across all settings. GPT-3.5 with PlanRAG achieves 0-4% on most settings, performing worse than IterRAG with GPT-3.5.", 396 "supported": "strong" 397 } 398 ], 399 "methodology_tags": [ 400 "benchmark-eval" 401 ], 402 "key_findings": "PlanRAG, an iterative plan-then-retrieval augmented generation technique, outperforms iterative RAG by 15.8% and 7.4% on two Decision QA scenarios built from grand strategy video games. The planning step is most beneficial for questions that IterRAG underestimates in difficulty, and re-planning provides substantial gains on the simpler Locating scenario but minimal improvement on the harder Building scenario. 
Only GPT-4 can meaningfully address Decision QA; smaller models (Llama 2, Phi-2) score 0%, and GPT-3.5 cannot follow PlanRAG's complex prompt structure.", 403 "red_flags": [ 404 { 405 "flag": "Single-run evaluation with no variance", 406 "detail": "All experiments are conducted as single runs with zero-temperature GPT-4. No variance, confidence intervals, or significance tests are reported, yet the paper uses language like 'significantly improves' and 'far more effective.' The 15.8% and 7.4% improvements have no statistical backing." 407 }, 408 { 409 "flag": "Video game benchmark presented as business decision making", 410 "detail": "The paper motivates with Pfizer distribution network examples and frames the work as LLMs for business decision making, but all evaluation is on Europa Universalis IV and Victoria 3 game mechanics. The claim that these games 'well imitate real business situations' lacks evidence." 411 }, 412 { 413 "flag": "Potential game knowledge contamination", 414 "detail": "EU4 and Victoria 3 have extensive online wikis, strategy guides, and forum discussions that GPT-4 was almost certainly trained on. The model may already know optimal trade steering and building strategies, inflating performance independently of the RAG technique." 415 }, 416 { 417 "flag": "Scaffold confound between methods", 418 "detail": "PlanRAG uses a different prompt structure (Plan-Thought-Action-Observation-Replan) than IterRAG (Thought-Action-Observation). Table 10 shows prompt structure alone causes substantial accuracy variation (27.5% to 57.5%), yet the main comparison attributes all improvement to the planning concept rather than the prompt engineering." 419 }, 420 { 421 "flag": "Unquantified compute difference", 422 "detail": "PlanRAG performs additional planning and re-planning iterations (Table 7 shows up to 4+ re-plannings), consuming more API calls and tokens than IterRAG. 
This compute difference is never quantified, making it unclear whether PlanRAG's improvement comes from planning or simply from using more compute." 423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 428 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R. Narasimhan", "Yuan Cao"], 429 "year": 2023, 430 "relevance": "Core baseline technique; PlanRAG extends ReAct's Thought-Action-Observation framework with planning and re-planning steps." 431 }, 432 { 433 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 434 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin"], 435 "year": 2020, 436 "relevance": "Foundational RAG technique that PlanRAG builds upon; defines the retrieve-then-generate paradigm for language models." 437 }, 438 { 439 "title": "Interleaving Retrieval with Chain-of-Thought Reasoning for Knowledge-Intensive Multi-Step Questions", 440 "authors": ["Harsh Trivedi", "Niranjan Balasubramanian", "Tushar Khot", "Ashish Sabharwal"], 441 "year": 2023, 442 "relevance": "Iterative RAG baseline method that PlanRAG compares against; interleaves retrieval with reasoning for complex questions." 443 }, 444 { 445 "title": "Active Retrieval Augmented Generation", 446 "authors": ["Zhengbao Jiang", "Frank F Xu", "Luyu Gao", "Zhiqing Sun", "Qian Liu"], 447 "year": 2023, 448 "arxiv_id": "2305.06983", 449 "relevance": "Iterative RAG technique that decides when to retrieve; represents the state-of-the-art RAG approach compared against PlanRAG." 450 }, 451 { 452 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 453 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Fei Xia"], 454 "year": 2022, 455 "relevance": "Foundational prompting technique for LLM reasoning that informs the Thought step in PlanRAG's prompt structure." 
456 }, 457 { 458 "title": "GPT-4 Technical Report", 459 "authors": ["OpenAI"], 460 "year": 2023, 461 "relevance": "Primary model used for all main experiments; only model capable of solving Decision QA problems in this study." 462 }, 463 { 464 "title": "Llama 2: Open Foundation and Fine-tuned Chat Models", 465 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 466 "year": 2023, 467 "arxiv_id": "2307.09288", 468 "relevance": "Open-source LLM evaluated in Appendix A.7 showing 0% accuracy on Decision QA, demonstrating the task's difficulty for smaller models." 469 }, 470 { 471 "title": "StructGPT: A General Framework for Large Language Model to Reason over Structured Data", 472 "authors": ["Jinhao Jiang", "Kun Zhou", "Zican Dong", "Keming Ye"], 473 "year": 2023, 474 "arxiv_id": "2305.09645", 475 "relevance": "Framework for LLM reasoning over structured databases, directly related to Decision QA's task of querying structured data for decision making." 476 }, 477 { 478 "title": "REPLUG: Retrieval-Augmented Black-Box Language Models", 479 "authors": ["Weijia Shi", "Sewon Min", "Michihiro Yasunaga", "Minjoon Seo"], 480 "year": 2023, 481 "arxiv_id": "2301.12652", 482 "relevance": "RAG technique for augmenting black-box LMs with retrieval, representing the broader family of retrieval-augmented approaches." 483 }, 484 { 485 "title": "Dense Passage Retrieval for Open-Domain Question Answering", 486 "authors": ["Vladimir Karpukhin", "Barlas Oguz", "Sewon Min", "Patrick Lewis"], 487 "year": 2020, 488 "relevance": "Foundational retrieval method for knowledge-based QA; paper contrasts Decision QA with knowledge-based QA approaches." 489 } 490 ], 491 "engagement_factors": { 492 "practical_relevance": { 493 "score": 1, 494 "justification": "The PlanRAG technique is conceptually applicable to business decision support, but the video game evaluation setting limits immediate practical adoption." 
495 }, 496 "surprise_contrarian": { 497 "score": 1, 498 "justification": "Adding planning to RAG is an intuitive extension; the finding that only GPT-4 can handle Decision QA while smaller models score 0% is somewhat surprising." 499 }, 500 "fear_safety": { 501 "score": 0, 502 "justification": "No AI safety or security concerns are raised by this work." 503 }, 504 "drama_conflict": { 505 "score": 0, 506 "justification": "No controversy or conflict; a straightforward technical contribution." 507 }, 508 "demo_ability": { 509 "score": 2, 510 "justification": "Code and benchmark released on GitHub with game simulators; someone could reproduce the experiments with API access." 511 }, 512 "brand_recognition": { 513 "score": 1, 514 "justification": "KAIST is a respected institution but not a top AI brand; the paper uses GPT-4 which adds some recognition." 515 } 516 } 517 }