scan-v5.json (25344B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Decomposed Prompting: A Modular Approach for Solving Complex Tasks", 6 "authors": [ 7 "Tushar Khot", 8 "Harsh Trivedi", 9 "Matthew Finlayson", 10 "Yao Fu", 11 "Kyle Richardson", 12 "Peter Clark", 13 "Ashish Sabharwal" 14 ], 15 "year": 2022, 16 "venue": "International Conference on Learning Representations", 17 "arxiv_id": "2210.02406", 18 "doi": "10.48550/arXiv.2210.02406" 19 }, 20 "checklist": { 21 "claims_and_evidence": { 22 "abstract_claims_supported": { 23 "applies": true, 24 "answer": true, 25 "justification": "All abstract claims about DECOMP outperforming prior few-shot prompting on symbolic and textual tasks are backed by Figures 7-16 across 8 datasets; modular structure, recursive decomposition, and symbolic integration are all demonstrated empirically.", 26 "source": "haiku" 27 }, 28 "causal_claims_justified": { 29 "applies": true, 30 "answer": true, 31 "justification": "The 'CoT w/ rollout' ablation uses the identical reasoning procedure as DECOMP but in a monolithic prompt, isolating modularization as the causal factor; alternative decomposition schemes in Appendix E further support robustness.", 32 "source": "haiku" 33 }, 34 "generalization_bounded": { 35 "applies": true, 36 "answer": false, 37 "justification": "The title and conclusion claim DECOMP as a general approach for 'complex tasks' but evaluations cover only 8 NLP benchmarks; no explicit discussion of where DECOMP would not generalize.", 38 "source": "haiku" 39 }, 40 "alternative_explanations_discussed": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper does not discuss whether improvements stem from higher-quality prompt engineering for DECOMP, greater computation per query, or other confounds beyond modular structure.", 44 "source": "haiku" 45 }, 46 "proxy_outcome_distinction": { 47 "applies": true, 48 "answer": true, 49 "justification": "Exact Match and Answer F1 are used as direct measures of task correctness and match the granularity of the claims; no conflation of measurement with broader capabilities.", 50 "source": "haiku" 51 } 52 }, 53 "limitations_and_scope": { 54 "limitations_section_present": { 55 "applies": true, 56 "answer": false, 57 "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion paragraph is brief and does not systematically discuss shortcomings.", 58 "source": "haiku" 59 }, 60 "threats_to_validity_specific": { 61 "applies": true, 62 "answer": false, 63 "justification": "No specific threats are discussed, such as sensitivity to prompt wording choices, benchmark contamination in GPT-3 training, or limited dataset diversity.", 64 "source": "haiku" 65 }, 66 "scope_boundaries_stated": { 67 "applies": true, 68 "answer": false, 69 "justification": "The paper does not explicitly state what results do not show or which task types DECOMP would be unsuitable for.", 70 "source": "haiku" 71 } 72 }, 73 "conflicts_of_interest": { 74 "funding_disclosed": { 75 "applies": true, 76 "answer": true, 77 "justification": "Acknowledgements state: 'This work was supported in part by the National Science Foundation under grants IIS2007290.'", 78 "source": "haiku" 79 }, 80 "affiliations_disclosed": { 81 "applies": true, 82 "answer": true, 83 "justification": "Author affiliations are clearly listed on the title page: Allen Institute for AI, Stony Brook University, and University of Edinburgh.", 84 "source": "haiku" 85 }, 86 "funder_independent_of_outcome": { 87 "applies": true, 88 "answer": true, 89 "justification": "NSF is an independent government funding agency with no financial stake in whether DECOMP outperforms CoT prompting.", 90 "source": "haiku" 91 }, 92 "financial_interests_declared": { 93 "applies": true, 94 "answer": false, 95 "justification": "No competing interests statement, patent disclosures, or equity declarations appear anywhere in the paper.", 96 "source": "haiku" 97 } 98 }, 99 "scope_and_framing": { 100 "key_terms_defined": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 3 formally defines 'decomposer,' 'sub-task handler,' 'prompting program,' and the inference procedure with mathematical notation (P = (f1,Q1,A1),...) and illustrative figures.", 104 "source": "haiku" 105 }, 106 "intended_contribution_clear": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper explicitly states it contributes DECOMP, a new modular prompting approach supporting hierarchical decomposition, recursion, and symbolic module integration.", 110 "source": "haiku" 111 }, 112 "engagement_with_prior_work": { 113 "applies": true, 114 "answer": true, 115 "justification": "Section 2 engages substantively with CoT, Least-to-Most, Successive Prompting, and Neural Modular Networks, explaining specifically how DECOMP differs from and extends each approach.", 116 "source": "haiku" 117 } 118 } 119 }, 120 "type_checklist": { 121 "empirical": { 122 "artifacts": { 123 "code_released": { 124 "applies": true, 125 "answer": true, 126 "justification": "Footnote 1 states: 'Datasets, Code and Prompts available at https://github.com/allenai/DecomP.'", 127 "source": "haiku" 128 }, 129 "data_released": { 130 "applies": true, 131 "answer": true, 132 "justification": "All benchmarks used (HotpotQA, 2WikiMultihopQA, MuSiQue, CommaQA, GSM8K, MultiArith) are standard publicly available datasets.", 133 "source": "haiku" 134 }, 135 "environment_specified": { 136 "applies": true, 137 "answer": false, 138 "justification": "No requirements.txt, Dockerfile, or software dependency specifications are mentioned; only model names are identified.", 139 "source": "haiku" 140 }, 141 "reproduction_instructions": { 142 "applies": true, 143 "answer": true, 144 "justification": "Appendix G reproduces all prompts verbatim across 50+ pages, Section 3.2 describes the inference procedure step-by-step with Figure 3, and code is released on GitHub.", 145 "source": "haiku" 146 } 147 }, 148 "statistical_methodology": { 149 "confidence_intervals_or_error_bars": { 150 "applies": true, 151 "answer": false, 152 "justification": "Results are point estimates averaged over 3 prompts; no standard deviations, confidence intervals, or error bars are reported anywhere in the paper.", 153 "source": "haiku" 154 }, 155 "significance_tests": { 156 "applies": true, 157 "answer": false, 158 "justification": "No statistical significance tests are applied despite multiple comparative claims between DECOMP and baselines.", 159 "source": "haiku" 160 }, 161 "effect_sizes_reported": { 162 "applies": true, 163 "answer": true, 164 "justification": "Absolute score differences are reported throughout (e.g., 14-17 pt math QA improvement, EM going from 22.7% to 98% for letter concatenation at N=3).", 165 "source": "haiku" 166 }, 167 "sample_size_justified": { 168 "applies": true, 169 "answer": false, 170 "justification": "Sample sizes (100, 200, 300 examples) are chosen for API cost reasons without power analysis or formal justification.", 171 "source": "haiku" 172 }, 173 "variance_reported": { 174 "applies": true, 175 "answer": false, 176 "justification": "Results are averaged over 3 prompts but standard deviation is not reported; Appendix D shows per-prompt results without variance statistics.", 177 "source": "haiku" 178 } 179 }, 180 "evaluation_design": { 181 "baselines_included": { 182 "applies": true, 183 "answer": true, 184 "justification": "Multiple baselines: standard prompting, CoT, CoT w/ rollout, Least-to-Most w/ rollout; for open-domain QA also no-context and no-decomposition retrieval baselines.", 185 "source": "haiku" 186 }, 187 "baselines_contemporary": { 188 "applies": true, 189 "answer": true, 190 "justification": "CoT (Wei et al., 2022) and Least-to-Most (Zhou et al., 2023) were the leading few-shot prompting approaches at the time of submission.", 191 "source": "haiku" 192 }, 193 "ablation_study": { 194 "applies": true, 195 "answer": true, 196 "justification": "'CoT w/ rollout' ablation uses DECOMP's identical reasoning steps in a single prompt to isolate the effect of modularity; Appendix E tests alternative decomposition schemes.", 197 "source": "haiku" 198 }, 199 "multiple_metrics": { 200 "applies": true, 201 "answer": true, 202 "justification": "Exact Match is used for symbolic and CommaQA tasks; Answer F1 is used for open-domain QA datasets; task-appropriate metrics throughout.", 203 "source": "haiku" 204 }, 205 "human_evaluation": { 206 "applies": false, 207 "answer": false, 208 "justification": "Human evaluation is not applicable; all tasks use standard automated metrics on NLP benchmarks with ground-truth answers.", 209 "source": "haiku" 210 }, 211 "held_out_test_set": { 212 "applies": true, 213 "answer": true, 214 "justification": "For open-domain QA, results are on '300 held-out dev questions in each dataset' separate from the 100-question hyperparameter tuning set; symbolic tasks use separate test sets.", 215 "source": "haiku" 216 }, 217 "per_category_breakdown": { 218 "applies": true, 219 "answer": true, 220 "justification": "Results broken down by dataset (8 datasets), input length (N=3,4,5 for letter concatenation), and decomposition granularity (coarse vs. fine for CommaQA); per-prompt breakdowns in Appendix D.", 221 "source": "haiku" 222 }, 223 "failure_cases_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Appendix F provides explicit error analysis with concrete examples of failure modes for both DECOMP and CoT on letter concatenation and CommaQA tasks.", 227 "source": "haiku" 228 }, 229 "negative_results_reported": { 230 "applies": true, 231 "answer": true, 232 "justification": "The paper reports DECOMP is only 'comparable' to the retrieval baseline on HotpotQA with Codex, and that performance drops to near-zero for smaller models (curie-001).", 233 "source": "haiku" 234 } 235 }, 236 "setup_transparency": { 237 "model_versions_specified": { 238 "applies": true, 239 "answer": true, 240 "justification": "Specific model identifiers are given: text-davinci-002, code-davinci-002, davinci-001, text-curie-001, Flan-T5-Large/XL/XXL with parameter counts (0.7B, 3B, 11B).", 241 "source": "haiku" 242 }, 243 "prompts_provided": { 244 "applies": true, 245 "answer": true, 246 "justification": "Appendix G provides all prompts verbatim — decomposer prompts and every sub-task handler prompt for every task — covering 50+ pages of the paper.", 247 "source": "haiku" 248 }, 249 "hyperparameters_reported": { 250 "applies": true, 251 "answer": false, 252 "justification": "Temperature, top-p, and other generation hyperparameters are not reported; only the retrieval count K is described as a tuned hyperparameter.", 253 "source": "haiku" 254 }, 255 "scaffolding_described": { 256 "applies": true, 257 "answer": true, 258 "justification": "Section 3.2 and Figure 3 describe the inference procedure in detail: how the controller iteratively passes inputs/outputs between the decomposer and sub-task handlers until EOQ.", 259 "source": "haiku" 260 }, 261 "data_preprocessing_documented": { 262 "applies": true, 263 "answer": true, 264 "justification": "Appendix A describes retrieval corpus creation (430,225 paragraphs for 2WikiMultihopQA, 139,416 for MuSiQue) and CommaQA truncation to fit GPT-3 context limits.", 265 "source": "haiku" 266 } 267 }, 268 "data_integrity": { 269 "raw_data_available": { 270 "applies": true, 271 "answer": true, 272 "justification": "All benchmarks (HotpotQA, 2WikiMultihopQA, MuSiQue, CommaQA, GSM8K, MultiArith) are publicly available; code for generating test examples is released.", 273 "source": "haiku" 274 }, 275 "data_collection_described": { 276 "applies": true, 277 "answer": true, 278 "justification": "Symbolic task test examples described (names from popularity lists, 100 examples per condition); open-domain QA corpus construction from train/dev/test paragraphs described in Appendix A.", 279 "source": "haiku" 280 }, 281 "recruitment_methods_described": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants were recruited; all evaluation uses standard NLP benchmarks.", 285 "source": "haiku" 286 }, 287 "data_pipeline_documented": { 288 "applies": true, 289 "answer": true, 290 "justification": "The pipeline from corpus creation through hyperparameter tuning on a 100-question held-out set to final evaluation on 300 questions is described in Appendix A.", 291 "source": "haiku" 292 } 293 }, 294 "contamination": { 295 "training_cutoff_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "Training data cutoffs for GPT-3 (text-davinci-002, code-davinci-002) are not stated in the paper.", 299 "source": "haiku" 300 }, 301 "train_test_overlap_discussed": { 302 "applies": true, 303 "answer": false, 304 "justification": "Potential overlap between GPT-3 training data and benchmark test sets is not discussed anywhere in the paper.", 305 "source": "haiku" 306 }, 307 "benchmark_contamination_addressed": { 308 "applies": true, 309 "answer": false, 310 "justification": "Several benchmarks (HotpotQA 2018, MultiArith 2015) were publicly available before GPT-3's training cutoff; this contamination risk is not acknowledged.", 311 "source": "haiku" 312 } 313 }, 314 "human_studies": { 315 "pre_registered": { 316 "applies": false, 317 "answer": false, 318 "justification": "No human participants involved.", 319 "source": "haiku" 320 }, 321 "irb_or_ethics_approval": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants involved.", 325 "source": "haiku" 326 }, 327 "demographics_reported": { 328 "applies": false, 329 "answer": false, 330 "justification": "No human participants involved.", 331 "source": "haiku" 332 }, 333 "inclusion_exclusion_criteria": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants involved.", 337 "source": "haiku" 338 }, 339 "randomization_described": { 340 "applies": false, 341 "answer": false, 342 "justification": "No human participants involved.", 343 "source": "haiku" 344 }, 345 "blinding_described": { 346 "applies": false, 347 "answer": false, 348 "justification": "No human participants involved.", 349 "source": "haiku" 350 }, 351 "attrition_reported": { 352 "applies": false, 353 "answer": false, 354 "justification": "No human participants involved.", 355 "source": "haiku" 356 } 357 }, 358 "cost_and_practicality": { 359 "inference_cost_reported": { 360 "applies": true, 361 "answer": false, 362 "justification": "API costs are acknowledged implicitly (subsampling to 300/200 examples 'due to costs') but no actual cost figures or call counts are reported.", 363 "source": "haiku" 364 }, 365 "compute_budget_stated": { 366 "applies": true, 367 "answer": false, 368 "justification": "No total compute budget, API call counts, or wall-clock time estimates are provided.", 369 "source": "haiku" 370 } 371 } 372 } 373 }, 374 "claims": [ 375 { 376 "claim": "DECOMP outperforms CoT and Least-to-Most prompting on kth letter concatenation, particularly for longer inputs (N=4,5 words)", 377 "evidence": "Figure 7: DECOMP achieves 96-98% EM across N=3,4,5 vs. CoT 22.7/12.0/6.0% and L2M 74.7/70.5/66.0%", 378 "supported": "strong" 379 }, 380 { 381 "claim": "Recursive DECOMP enables length generalization for list reversal far beyond what CoT achieves", 382 "evidence": "Figure 8: DECOMP achieves 42% EM at N=10 items vs. CoT 4.5%; base CoT 'does not generalize at all to longer sequences'", 383 "supported": "strong" 384 }, 385 { 386 "claim": "DECOMP outperforms CoT on long-context multi-hop QA (CommaQA-E) including compositional generalization", 387 "evidence": "Figure 10: DecomP(fine) 64.2% vs. CoT 55% on IID; 59.7% vs. 33.8% on compositional generalization split", 388 "supported": "strong" 389 }, 390 { 391 "claim": "DECOMP with retrieval (Decomp-Ctxt) outperforms retrieval baselines on open-domain multi-hop QA", 392 "evidence": "Figure 12: Decomp-Ctxt outperforms NoDecomp-Ctxt on MuSiQue and 2WikiMultihopQA; HotpotQA with Codex is 'comparable' rather than better", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "DECOMP-based error correction improves CoT math QA by 14-17 points through a targeted answer-extraction sub-task", 397 "evidence": "Figure 16: GSM8K 36.0→50.7% (+14.7), MultiArith 78.0→95.0% (+17) by adding a GPT-3 answer-extraction sub-module", 398 "supported": "strong" 399 }, 400 { 401 "claim": "Modular structure itself (not just the reasoning procedure) drives DECOMP's improvements over CoT", 402 "evidence": "Figure 7: CoT w/ rollout (same reasoning, monolithic) scores 74.7/70.5/66.0% vs. DECOMP 98/96/97% for N=3,4,5; rolled-out reasoning fails without modularity", 403 "supported": "strong" 404 } 405 ], 406 "methodology_tags": [ 407 "benchmark-eval" 408 ], 409 "key_findings": "Decomposed Prompting (DECOMP) outperforms Chain-of-Thought and Least-to-Most prompting across symbolic reasoning and multi-hop QA tasks by decomposing complex tasks into modular sub-tasks with dedicated few-shot prompts. The central finding is that separate sub-task prompts are more effective than unrolling the same reasoning steps into a single CoT — demonstrating that modularity itself drives improvements, not just the reasoning procedure. DECOMP uniquely enables recursive decomposition for length generalization on list reversal, hierarchical decomposition for sub-tasks too hard for few-shot prompting, and seamless integration of symbolic systems like ElasticSearch for open-domain QA, and achieves 14-17 point gains on math QA through targeted error-correction post-processing.", 410 "red_flags": [ 411 { 412 "flag": "No confidence intervals or significance tests", 413 "detail": "All results are point estimates averaged over 3 prompts with no standard deviations or statistical significance testing, making it impossible to assess whether improvements are reliable." 414 }, 415 { 416 "flag": "Benchmark contamination unaddressed", 417 "detail": "Several benchmarks (HotpotQA 2018, MultiArith 2015) were publicly available before GPT-3's training cutoff; no discussion of potential contamination." 418 }, 419 { 420 "flag": "Subsampled evaluation due to API costs", 421 "detail": "GSM8K subsampled to 300 examples and MultiArith to 200 'due to costs with API usage' without power analysis; may reduce result reliability." 422 }, 423 { 424 "flag": "No limitations section", 425 "detail": "The paper lacks any dedicated discussion of limitations, failure modes beyond error analysis appendix, or conditions under which DECOMP would be expected to underperform." 426 }, 427 { 428 "flag": "Generation hyperparameters unreported", 429 "detail": "Temperature, top-p, and other generation hyperparameters for GPT-3 API calls are not reported, impeding exact reproduction." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models", 435 "relevance": "Primary baseline and motivation; DECOMP is explicitly designed to overcome CoT's limitations on complex multi-step tasks" 436 }, 437 { 438 "title": "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models", 439 "relevance": "Closest related work; directly compared as baseline with rollout variant; DECOMP differs by allowing non-linear decomposition structures" 440 }, 441 { 442 "title": "Language Models are Few-Shot Learners (GPT-3)", 443 "relevance": "Foundation model used throughout experiments; establishes the few-shot in-context learning paradigm DECOMP builds on" 444 }, 445 { 446 "title": "Successive Prompting for Decomposing Complex Questions", 447 "relevance": "Related decomposition approach; DECOMP extends with diverse and recursive decomposition structures beyond sequential question generation" 448 }, 449 { 450 "title": "PAL: Program-aided Language Models", 451 "relevance": "Related work on integrating symbolic computation with LLM reasoning; context for DECOMP's symbolic module integration" 452 }, 453 { 454 "title": "MuSiQue: Multi-hop Questions via Single-hop Question Composition", 455 "relevance": "Key evaluation benchmark for multi-hop open-domain QA" 456 }, 457 { 458 "title": "HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering", 459 "relevance": "Key evaluation benchmark; results show DECOMP comparable but not clearly better than retrieval baseline with Codex" 460 }, 461 { 462 "title": "Text Modular Networks: Learning to Decompose Tasks in the Language of Existing Models", 463 "relevance": "Direct precursor to DECOMP using supervised training for decomposition; DECOMP replaces supervised next-question generator with few-shot LLM" 464 }, 465 { 466 "title": "Training Verifiers to Solve Math Word Problems (GSM8K)", 467 "relevance": "Math QA benchmark demonstrating DECOMP's error-correction improvement of 14 points" 468 }, 469 { 470 "title": "Training Language Models to Follow Instructions with Human Feedback (InstructGPT)", 471 "relevance": "Primary model (text-davinci-002) used in most experiments" 472 } 473 ], 474 "engagement_factors": { 475 "practical_relevance": { 476 "score": 3, 477 "justification": "Code and all prompts released on GitHub; technique directly usable by any developer with GPT-3 API access; demonstrates improvements on real NLP tasks." 478 }, 479 "surprise_contrarian": { 480 "score": 2, 481 "justification": "Counterintuitive finding that modular prompts outperform CoT even when CoT uses identical reasoning steps (rollout); modularity matters independently of the reasoning procedure." 482 }, 483 "fear_safety": { 484 "score": 0, 485 "justification": "No AI safety or risk concerns raised; purely a performance improvement paper on NLP benchmarks." 486 }, 487 "drama_conflict": { 488 "score": 1, 489 "justification": "Mild competitive framing against CoT which was a dominant paradigm at the time; no major controversy." 490 }, 491 "demo_ability": { 492 "score": 3, 493 "justification": "Code on GitHub, all prompts provided in the paper appendix; can be replicated with GPT-3 API access; worked examples in paper are immediately tryable." 494 }, 495 "brand_recognition": { 496 "score": 2, 497 "justification": "Allen Institute for AI (AI2) is a well-known NLP research lab; uses GPT-3 (text-davinci-002) which was the flagship model at publication time." 498 } 499 }, 500 "hn_data": { 501 "threads": [ 502 { 503 "hn_id": "37816614", 504 "title": "Language Agent Tree Search Unifies Reasoning Acting and Planning in LMs", 505 "points": 79, 506 "comments": 11, 507 "url": "https://news.ycombinator.com/item?id=37816614", 508 "created_at": "2023-10-09T03:24:13Z" 509 }, 510 { 511 "hn_id": "25773418", 512 "title": "Adversarial Grammatical Error Correction", 513 "points": 3, 514 "comments": 0, 515 "url": "https://news.ycombinator.com/item?id=25773418", 516 "created_at": "2021-01-14T07:48:57Z" 517 }, 518 { 519 "hn_id": "33182502", 520 "title": "Code Librarian: A Software Package Recommendation System", 521 "points": 2, 522 "comments": 0, 523 "url": "https://news.ycombinator.com/item?id=33182502", 524 "created_at": "2022-10-12T20:19:58Z" 525 }, 526 { 527 "hn_id": "39202830", 528 "title": "Low-Resource Languages Jailbreak GPT-4", 529 "points": 1, 530 "comments": 0, 531 "url": "https://news.ycombinator.com/item?id=39202830", 532 "created_at": "2024-01-31T12:11:05Z" 533 } 534 ], 535 "top_points": 79, 536 "total_points": 85, 537 "total_comments": 11 538 } 539 }