scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33364B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Decomposed Prompting: A Modular Approach for Solving Complex Tasks",
      6     "authors": [
      7       "Tushar Khot",
      8       "Harsh Trivedi",
      9       "Matthew Finlayson",
     10       "Yao Fu",
     11       "Kyle Richardson",
     12       "Peter Clark",
     13       "Ashish Sabharwal"
     14     ],
     15     "year": 2022,
     16     "venue": "International Conference on Learning Representations",
     17     "arxiv_id": "2210.02406",
     18     "doi": "10.48550/arXiv.2210.02406"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The abstract's claims — outperforming prior work on few-shot prompting, hierarchical decomposition for hard sub-tasks, recursive decomposition for length generalization, better QA performance, and incorporating symbolic retrieval — are all supported by experimental results in Sections 4.1-4.5.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Causal claims like 'the separate prompts are more effective at teaching hard sub-tasks than a single CoT prompt' (Section 4.1) are supported by controlled comparisons where only the decomposition strategy varies. The ablation studies (Appendix E) use single-variable manipulation across decomposition schemes.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The conclusion claims DECOMP is 'an effective few-shot paradigm for solving complex tasks' broadly, but experiments cover only specific symbolic manipulation tasks, synthetic QA, and multi-hop QA. The title 'Solving Complex Tasks' is broader than the tested settings.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not discuss alternative explanations for why DECOMP outperforms baselines. For example, it does not consider whether the improvement comes from effectively giving the model more tokens/context, prompt engineering effects, or task-specific factors rather than the decomposition principle itself.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper measures Exact Match and Answer F1 on specific tasks and frames claims in terms of those metrics. There is no proxy gap — claims are about task accuracy on the tested benchmarks.",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper has no dedicated limitations section. The conclusion (Section 5) is a brief summary of contributions with no discussion of limitations.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of how prompt selection, model choice, or task selection might affect the conclusions.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what settings were not tested, or what types of complex tasks DECOMP might not be suited for.",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Acknowledgements section states: 'This work was supported in part by the National Science Foundation under grants IIS2007290.'",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All author affiliations are clearly listed: Allen Institute for AI (AI2), Stony Brook University, University of Edinburgh. The work was done during an internship at AI2 (footnote on first page).",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The National Science Foundation is an independent funding agency with no financial stake in whether Decomposed Prompting outperforms alternative methods.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is included in the paper.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms including 'decomposer,' 'sub-task handlers,' 'prompting program,' and the formal definition of DECOMP are precisely defined in Section 3; the distinction from CoT and Least-to-Most is made explicit.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper clearly states it contributes Decomposed Prompting (DECOMP), a modular few-shot prompting framework, along with empirical evaluation across four task types.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 2 situates DECOMP relative to CoT, Least-to-Most, successive prompting, and the neural modular networks literature, explaining how DECOMP differs from and extends prior approaches.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "The paper states in Section 1 footnote: 'Datasets, Code and Prompts available at https://github.com/allenai/DecomP.' A working repository URL is provided.",
    127           "source": "opus"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The paper states datasets are available at the GitHub repository. The evaluation uses publicly available datasets (HotpotQA, MuSiQue, 2WikiMultihopQA, CommaQA-E, GSM8K, MultiArith) and provides custom evaluation data at the repository.",
    133           "source": "opus"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "The paper specifies model names (text-davinci-002, code-davinci-002, Flan-T5 variants) but does not provide environment specifications such as requirements.txt, Dockerfile, or library versions needed to reproduce the experiments.",
    139           "source": "opus"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": true,
    144           "justification": "All prompts are provided verbatim in Appendix G (spanning ~25 pages). The decomposition structure, operators, and sub-task handlers are described in detail. Combined with the code release, a researcher could reconstruct the experiments.",
    145           "source": "opus"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Results are reported as point estimates (e.g., 98.0%, 42.0%, 25.4%). While results are averaged across three prompts, no confidence intervals or error bars are shown in the figures or tables.",
    153           "source": "opus"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are used. Claims like 'DECOMP outperforms chain-of-thought' are based solely on comparing point estimates without any statistical test.",
    159           "source": "opus"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Effect sizes are reported with baseline context throughout, e.g., '17 pt improvement on MultiArith (78 → 95) and 14 pt improvement on GSM8K (36 → 50.6)' (Section 4.5). Results figures show both baseline and method scores.",
    165           "source": "opus"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "Evaluation uses 100 examples for letter concatenation, 300 for GSM8K, 200 for MultiArith, and 300 for open-domain QA. The only justification given is 'due to costs with API usage' (Appendix B footnotes), not a power analysis or statistical justification.",
    171           "source": "opus"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "While results are averaged across 3 prompt variants, standard deviation is not systematically reported. Footnote 6 mentions 'the std. dev is zero here' for one case, but main results lack spread measures. Per-prompt results are shown in Appendix D but without aggregate variance statistics.",
    177           "source": "opus"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Multiple baselines are included: standard CoT, CoT with rollout, Least-to-Most prompting, No-Context QA, NoDecomp-Context QA. Comparisons are made across all evaluation tasks.",
    185           "source": "opus"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Baselines include Chain-of-Thought (Wei et al., 2022) and Least-to-Most (Zhou et al., 2023), which are contemporary state-of-the-art prompting methods at the time of publication.",
    191           "source": "opus"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Appendix E compares alternative decomposition schemes (e.g., loop vs generate for letter concatenation, mid-split vs tail-split for reversal). Section 4.3 compares coarse vs fine-grained decomposition granularity. These function as ablation studies.",
    197           "source": "opus"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "Within each evaluation task, only a single metric is used: Exact Match for symbolic tasks and CommaQA, Answer F1 for open-domain QA. No task is evaluated with multiple metrics simultaneously.",
    203           "source": "opus"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "All evaluation is fully automated (Exact Match, Answer F1). No human evaluation of system outputs is performed, despite the paper making claims about the quality of decompositions and sub-task handling.",
    209           "source": "opus"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "For open-domain QA, hyperparameters are tuned on 'a held out set of 100 questions for each dataset' and evaluated on '300 held-out dev questions' (Appendix A.2). Letter concatenation test examples use a separate word list from training.",
    215           "source": "opus"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down by input length (N=3,4,5 words for letter concatenation; N=4,6,8,10 for reversal), by dataset (3 open-domain QA datasets), by model size (Flan-T5-Large/XL/XXL, Codex), and by decomposition granularity (coarse/fine).",
    221           "source": "opus"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Appendix F provides detailed error analysis for letter concatenation (Section F.1) and CommaQA (Section F.2), showing specific examples of sub-task errors, incorrect letter extraction, and incorrect QA answers.",
    227           "source": "opus"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": true,
    232           "justification": "Section 4.2 reports that 'CoT version of our decomposition strategy fails because the unrolled prompt becomes too long and convoluted.' Section 4.4 notes Decomp-Ctxt does not outperform NoDecomp-Ctxt on HotpotQA with Codex. Appendix C shows performance drops with smaller models.",
    233           "source": "opus"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "Specific model identifiers are provided: 'text-davinci-002 InstructGPT3 model' (Section 4), 'code-davinci-002' (Section 4.4), 'davinci-001' (Section 4.2), 'text-curie-001' (Appendix C), 'Flan-T5-Large (0.7B), Flan-T5-XL (3B), and Flan-T5-XXL (11B)' (Section 4.4).",
    241           "source": "opus"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Full prompt text is provided in Appendix G spanning approximately 25 pages, covering all decomposer prompts, sub-task handler prompts, CoT baselines, and Least-to-Most prompts for all tasks.",
    247           "source": "opus"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "The paper states 'greedy search' is used for inference (Section 3.2), effectively specifying temperature=0. Retrieval hyperparameter K is tuned via grid search with explicit ranges stated (Appendix A.2). Number of in-context examples is specified per prompt.",
    253           "source": "opus"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "The decomposer-handler framework is described in detail in Section 3, including the controller that routes sub-queries to handlers, the foreach and foreach_merge operators (Section 3.1), the inference procedure with EOQ markers (Section 3.2), and hierarchical/recursive capabilities (Section 3.3).",
    259           "source": "opus"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Evaluation data construction is documented: letter concatenation words from name lists (footnote 4), CommaQA-E dataset generation with context limit constraints (Section 4.3), open-domain QA corpus construction by combining paragraphs (Appendix A.1 with exact corpus sizes: 430,225 and 139,416 paragraphs).",
    265           "source": "opus"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Datasets and code are released at https://github.com/allenai/DecomP (footnote 1). Evaluation datasets, prompts, and model outputs can be inspected.",
    273           "source": "opus"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Data sources are described: letter concatenation words from forebears.io name lists (footnote 4), CommaQA-E from Khot et al. (2022) with size reduction for context limits, open-domain QA from HotpotQA/MuSiQue/2WikiMultihopQA with corpus construction details (Appendix A.1).",
    279           "source": "opus"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants. Data sources are standard benchmarks and synthetically generated evaluation sets.",
    285           "source": "opus"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The pipeline from raw datasets to evaluation is documented: corpus construction with paragraph counts (Appendix A.1), hyperparameter tuning split (100 questions) vs evaluation split (300 questions), and random sampling from test sets with stated sample sizes.",
    291           "source": "opus"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "The paper uses GPT-3 (text-davinci-002, code-davinci-002) and Flan-T5 models without stating their training data cutoff dates. This is necessary to assess whether evaluation benchmarks appeared in training data.",
    299           "source": "opus"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of potential train/test overlap. The paper uses public benchmarks (HotpotQA, MuSiQue, GSM8K, MultiArith) with models that may have been trained on them.",
    305           "source": "opus"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "No benchmark contamination analysis is performed despite using GPT-3 models on public benchmarks. HotpotQA (2018), GSM8K (2021), and MultiArith (2015) were all published before the GPT-3 training data was collected.",
    311           "source": "opus"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study. All evaluation is automated on benchmark datasets.",
    319           "source": "opus"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants. The study evaluates LLM performance on automated benchmarks.",
    325           "source": "opus"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "opus"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "opus"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "No inference cost or API cost is reported. The paper mentions subsampling 'due to costs with API usage' (footnotes in Appendix B) but never quantifies the actual costs. DECOMP makes multiple API calls per example but the cost is not compared to baselines.",
    363           "source": "opus"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No total computational budget is stated. The paper does not report total API spend, number of tokens consumed, or computation time.",
    369           "source": "opus"
    370         }
    371       },
    372       "experimental_rigor": {
    373         "seed_sensitivity_reported": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "No seed sensitivity analysis is reported. While results are averaged across 3 prompt variants, there is no analysis of sensitivity to random seeds in the models themselves.",
    377           "source": "opus"
    378         },
    379         "number_of_runs_stated": {
    380           "applies": true,
    381           "answer": true,
    382           "justification": "The number of prompt variants is stated: 'We create three different prompts for all our baselines and present the average' (Section 4.1). Per-prompt results are shown in Appendix D.",
    383           "source": "opus"
    384         },
    385         "hyperparameter_search_budget": {
    386           "applies": true,
    387           "answer": true,
    388           "justification": "Appendix A.2 specifies the search ranges: 'For NoDecomp-Ctxt, we search K ∈ {6, 8, 10} for GPT3 models and K ∈ {2, 4, 6, 8} for Flan-T5-* models. For Decomp-Ctxt, we search K ∈ {2, 4, 6}.' Selection is on a held-out set of 100 questions.",
    389           "source": "opus"
    390         },
    391         "best_config_selection_justified": {
    392           "applies": true,
    393           "answer": true,
    394           "justification": "Appendix A.2 describes configuration selection: 'We select it based on a grid search on a set of values to maximize performance on a held out set of 100 questions for each dataset.' Selection is done on a separate set from the evaluation set.",
    395           "source": "opus"
    396         },
    397         "multiple_comparison_correction": {
    398           "applies": false,
    399           "answer": false,
    400           "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable.",
    401           "source": "opus"
    402         },
    403         "self_comparison_bias_addressed": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "The authors implement both their own system and the baselines (CoT, Least-to-Most). No discussion of potential bias from authors implementing baselines, despite Lucic et al. (2018) showing systematic underperformance of author-implemented baselines.",
    407           "source": "opus"
    408         },
    409         "compute_budget_vs_performance": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "DECOMP makes multiple API calls per example (decomposer + multiple sub-task handlers), substantially more than CoT (single call). Appendix E notes one decomposition uses O(n) vs O(log n) calls, but no systematic compute-performance comparison across methods is provided.",
    413           "source": "opus"
    414         },
    415         "benchmark_construct_validity": {
    416           "applies": true,
    417           "answer": false,
    418           "justification": "The paper does not discuss whether the benchmarks (letter concatenation, list reversal, CommaQA) actually measure the claimed capabilities of 'solving complex tasks.' The synthetic tasks may not represent real-world complexity.",
    419           "source": "opus"
    420         },
    421         "scaffold_confound_addressed": {
    422           "applies": true,
    423           "answer": true,
    424           "justification": "When comparing models (Flan-T5 variants, Codex), the same DECOMP scaffold is used. The main comparisons (DECOMP vs CoT vs L2M) are explicitly comparing different decomposition strategies, which is the intended variable.",
    425           "source": "opus"
    426         }
    427       },
    428       "data_leakage": {
    429         "temporal_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "No discussion of temporal leakage. Several benchmarks (HotpotQA 2018, MultiArith 2015, GSM8K 2021) were published before GPT-3's training data collection, meaning solutions could be in the training data.",
    433           "source": "opus"
    434         },
    435         "feature_leakage_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of feature leakage. The few-shot prompts contain examples with answers that could prime the model, but this is not analyzed as a potential confound.",
    439           "source": "opus"
    440         },
    441         "non_independence_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No discussion of whether evaluation examples are independent of the training data. The letter concatenation task uses common first and last names that likely appear frequently in GPT-3's training data.",
    445           "source": "opus"
    446         },
    447         "leakage_detection_method": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference tests, or decontamination procedures are mentioned.",
    451           "source": "opus"
    452         }
    453       }
    454     }
    455   },
    456   "claims": [
    457     {
    458       "claim": "DECOMP outperforms CoT and Least-to-Most on symbolic reasoning tasks even when those baselines use the same reasoning procedure (rolled-out CoT)",
    459       "evidence": "Figure 7 shows DECOMP achieves 98/96/97% EM vs. CoT w/ rollout at 74.7/70.5/66.0% for N=3,4,5 on kth letter concatenation; results replicated across 3 prompts",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "DECOMP achieves near-perfect length generalization on kth letter concatenation while CoT-based approaches degrade with longer inputs",
    464       "evidence": "Figure 7: DECOMP maintains ~97% EM at N=5 while CoT drops to 6% and CoT w/ rollout drops to 53.3%; std. dev. is zero for DECOMP",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "Separate sub-task prompts are more effective than a single monolithic CoT prompt for teaching hard sub-tasks",
    469       "evidence": "DECOMP vs. CoT w/ rollout comparison in Fig. 7 holds fixed the reasoning procedure; DECOMP's advantage attributed to isolated sub-task optimization",
    470       "supported": "moderate"
    471     },
    472     {
    473       "claim": "DECOMP can integrate symbolic retrieval (ElasticSearch) to outperform strong retrieval baselines on open-domain multi-hop QA",
    474       "evidence": "Figure 12 shows Decomp-Ctxt outperforms NoDecomp-Ctxt on all three ODQA datasets with Flan-T5-XXL and on MuSiQue/2WikiMultihopQA with Codex",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "Post-processing CoT outputs with a GPT-3 answer extractor via DECOMP yields 14-17 point improvements on math QA",
    479       "evidence": "Figure 16: MultiArith 78→95% (+17 pts), GSM8K 36→50.6% (+14.6 pts); limited to comparison with CoT only, no other post-processing baselines",
    480       "supported": "moderate"
    481     },
    482     {
    483       "claim": "DECOMP generalizes to unseen compositional splits without performance degradation",
    484       "evidence": "Figure 10: DECOMP shows small gains on the compositional generalization split of CommaQA while CoT drops; attributed to independently-trained QA sub-tasks",
    485       "supported": "moderate"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval"
    490   ],
    491   "key_findings": "Decomposed Prompting (DECOMP) outperforms Chain-of-Thought and Least-to-Most prompting across symbolic manipulation and multi-hop QA tasks by decomposing complex tasks into modular sub-tasks with dedicated prompts or symbolic functions. The modular design enables recursive decomposition for length generalization, integration of external retrieval APIs, and isolated optimization of sub-task handlers. A key ablation shows that DECOMP's advantage over CoT w/ rollout—which uses identical reasoning steps in a single prompt—is due specifically to the modular structure, not just the reasoning procedure. The approach is flexible enough to subsume post-processing as a sub-task, yielding 14-17 point improvements on math QA by fixing CoT answer extraction errors.",
    492   "red_flags": [
    493     {
    494       "flag": "No statistical significance tests",
    495       "detail": "All comparative claims are made without significance testing; performance differences on small test sets (100-300 examples) are reported as fact without confidence measures."
    496     },
    497     {
    498       "flag": "No limitations section",
    499       "detail": "The paper has no dedicated limitations or threats-to-validity section; the significant manual effort required to design decomposition prompts per task is presented as a feature rather than acknowledged as a limitation."
    500     },
    501     {
    502       "flag": "Benchmark contamination unaddressed",
    503       "detail": "GPT-3 models are evaluated on HotpotQA, GSM8K, and MultiArith benchmarks that were publicly available before training cutoffs; no contamination analysis is performed."
    504     },
    505     {
    506       "flag": "Model version snapshots absent",
    507       "detail": "Only marketing names like 'text-davinci-002' are given without snapshot dates; results may not be reproducible as OpenAI updates models over time."
    508     },
    509     {
    510       "flag": "Alternative explanation not addressed",
    511       "detail": "DECOMP provides more in-context examples per sub-task handler than CoT provides for the whole task; this confound is acknowledged as an advantage but not controlled for experimentally."
    512     }
    513   ],
    514   "cited_papers": [
    515     {
    516       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    517       "relevance": "Primary baseline; DECOMP is directly compared against CoT throughout all experiments"
    518     },
    519     {
    520       "title": "Least-to-Most Prompting Enables Complex Reasoning in Large Language Models",
    521       "relevance": "Key competing method; compared on letter concatenation and discussed as closest prior work"
    522     },
    523     {
    524       "title": "Language Models are Few-Shot Learners (GPT-3)",
    525       "relevance": "Foundation model used for all experiments; establishes few-shot prompting paradigm"
    526     },
    527     {
    528       "title": "Training language models to follow instructions with human feedback (InstructGPT)",
    529       "relevance": "text-davinci-002, the primary model used in experiments, is from this work"
    530     },
    531     {
    532       "title": "Successive Prompting for Decomposing Complex Questions",
    533       "relevance": "Most closely related prior work; DECOMP extends successive prompting with diverse decomposition structures"
    534     },
    535     {
    536       "title": "MuSiQue: Multi-hop Questions via Single-hop Question Composition",
    537       "relevance": "One of three ODQA benchmarks evaluated; tests multi-hop reasoning"
    538     },
    539     {
    540       "title": "HotpotQA: A Dataset for Diverse, Explainable Multi-hop Question Answering",
    541       "relevance": "One of three ODQA benchmarks; standard multi-hop QA evaluation"
    542     },
    543     {
    544       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    545       "relevance": "Math QA benchmark where DECOMP achieves a 14.6-point improvement over CoT"
    546     },
    547     {
    548       "title": "Text Modular Networks: Learning to Decompose Tasks in the Language of Existing Models",
    549       "relevance": "Direct predecessor that DECOMP takes inspiration from; contrasted with DECOMP's few-shot approach"
    550     },
    551     {
    552       "title": "PAL: Program-aided Language Models",
    553       "relevance": "Related work combining LLMs with symbolic execution for reasoning tasks"
    554     }
    555   ],
    556   "engagement_factors": {
    557     "practical_relevance": {
    558       "score": 2,
    559       "justification": "Decomposed prompting is a directly applicable technique for building LLM pipelines with modular sub-task handlers, relevant to prompt engineers and AI application developers."
    560     },
    561     "surprise_contrarian": {
    562       "score": 1,
    563       "justification": "The finding that separate sub-task prompts outperform a single CoT using the same reasoning procedure is mildly surprising, but modular decomposition beating monolithic approaches is not counterintuitive."
    564     },
    565     "fear_safety": {
    566       "score": 0,
    567       "justification": "No safety, security, or risk concerns are raised or relevant to this work."
    568     },
    569     "drama_conflict": {
    570       "score": 0,
    571       "justification": "No controversy or conflict; the paper positions itself as a natural extension of CoT and least-to-most prompting rather than challenging them."
    572     },
    573     "demo_ability": {
    574       "score": 2,
    575       "justification": "Code and prompts are released on GitHub (allenai/DecomP), allowing reproduction with moderate effort though it requires GPT-3 API access."
    576     },
    577     "brand_recognition": {
    578       "score": 1,
    579       "justification": "Allen Institute for AI (AI2) is well-respected in NLP research but not a household name in the broader tech community."
    580     }
    581   },
    582   "hn_data": {
    583     "threads": [
    584       {
    585         "hn_id": "37816614",
    586         "title": "Language Agent Tree Search Unifies Reasoning Acting and Planning in LMs",
    587         "points": 79,
    588         "comments": 11,
    589         "url": "https://news.ycombinator.com/item?id=37816614",
    590         "created_at": "2023-10-09T03:24:13Z"
    591       },
    592       {
    593         "hn_id": "25773418",
    594         "title": "Adversarial Grammatical Error Correction",
    595         "points": 3,
    596         "comments": 0,
    597         "url": "https://news.ycombinator.com/item?id=25773418",
    598         "created_at": "2021-01-14T07:48:57Z"
    599       },
    600       {
    601         "hn_id": "33182502",
    602         "title": "Code Librarian: A Software Package Recommendation System",
    603         "points": 2,
    604         "comments": 0,
    605         "url": "https://news.ycombinator.com/item?id=33182502",
    606         "created_at": "2022-10-12T20:19:58Z"
    607       },
    608       {
    609         "hn_id": "39202830",
    610         "title": "Low-Resource Languages Jailbreak GPT-4",
    611         "points": 1,
    612         "comments": 0,
    613         "url": "https://news.ycombinator.com/item?id=39202830",
    614         "created_at": "2024-01-31T12:11:05Z"
    615       }
    616     ],
    617     "top_points": 79,
    618     "total_points": 85,
    619     "total_comments": 11
    620   }
    621 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs