scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31542B)
      1 {
      2   "paper": {
      3     "title": "Decomposed Prompting: A Modular Approach for Solving Complex Tasks",
      4     "authors": [
      5       "Tushar Khot",
      6       "Harsh Trivedi",
      7       "Matthew Finlayson",
      8       "Yao Fu",
      9       "Kyle Richardson",
     10       "Peter Clark",
     11       "Ashish Sabharwal"
     12     ],
     13     "year": 2022,
     14     "venue": "International Conference on Learning Representations (ICLR 2023)",
     15     "arxiv_id": "2210.02406",
     16     "doi": "10.48550/arXiv.2210.02406"
     17   },
     18   "scan_version": 3,
     19   "active_modules": [
     20     "experimental_rigor",
     21     "data_leakage"
     22   ],
     23   "methodology_tags": [
     24     "benchmark-eval"
     25   ],
     26   "key_findings": "Decomposed Prompting (DECOMP) outperforms Chain-of-Thought and Least-to-Most prompting across symbolic reasoning, long-context QA, and open-domain multi-hop QA by decomposing complex tasks into modular sub-task handlers with independent few-shot prompts. The approach enables hierarchical decomposition (for hard sub-tasks like kth letter extraction), recursive decomposition (achieving near-perfect length generalization on list reversal), and seamless integration of external tools like Elasticsearch retrieval. A simple post-processing decomposition for answer extraction in math QA yields 14-17 point improvements over standard CoT.",
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper states in Section 1 footnote: 'Datasets, Code and Prompts available at https://github.com/allenai/DecomP.' A working repository URL is provided."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The paper states datasets are available at the GitHub repository. The evaluation uses publicly available datasets (HotpotQA, MuSiQue, 2WikiMultihopQA, CommaQA-E, GSM8K, MultiArith) and provides custom evaluation data at the repository."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper specifies model names (text-davinci-002, code-davinci-002, Flan-T5 variants) but does not provide environment specifications such as requirements.txt, Dockerfile, or library versions needed to reproduce the experiments."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "All prompts are provided verbatim in Appendix G (spanning ~25 pages). The decomposition structure, operators, and sub-task handlers are described in detail. Combined with the code release, a researcher could reconstruct the experiments."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Results are reported as point estimates (e.g., 98.0%, 42.0%, 25.4%). While results are averaged across three prompts, no confidence intervals or error bars are shown in the figures or tables."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No statistical significance tests are used. Claims like 'DECOMP outperforms chain-of-thought' are based solely on comparing point estimates without any statistical test."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Effect sizes are reported with baseline context throughout, e.g., '17 pt improvement on MultiArith (78 → 95) and 14 pt improvement on GSM8K (36 → 50.6)' (Section 4.5). Results figures show both baseline and method scores."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Evaluation uses 100 examples for letter concatenation, 300 for GSM8K, 200 for MultiArith, and 300 for open-domain QA. The only justification given is 'due to costs with API usage' (Appendix B footnotes), not a power analysis or statistical justification."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "While results are averaged across 3 prompt variants, standard deviation is not systematically reported. Footnote 6 mentions 'the std. dev is zero here' for one case, but main results lack spread measures. Per-prompt results are shown in Appendix D but without aggregate variance statistics."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Multiple baselines are included: standard CoT, CoT with rollout, Least-to-Most prompting, No-Context QA, NoDecomp-Context QA. Comparisons are made across all evaluation tasks."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Baselines include Chain-of-Thought (Wei et al., 2022) and Least-to-Most (Zhou et al., 2023), which are contemporary state-of-the-art prompting methods at the time of publication."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Appendix E compares alternative decomposition schemes (e.g., loop vs generate for letter concatenation, mid-split vs tail-split for reversal). Section 4.3 compares coarse vs fine-grained decomposition granularity. These function as ablation studies."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Within each evaluation task, only a single metric is used: Exact Match for symbolic tasks and CommaQA, Answer F1 for open-domain QA. No task is evaluated with multiple metrics simultaneously."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "All evaluation is fully automated (Exact Match, Answer F1). No human evaluation of system outputs is performed, despite the paper making claims about the quality of decompositions and sub-task handling."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "For open-domain QA, hyperparameters are tuned on 'a held out set of 100 questions for each dataset' and evaluated on '300 held-out dev questions' (Appendix A.2). Letter concatenation test examples use a separate word list from training."
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Results are broken down by input length (N=3,4,5 words for letter concatenation; N=4,6,8,10 for reversal), by dataset (3 open-domain QA datasets), by model size (Flan-T5-Large/XL/XXL, Codex), and by decomposition granularity (coarse/fine)."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Appendix F provides detailed error analysis for letter concatenation (Section F.1) and CommaQA (Section F.2), showing specific examples of sub-task errors, incorrect letter extraction, and incorrect QA answers."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 4.2 reports that 'CoT version of our decomposition strategy fails because the unrolled prompt becomes too long and convoluted.' Section 4.4 notes Decomp-Ctxt does not outperform NoDecomp-Ctxt on HotpotQA with Codex. Appendix C shows performance drops with smaller models."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "The abstract's claims — outperforming prior work on few-shot prompting, hierarchical decomposition for hard sub-tasks, recursive decomposition for length generalization, better QA performance, and incorporating symbolic retrieval — are all supported by experimental results in Sections 4.1-4.5."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Causal claims like 'the separate prompts are more effective at teaching hard sub-tasks than a single CoT prompt' (Section 4.1) are supported by controlled comparisons where only the decomposition strategy varies. The ablation studies (Appendix E) use single-variable manipulation across decomposition schemes."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The conclusion claims DECOMP is 'an effective few-shot paradigm for solving complex tasks' broadly, but experiments cover only specific symbolic manipulation tasks, synthetic QA, and multi-hop QA. The title 'Solving Complex Tasks' is broader than the tested settings."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper does not discuss alternative explanations for why DECOMP outperforms baselines. For example, it does not consider whether the improvement comes from effectively giving the model more tokens/context, prompt engineering effects, or task-specific factors rather than the decomposition principle itself."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The paper measures Exact Match and Answer F1 on specific tasks and frames claims in terms of those metrics. There is no proxy gap — claims are about task accuracy on the tested benchmarks."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Specific model identifiers are provided: 'text-davinci-002 InstructGPT3 model' (Section 4), 'code-davinci-002' (Section 4.4), 'davinci-001' (Section 4.2), 'text-curie-001' (Appendix C), 'Flan-T5-Large (0.7B), Flan-T5-XL (3B), and Flan-T5-XXL (11B)' (Section 4.4)."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Full prompt text is provided in Appendix G spanning approximately 25 pages, covering all decomposer prompts, sub-task handler prompts, CoT baselines, and Least-to-Most prompts for all tasks."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The paper states 'greedy search' is used for inference (Section 3.2), effectively specifying temperature=0. Retrieval hyperparameter K is tuned via grid search with explicit ranges stated (Appendix A.2). Number of in-context examples is specified per prompt."
    166       },
    167       "scaffolding_described": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The decomposer-handler framework is described in detail in Section 3, including the controller that routes sub-queries to handlers, the foreach and foreach_merge operators (Section 3.1), the inference procedure with EOQ markers (Section 3.2), and hierarchical/recursive capabilities (Section 3.3)."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Evaluation data construction is documented: letter concatenation words from name lists (footnote 4), CommaQA-E dataset generation with context limit constraints (Section 4.3), open-domain QA corpus construction by combining paragraphs (Appendix A.1 with exact corpus sizes: 430,225 and 139,416 paragraphs)."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper has no dedicated limitations section. The conclusion (Section 5) is a brief summary of contributions with no discussion of limitations."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of how prompt selection, model choice, or task selection might affect the conclusions."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show, what settings were not tested, or what types of complex tasks DECOMP might not be suited for."
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Datasets and code are released at https://github.com/allenai/DecomP (footnote 1). Evaluation datasets, prompts, and model outputs can be inspected."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Data sources are described: letter concatenation words from forebears.io name lists (footnote 4), CommaQA-E from Khot et al. (2022) with size reduction for context limits, open-domain QA from HotpotQA/MuSiQue/2WikiMultihopQA with corpus construction details (Appendix A.1)."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No human participants. Data sources are standard benchmarks and synthetically generated evaluation sets."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The pipeline from raw datasets to evaluation is documented: corpus construction with paragraph counts (Appendix A.1), hyperparameter tuning split (100 questions) vs evaluation split (300 questions), and random sampling from test sets with stated sample sizes."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Acknowledgements section states: 'This work was supported in part by the National Science Foundation under grants IIS2007290.'"
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "All author affiliations are clearly listed: Allen Institute for AI (AI2), Stony Brook University, University of Edinburgh. The work was done during an internship at AI2 (footnote on first page)."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The National Science Foundation is an independent funding agency with no financial stake in whether Decomposed Prompting outperforms alternative methods."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is included in the paper."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "The paper uses GPT-3 (text-davinci-002, code-davinci-002) and Flan-T5 models without stating their training data cutoff dates. Cutoff dates are needed to assess whether evaluation benchmarks appeared in training data."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of potential train/test overlap. The paper uses public benchmarks (HotpotQA, MuSiQue, GSM8K, MultiArith) with models that may have been trained on them."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "No benchmark contamination analysis is performed despite using GPT-3 models on public benchmarks. HotpotQA (2018), GSM8K (2021), and MultiArith (2015) were all published before the GPT-3 training data was collected."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study. All evaluation is automated on benchmark datasets."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants. The study evaluates LLM performance on automated benchmarks."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants in this study."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "No inference cost or API cost is reported. The paper mentions subsampling 'due to costs with API usage' (footnotes in Appendix B) but never quantifies the actual costs. DECOMP makes multiple API calls per example but the cost is not compared to baselines."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No total computational budget is stated. The paper does not report total API spend, number of tokens consumed, or computation time."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "No seed sensitivity analysis is reported. While results are averaged across 3 prompt variants, there is no analysis of sensitivity to random seeds in the models themselves."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "The number of prompt variants is stated: 'We create three different prompts for all our baselines and present the average' (Section 4.1). Per-prompt results are shown in Appendix D."
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": true,
    319         "justification": "Appendix A.2 specifies the search ranges: 'For NoDecomp-Ctxt, we search K ∈ {6, 8, 10} for GPT3 models and K ∈ {2, 4, 6, 8} for Flan-T5-* models. For Decomp-Ctxt, we search K ∈ {2, 4, 6}.' Selection is on a held-out set of 100 questions."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": true,
    324         "justification": "Appendix A.2 describes configuration selection: 'We select it based on a grid search on a set of values to maximize performance on a held out set of 100 questions for each dataset.' Selection is done on a separate set from the evaluation set."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": false,
    328         "answer": false,
    329         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors implement both their own system and the baselines (CoT, Least-to-Most). No discussion of potential bias from authors implementing baselines, despite Lucic et al. (2018) showing systematic underperformance of author-implemented baselines."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "DECOMP makes multiple API calls per example (decomposer + multiple sub-task handlers), substantially more than CoT (single call). Appendix E notes one decomposition uses O(n) vs O(log n) calls, but no systematic compute-performance comparison across methods is provided."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "The paper does not discuss whether the benchmarks (letter concatenation, list reversal, CommaQA) actually measure the claimed capabilities of 'solving complex tasks.' The synthetic tasks may not represent real-world complexity."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "When comparing models (Flan-T5 variants, Codex), the same DECOMP scaffold is used. The main comparisons (DECOMP vs CoT vs L2M) are explicitly comparing different decomposition strategies, which is the intended variable."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of temporal leakage. Several benchmarks (HotpotQA 2018, MultiArith 2015, GSM8K 2021) were published before GPT-3's training data collection, meaning solutions could be in the training data."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of feature leakage. The few-shot prompts contain examples with answers that could prime the model, but this is not analyzed as a potential confound."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of whether evaluation examples are independent of the training data. The letter concatenation task uses common first and last names that likely appear frequently in GPT-3's training data."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference tests, or decontamination procedures are mentioned."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "DECOMP outperforms CoT and Least-to-Most prompting on kth letter concatenation, achieving near-100% accuracy where CoT achieves 6-23%",
    378       "evidence": "Figure 7 shows DECOMP at 97-98% vs CoT at 6-22.7% across N=3,4,5 words with k=3. Results averaged across 3 prompts. Per-prompt results in Fig 18 confirm consistency.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "Recursive DECOMP enables length generalization for list reversal, far exceeding CoT on longer sequences",
    383       "evidence": "Figure 8 shows DECOMP at 42% on N=10 vs CoT at 4.5% and CoT w/ rollout at 1%. Uses davinci-001 to demonstrate gains on a weaker model.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "DECOMP outperforms CoT on CommaQA-E for both IID and compositional generalization settings",
    388       "evidence": "Figure 10 shows DECOMP(fine) at 59.7/64.2% vs CoT at 42.0/33.8% on IID/Comp.Gen. splits. Per-prompt results in Fig 19 confirm consistency across prompt choices.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "DECOMP with retrieval significantly outperforms no-retrieval and simple-retrieval baselines on open-domain multi-hop QA",
    393       "evidence": "Figure 12 shows Decomp-Ctxt outperforms No-Ctxt and NoDecomp-Ctxt on MuSiQue (25.4 vs 18.3/21.4) and 2WikiMultihopQA (64.1 vs 38.1/47.1) with Codex. Exception: comparable to NoDecomp-Ctxt on HotpotQA.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Post-processing CoT with a GPT-3 answer extractor yields 14-17 point improvements on math QA",
    398       "evidence": "Figure 16 shows improvement from 78→95 on MultiArith and 36→50.6 on GSM8K. However, evaluation uses random subsamples (200 and 300 examples respectively) of the test sets.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "The separate sub-task prompts are more effective at teaching hard sub-tasks than a single CoT prompt using the same reasoning procedure",
    403       "evidence": "Section 4.1 compares DECOMP to 'CoT w/ rollout' which describes the same decomposition steps in a single prompt. DECOMP (98%) outperforms CoT w/ rollout (70.5%) on N=4, k=3. The reasoning steps are identical but factored into separate prompts.",
    404       "supported": "strong"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "No error bars or uncertainty quantification",
    410       "detail": "Results are reported as point estimates averaged across 3 prompts. No confidence intervals, error bars, or significance tests are used despite making comparative claims. The reader cannot assess whether differences are statistically meaningful."
    411     },
    412     {
    413       "flag": "No contamination analysis",
    414       "detail": "GPT-3 models are evaluated on public benchmarks (HotpotQA, GSM8K, MultiArith) that predate the models' training data. No analysis of whether the models have seen evaluation data during training, which could inflate accuracy for all methods."
    415     },
    416     {
    417       "flag": "No limitations discussion",
    418       "detail": "The paper lacks any limitations section or discussion of threats to validity. Important limitations such as API cost overhead, sensitivity to prompt design, and generalizability to other task types are not addressed."
    419     },
    420     {
    421       "flag": "Compute cost not compared",
    422       "detail": "DECOMP makes multiple API calls per example (decomposer + multiple sub-task handlers), potentially much more expensive than CoT's single call. This cost is never quantified or compared, despite the paper acknowledging API costs forced them to subsample evaluation sets."
    423     },
    424     {
    425       "flag": "Small evaluation sample sizes without statistical justification",
    426       "detail": "Evaluation uses 100-300 examples per dataset, subsampled due to API costs. No power analysis or other statistical justification is given for whether these sample sizes are sufficient to draw reliable conclusions."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Chain of thought prompting elicits reasoning in large language models",
    432       "authors": [
    433         "Jason Wei",
    434         "Xuezhi Wang",
    435         "Dale Schuurmans",
    436         "Maarten Bosma",
    437         "Ed Chi",
    438         "Quoc Le",
    439         "Denny Zhou"
    440       ],
    441       "year": 2022,
    442       "relevance": "Foundational chain-of-thought prompting method that DECOMP extends; key baseline for all experiments."
    443     },
    444     {
    445       "title": "Language models are few-shot learners",
    446       "authors": [
    447         "Tom Brown",
    448         "Benjamin Mann",
    449         "Nick Ryder"
    450       ],
    451       "year": 2020,
    452       "relevance": "GPT-3 paper establishing few-shot prompting capabilities that underpin this work."
    453     },
    454     {
    455       "title": "Least-to-most prompting enables complex reasoning in large language models",
    456       "authors": [
    457         "Denny Zhou",
    458         "Nathanael Scharli",
    459         "Le Hou",
    460         "Jason Wei"
    461       ],
    462       "year": 2023,
    463       "relevance": "Closest prior work to DECOMP; generates sub-questions from easiest to hardest. Direct baseline comparison."
    464     },
    465     {
    466       "title": "Successive prompting for decomposing complex questions",
    467       "authors": [
    468         "Dheeru Dua",
    469         "Shivanshu Gupta",
    470         "Sameer Singh",
    471         "Matt Gardner"
    472       ],
    473       "year": 2022,
    474       "relevance": "Similar decomposition approach using sequential prompting for question answering; key comparison point."
    475     },
    476     {
    477       "title": "PAL: Program-aided language models",
    478       "authors": [
    479         "Luyu Gao",
    480         "Aman Madaan",
    481         "Shuyan Zhou"
    482       ],
    483       "year": 2022,
    484       "arxiv_id": "2211.10435",
    485       "relevance": "Combines LLMs with symbolic computation for reasoning tasks; related modular approach to LLM problem-solving."
    486     },
    487     {
    488       "title": "Toolformer: Language models can teach themselves to use tools",
    489       "authors": [
    490         "Timo Schick",
    491         "Jane Dwivedi-Yu",
    492         "Roberto Dessi"
    493       ],
    494       "year": 2023,
    495       "arxiv_id": "2302.04761",
    496       "relevance": "Tool-augmented LLMs that learn to call external APIs; related approach to integrating external functions."
    497     },
    498     {
    499       "title": "Self-consistency improves chain of thought reasoning in language models",
    500       "authors": [
    501         "Xuezhi Wang",
    502         "Jason Wei",
    503         "Dale Schuurmans",
    504         "Quoc Le",
    505         "Ed Chi",
    506         "Denny Zhou"
    507       ],
    508       "year": 2023,
    509       "relevance": "Alternative method for improving CoT reasoning via sampling consistency; complementary to decomposition approach."
    510     },
    511     {
    512       "title": "Training verifiers to solve math word problems",
    513       "authors": [
    514         "Karl Cobbe",
    515         "Vineet Kosaraju",
    516         "Mohammad Bavarian"
    517       ],
    518       "year": 2021,
    519       "arxiv_id": "2110.14168",
    520       "relevance": "GSM8K benchmark used for evaluation; training verifiers for mathematical reasoning."
    521     },
    522     {
    523       "title": "PaLM: Scaling language modeling with pathways",
    524       "authors": [
    525         "Aakanksha Chowdhery",
    526         "Sharan Narang",
    527         "Jacob Devlin"
    528       ],
    529       "year": 2022,
    530       "arxiv_id": "2204.02311",
    531       "relevance": "Large-scale LLM demonstrating chain-of-thought capabilities; related to the scaling of prompting approaches."
    532     },
    533     {
    534       "title": "Language model cascades",
    535       "authors": [
    536         "David Dohan",
    537         "Winnie Xu",
    538         "Aitor Lewkowycz"
    539       ],
    540       "year": 2022,
    541       "arxiv_id": "2207.10342",
    542       "relevance": "Theoretical framework for composing LLMs as probabilistic programs; formalizes the cascade structure DECOMP uses."
    543     },
    544     {
    545       "title": "Training language models to follow instructions with human feedback",
    546       "authors": [
    547         "Long Ouyang",
    548         "Jeff Wu",
    549         "Xu Jiang"
    550       ],
    551       "year": 2022,
    552       "relevance": "InstructGPT paper for the text-davinci-002 model used as the primary LLM in DECOMP experiments."
    553     },
    554     {
    555       "title": "Measuring and narrowing the compositionality gap in language models",
    556       "authors": [
    557         "Ofir Press",
    558         "Muru Zhang",
    559         "Sewon Min",
    560         "Ludwig Schmidt",
    561         "Noah A Smith",
    562         "Mike Lewis"
    563       ],
    564       "year": 2022,
    565       "arxiv_id": "2210.03350",
    566       "relevance": "Analyzes compositional reasoning limitations in LLMs; DECOMP addresses these limitations by decomposing compositional tasks into simpler sub-tasks."
    567     }
    568   ],
    569   "engagement_factors": {
    570     "practical_relevance": {
    571       "score": 2,
    572       "justification": "Decomposed prompting is a directly applicable technique for building LLM pipelines with modular sub-task handlers, relevant to prompt engineers and AI application developers."
    573     },
    574     "surprise_contrarian": {
    575       "score": 1,
    576       "justification": "The finding that separate sub-task prompts outperform a single CoT using the same reasoning procedure is mildly surprising, but modular decomposition beating monolithic approaches is not counterintuitive."
    577     },
    578     "fear_safety": {
    579       "score": 0,
    580       "justification": "No safety, security, or risk concerns are raised or relevant to this work."
    581     },
    582     "drama_conflict": {
    583       "score": 0,
    584       "justification": "No controversy or conflict; the paper positions itself as a natural extension of CoT and least-to-most prompting rather than challenging them."
    585     },
    586     "demo_ability": {
    587       "score": 2,
    588       "justification": "Code and prompts are released on GitHub (allenai/DecomP), allowing reproduction with moderate effort though it requires GPT-3 API access."
    589     },
    590     "brand_recognition": {
    591       "score": 1,
    592       "justification": "Allen Institute for AI (AI2) is well-respected in NLP research but not a household name in the broader tech community."
    593     }
    594   }
    595 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs