scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27918B)
      1 {
      2   "paper": {
      3     "title": "Successive Prompting for Decomposing Complex Questions",
      4     "authors": ["Dheeru Dua", "Shivanshu Gupta", "Sameer Singh", "Matt Gardner"],
      5     "year": 2022,
      6     "venue": "Conference on Empirical Methods in Natural Language Processing",
      7     "arxiv_id": "2212.04092",
      8     "doi": "10.48550/arXiv.2212.04092"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Successive prompting iteratively decomposes complex questions into simple QA pairs with decoupled supervision, enabling tailored in-context examples per step and specialized modules for symbolic reasoning. On a few-shot variant of DROP (300 training examples), the best model achieves 50.2% F1 on the test set, a ~5% absolute improvement over the state-of-the-art TASE model with similar supervision. Synthetic data generated from Wikipedia tables universally improves all model architectures, and the modular design enables mixing in-context and fine-tuned components—replacing in-context QA with a fine-tuned QA module yields ~10% F1 improvement.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'The code and data are available at https://github.com/dDua/succesive_prompting' in Section 1."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The GitHub repository claims to include data. DROP is a publicly available benchmark. Synthetic data generation from Wikipedia tables is also described and included."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions HuggingFace transformers, T5-large UnifiedQA, and GPT-J (6B), but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While training details are described (learning rate, input length, beam size), no step-by-step reproduction instructions or runnable scripts are provided in the paper itself."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Tables 2, 3, and 4 all report single F1 point estimates with no confidence intervals, error bars, or ± notation."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims successive prompting outperforms baselines by comparing F1 numbers directly (e.g., '~5% improvement in F1') without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports absolute F1 improvements with baseline context throughout: e.g., 51.3 vs 45.9 F1 on dev (Table 3), 50.2 vs 45.1 on test, 31.9 vs 27.6 for successive prompting vs CoT (Table 2). The reader can assess the magnitude of improvement."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The 300 training examples are selected via vertex cover to cover the majority of DROP training data, but there is no statistical justification for why 300 is sufficient, no power analysis, and no discussion of whether 300 examples provide adequate signal."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "All results are single-run numbers. No standard deviations, variance across seeds, or spread measures are reported in any table."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against Standard prompting, Chain-of-Thought, UnifiedQA, PReasM, and TASE across multiple experimental settings (Tables 2, 3)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include CoT (Wei et al., 2022), PReasM (Yoran et al., 2021), TASE (Segal et al., 2020), and UnifiedQA (Khashabi et al., 2020), all published within 2 years of this 2022 paper."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Table 2 ablates the symbolic calculator (w/ calc vs w/o calc). Table 4 ablates QD and QA modules between in-context and fine-tuned variants. Tables 2 and 3 ablate data sources (Syn-Only, DROP-Only, Syn+DROP) and supervision levels (0-shot, w/o decomp, w/ decomp)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only F1 is reported as an evaluation metric across all experiments. No other metrics (e.g., exact match, accuracy) are used."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section 4.4 describes manual analysis of 50 correct and 50 incorrect predictions, evaluating decomposition quality (88% accuracy for in-context, 96% for fine-tuned) and categorizing error types."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The paper explicitly states 'We follow the standard practice of using test set for only our final best performing model' (Section 4.2), reporting dev set for all models and test set only for the best (50.2 F1)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 3 breaks down results by model type (symbolic vs non-symbolic), supervision level (0-shot, w/o decomp, w/ decomp), and data source (with/without synthetic). Section 4.4 breaks down errors by category (incorrect QA, incorrect QD, out-of-scope)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 4.4 provides detailed error analysis with three error categories and specific percentages. Figure 4 shows qualitative failure examples including incorrect decompositions and implicit reasoning failures."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 4 reveals that fine-tuned QD with in-context QA (31.4 F1) barely improves over all in-context (30.8), showing that better decomposition alone doesn't help without better answering. Table 2 shows removing the calculator hurts. The Limitations section discusses where the approach fails."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims '~5% absolute F1' improvement. Table 3 shows 51.3 vs 45.9 on dev (5.4 F1) and test results of 50.2 vs 45.1 (5.1 F1), supporting this claim."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims are made via controlled ablations: removing the calculator (Table 2), swapping QD/QA components between in-context and fine-tuned (Table 4), and varying data sources (Tables 2, 3). These are single-variable manipulations adequate for the claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The Limitations section explicitly states 'we have only shown the applicability of one specific version of this idea in one specific setting' and discusses challenges in applying the method to other domains (commonsense, causal reasoning)."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not substantively discuss alternative explanations for the observed improvements. For example, successive prompting makes multiple LM queries per question, so the improvement could partly reflect more compute rather than better decomposition. This compute confound is noted only briefly in Limitations but not analyzed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures F1 on DROP and claims F1 on DROP. No proxy gap exists — the metric matches the granularity of the claims."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "GPT-J (6B) is specified by name and size—it is a single open-source model with no version ambiguity. T5-large version of UnifiedQA is similarly specific."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Figure 2 shows complete prompt examples with actual text for both QD and QA stages. Appendix A provides the control codes used for fine-tuning. The prompting format is fully reproducible."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 4.2 reports: learning rate 5e-5, max input length 768, beam search size 5, max decomposition steps 10, 6 in-context examples, 2 epochs cross-entropy then contrastive, up to 3 negative samples, 80K dynamic sampling per epoch."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The iterative QD→QA pipeline is the core contribution and is described in detail: separate index lookup for QD and QA stages, alternating generation steps, termination condition ('There are no more questions left to ask'), and symbolic calculator routing. Figures 1 and 2 provide visual walkthroughs."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4 describes selecting 300 examples via sentence embedding + cosine similarity + vertex cover. Section 3 describes synthetic data generation from Wikipedia tables with curated templates, including statistics (141K complex questions, 525K QD examples, 257K QA examples). Dynamic sampling strategy per epoch is described."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section is present after the Conclusion, providing substantive discussion of the method's constraints."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The Limitations section discusses specific threats: decomposition data may be impossible to obtain for some domains, some complex questions resist decomposition (commonsense, causal reasoning), granularity depends on model capabilities, and increased computational cost from multiple queries."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The Limitations section explicitly states what was NOT tested: other kinds of complex questions (commonsense, causal reasoning), other domains beyond DROP, different decomposition granularities. The paper says 'we have only shown the applicability of one specific version of this idea in one specific setting.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "DROP is publicly available. The code and data repository is linked (https://github.com/dDua/succesive_prompting). The 300 annotated examples and synthetic data are described as available."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4 describes the 300-example selection pipeline: QQP-based sentence embedding → cosine similarity → top-50 neighbors → connection graph → vertex cover. Section 3 details synthetic data generation from Wikipedia tables with 10 operation types and higher-order combinations."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are the public DROP benchmark and synthetically generated data from Wikipedia tables."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented: Wikipedia tables → template-based paragraph and question generation → 10 operation types with higher-order combinations → 141K complex questions → 525K QD and 257K QA examples (Table 5). Dynamic sampling at each epoch is described."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgements list DARPA MCS program (Contract No. N660011924033), AI2, NSF IIS-1817183, and Hasso Plattner Institute fellowship."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed: UC Irvine, Allen Institute for AI, and Microsoft Semantic Machines."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "DARPA, NSF, AI2, and HPI are government/academic funders with no financial stake in the evaluation outcomes. The paper evaluates open-source models (GPT-J, T5/UnifiedQA), not products from the funders."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interest statement is provided. One author is at Microsoft Semantic Machines, but no conflict disclosure is made."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "GPT-J (6B) was trained on The Pile with no training cutoff date stated in the paper. T5/UnifiedQA training data cutoff is also not stated."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "DROP was published in 2019 and GPT-J was trained on internet data collected after that. No discussion of whether DROP examples appear in GPT-J's or T5's training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "DROP (2019) was available online before the models were trained. No contamination analysis is performed despite the risk that these models may have seen DROP examples during pre-training."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. The paper evaluates language models on benchmark datasets."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The Ethics Statement confirms: 'This work does not deal with any social impacts or biases.'"
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The Limitations section acknowledges 'this method also increases the computational requirements' due to multiple LM queries per question, but no actual cost, latency, or token counts are reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No GPU hours, training time, API costs, or hardware specifications are provided despite using large language models for both in-context learning and fine-tuning."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "All results appear to be single-seed. No mention of multiple random seeds or seed sensitivity analysis."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs producing the reported results is never stated."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Hyperparameters are reported (learning rate 5e-5, etc.) but no search budget, number of configurations tried, or search method is described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The paper explicitly states 'We follow the standard practice of using test set for only our final best performing model' and uses dev set performance for all model selection decisions (Table 3)."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement and compare against baselines (TASE, UnifiedQA, PReasM) without acknowledging potential bias from evaluating their own system against their implementations of baselines."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Successive prompting makes multiple LM queries per question while CoT makes one, but performance is never reported as a function of compute. The compute disparity is acknowledged qualitatively in Limitations but not controlled for."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "DROP is used without discussion of whether it adequately measures the claimed compositional reasoning capability. No comparison with alternative benchmarks or analysis of what DROP specifically tests."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "The successive prompting scaffold IS the contribution being evaluated. Different prompting methods (Standard, CoT, SP) are the independent variable, so the scaffold confound is the experimental design itself."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "DROP was published in 2019 and the models (GPT-J, T5) were trained on later data. No discussion of temporal leakage—the models may have encountered DROP passages or questions during pre-training."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether in-context examples or the evaluation setup leak answer information. The QA index retrieval could potentially surface related examples that leak information about the test question."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of structural similarities between training and test data. The 300 DROP training examples were selected to cover training data via vertex cover, but independence from the test set is not analyzed."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is used—no canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Successive prompting achieves ~5% absolute F1 improvement over SOTA on few-shot DROP",
    365       "evidence": "Table 3: SP w/ decomp = 51.3 F1 (dev) vs TASE+Synthetic w/ decomp = 45.9 F1 (dev). Test set: SP = 50.2 vs TASE = 45.1 (Section 4.2).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Successive prompting outperforms Chain-of-Thought by 4.3% F1 with in-context learning",
    370       "evidence": "Table 2: Succ.(w/ calc.) = 31.9 vs CoT = 27.6 for Syn+DROP setting. The 3.5% gap in Syn-Only (28.8 vs 25.3) also supports this.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Synthetic data from Wikipedia tables universally improves model performance",
    375       "evidence": "Table 3 shows consistent improvements for all model types when synthetic data is added: UnifiedQA (+2.1 to +5.4), PReasM (+0.6 to +5.3), TASE (+17.8 to +18.3 in some settings).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Modular approach with symbolic calculator is better than end-to-end LM",
    380       "evidence": "Table 2: Succ.(w/ calc.) = 31.9 vs Succ.(w/o calc.) = 29.9 for Syn+DROP (2.0 F1 improvement). Paper states '1.5% F1' referring to DROP-Only column (30.8 vs 29.3).",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Fine-tuned QA module provides ~10% F1 improvement over in-context QA",
    385       "evidence": "Table 4: QD In-Context + QA Fine-tuning = 40.3 vs QD In-Context + QA In-Context = 30.8 (9.5 F1 improvement).",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "QD accuracy is 88% for in-context and 96% for fine-tuned model on correct predictions",
    390       "evidence": "Section 4.4: Manual analysis of 50 correct predictions. Incorrect decompositions were 'mainly because the decomposed question is identical to the original question.'",
    391       "supported": "weak"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "No error bars or uncertainty quantification",
    397       "detail": "All results across Tables 2, 3, and 4 are single-run point estimates with no variance, standard deviation, confidence intervals, or significance tests. The ~5% F1 improvement claim cannot be assessed for statistical reliability."
    398     },
    399     {
    400       "flag": "Single evaluation metric",
    401       "detail": "Only F1 is reported. DROP also supports Exact Match, and compositional reasoning could benefit from decomposition accuracy or reasoning step correctness metrics."
    402     },
    403     {
    404       "flag": "No contamination analysis",
    405       "detail": "DROP (2019) was publicly available before GPT-J and T5 were trained. The paper does not discuss whether these models may have seen DROP passages or answers during pre-training, which would inflate results for all methods."
    406     },
    407     {
    408       "flag": "Uncontrolled compute comparison",
    409       "detail": "Successive prompting makes multiple LM queries per question while CoT makes one. Performance improvements could partly reflect more compute rather than better decomposition. The Limitations section acknowledges this qualitatively but does not control for it."
    410     }
    411   ],
    412   "cited_papers": [
    413     {
    414       "title": "Chain of thought prompting elicits reasoning in large language models",
    415       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans", "Maarten Bosma", "Ed Chi", "Quoc Le", "Denny Zhou"],
    416       "year": 2022,
    417       "arxiv_id": "2201.11903",
    418       "relevance": "Foundational prompting technique for LLM reasoning that successive prompting builds upon and compares against."
    419     },
    420     {
    421       "title": "Language models are few-shot learners",
    422       "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"],
    423       "year": 2020,
    424       "relevance": "Introduced few-shot prompting as a paradigm for large language models, foundational to the in-context learning approach used in this paper."
    425     },
    426     {
    427       "title": "Show your work: Scratchpads for intermediate computation with language models",
    428       "authors": ["Maxwell Nye", "Anders Johan Andreassen", "Guy Gur-Ari"],
    429       "year": 2021,
    430       "arxiv_id": "2112.00114",
    431       "relevance": "Early work on intermediate reasoning steps for LMs, precursor to chain-of-thought and successive prompting approaches."
    432     },
    433     {
    434       "title": "MRKL systems: A modular, neuro-symbolic architecture that combines large language models, external knowledge sources and discrete reasoning",
    435       "authors": ["Ehud Karpas", "Omri Abend", "Yonatan Belinkov"],
    436       "year": 2022,
    437       "arxiv_id": "2205.00445",
    438       "relevance": "Modular neuro-symbolic approach combining LLMs with external tools and discrete reasoning, closely related to successive prompting's modular design."
    439     },
    440     {
    441       "title": "Least-to-most prompting enables complex reasoning in large language models",
    442       "authors": ["Denny Zhou", "Nathanael Schärli", "Le Hou", "Jason Wei"],
    443       "year": 2022,
    444       "arxiv_id": "2205.10625",
    445       "relevance": "Concurrent work with similar decomposition approach, but decomposes all sub-problems first then solves sequentially, unlike the interleaved approach of successive prompting."
    446     },
    447     {
    448       "title": "Decomposed prompting: A modular approach for solving complex tasks",
    449       "authors": ["Tushar Khot", "Harsh Trivedi", "Matthew Finlayson", "Yao Fu"],
    450       "year": 2022,
    451       "arxiv_id": "2210.02406",
    452       "relevance": "Contemporary modular prompting approach for complex task decomposition, part of the emerging paradigm of compositional LLM reasoning."
    453     },
    454     {
    455       "title": "Measuring and narrowing the compositionality gap in language models",
    456       "authors": ["Ofir Press", "Muru Zhang", "Sewon Min"],
    457       "year": 2022,
    458       "arxiv_id": "2210.03350",
    459       "relevance": "Measures how well LMs compose reasoning steps and shows gaps in compositional reasoning that methods like successive prompting aim to address."
    460     },
    461     {
    462       "title": "PaLM: Scaling language modeling with pathways",
    463       "authors": ["Aakanksha Chowdhery", "Sharan Narang", "Jacob Devlin"],
    464       "year": 2022,
    465       "arxiv_id": "2204.02311",
    466       "relevance": "Large-scale language model demonstrating few-shot prompting capabilities, context for the scaling trends that prompting methods leverage."
    467     },
    468     {
    469       "title": "Text modular networks: Learning to decompose tasks in the language of existing models",
    470       "authors": ["Tushar Khot", "Daniel Khashabi", "Kyle Richardson"],
    471       "year": 2021,
    472       "relevance": "Direct predecessor to successive prompting's modular QD/QA design, limited to first-order reasoning which successive prompting extends."
    473     },
    474     {
    475       "title": "STaR: Bootstrapping reasoning with reasoning",
    476       "authors": ["Eric Zelikman", "Yuhuai Wu", "Noah D. Goodman"],
    477       "year": 2022,
    478       "arxiv_id": "2203.14465",
    479       "relevance": "Uses self-generated reasoning traces to bootstrap LLM reasoning capability, related to the synthetic data bootstrapping approach in successive prompting."
    480     },
    481     {
    482       "title": "UNIFIEDQA: Crossing format boundaries with a single QA system",
    483       "authors": ["Daniel Khashabi", "Sewon Min", "Tushar Khot"],
    484       "year": 2020,
    485       "relevance": "Pre-trained QA system used as the base model for fine-tuning experiments in this paper."
    486     }
    487   ]
    488 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs