scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30690B)
      1 {
      2   "paper": {
      3     "title": "Demonstrate-Search-Predict: Composing retrieval and language models for knowledge-intensive NLP",
      4     "authors": [
      5       "Omar Khattab",
      6       "Keshav Santhanam",
      7       "Xiang Lisa Li",
      8       "David Hall",
      9       "Percy Liang",
     10       "Christopher Potts",
     11       "Matei Zaharia"
     12     ],
     13     "year": 2022,
     14     "venue": "arXiv preprint",
     15     "arxiv_id": "2212.14024",
     16     "doi": "10.48550/arXiv.2212.14024"
     17   },
     18   "scan_version": 3,
     19   "active_modules": [
     20     "experimental_rigor",
     21     "data_leakage"
     22   ],
     23   "methodology_tags": [
     24     "benchmark-eval"
     25   ],
     26   "key_findings": "DSP framework composes frozen language models and retrieval models in multi-step pipelines for knowledge-intensive NLP, achieving 37–120% gains over vanilla GPT-3.5, 8–39% over retrieve-then-read, and 80–290% over self-ask on Open-SQuAD, HotPotQA, and QReCC. The DEMONSTRATE stage bootstraps pipeline-aware demonstrations from end-task labels without hand-labeling intermediate transformations. The paper identifies a 'self-distraction' failure mode in self-ask where delegating control flow to LM completions produces tangential decompositions.",
     27   "checklist": {
     28     "artifacts": {
     29       "code_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The abstract and §1 state 'We release DSP at https://github.com/stanfordnlp/dsp' with a working URL to the public repository."
     33       },
     34       "data_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "All evaluation uses publicly available benchmarks: Open-SQuAD (Rajpurkar et al., 2016), HotPotQA (Yang et al., 2018), and QReCC (Anantha et al., 2020), with public Wikipedia corpus dumps from Chen et al. (2017), Yang et al. (2018), and Karpukhin et al. (2020)."
     38       },
     39       "environment_specified": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No requirements.txt, Dockerfile, or environment specification is provided in the paper. The paper mentions Python implementation but provides no dependency or version details."
     43       },
     44       "reproduction_instructions": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "No step-by-step reproduction instructions are included in the paper. Code is released but the paper itself contains no 'Reproducing Results' section or commands to replicate experiments."
     48       }
     49     },
     50     "statistical_methodology": {
     51       "confidence_intervals_or_error_bars": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Table 1 reports only point estimates (e.g., 36.6 EM, 49.0 F1). Despite averaging across 5 seeds (§3.1), no confidence intervals, error bars, or ± notation are provided."
     55       },
     56       "significance_tests": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Claims like 'outperforms the vanilla LM baseline by 126% EM relative gains' are made by comparing point estimates in Table 1 without any statistical significance test."
     60       },
     61       "effect_sizes_reported": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper reports relative gains with baseline context: '37–120%, 8–39%, and 80–290% relative gains' (abstract). Table 1 provides absolute numbers for all systems, allowing readers to compute effect magnitudes (e.g., 16.2→36.6 EM for Open-SQuAD)."
     65       },
     66       "sample_size_justified": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The evaluation uses 1000 validation questions subsampled across 5 seeds (200 per seed), and 16-shot training examples. No justification is given for these choices and no power analysis is discussed."
     70       },
     71       "variance_reported": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "§3.1 states results are averaged across 5 seeds, but no standard deviation, IQR, or any spread measure is reported in Table 1 or the text. The reader cannot assess result stability across seeds."
     75       }
     76     },
     77     "evaluation_design": {
     78       "baselines_included": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Table 1 compares against vanilla LM, retrieve-then-read, self-ask (two configurations), and reports concurrent in-context learning SoTA results from Si et al. (2022), Sun et al. (2022), Wang et al. (2022b), and Yao et al. (2022)."
     82       },
     83       "baselines_contemporary": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Baselines include contemporaneous work: self-ask (Press et al., 2022), ReAct (Yao et al., 2022), recite-and-answer (Sun et al., 2022), and self-consistency prompting (Wang et al., 2022b), all from 2022."
     87       },
     88       "ablation_study": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "DSP has three stages (Demonstrate, Search, Predict) with multiple design choices (self-consistency, query fusion, multi-hop), but no systematic ablation study removes individual components to measure their contribution. The baselines are different systems, not ablated versions of DSP."
     92       },
     93       "multiple_metrics": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Table 1 reports both EM and F1 for Open-SQuAD and HotPotQA, and both F1 and novel-F1 (nF1) for QReCC."
     97       },
     98       "human_evaluation": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "Evaluation is entirely automated using EM and F1 metrics. No human evaluation of answer quality, groundedness, or coherence is conducted."
    102       },
    103       "held_out_test_set": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "§3.1 explicitly states: 'We report the validation set accuracy on all three datasets.' Held-out test datasets (Open-NaturalQuestions, FEVER) are mentioned but deferred to 'a future version of this report.'"
    107       },
    108       "per_category_breakdown": {
    109         "applies": true,
    110         "answer": false,
    111         "justification": "Table 1 reports only aggregate scores per dataset. No per-category breakdown (e.g., by question type, difficulty, number of hops) is provided."
    112       },
    113       "failure_cases_discussed": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "§3.5 discusses the 'self-distraction' failure mode of self-ask with a concrete example (Edgardo Mortara question producing tangential decomposition). The paper also discusses limitations of retrieve-then-read in Figure 1."
    117       },
    118       "negative_results_reported": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "Every experiment shows DSP outperforming all baselines. No configurations, design choices, or approaches that were tried and failed are reported. The paper also notes that branch, knn, and crossval primitives are 'work-in-progress' but doesn't discuss attempted approaches that didn't work."
    122       }
    123     },
    124     "claims_and_evidence": {
    125       "abstract_claims_supported": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Abstract claims of '37–120%, 8–39%, and 80–290% relative gains' are supported by Table 1 results. The 'new state-of-the-art in-context learning results' claim is supported by comparison with concurrent work in §3.5."
    129       },
    130       "causal_claims_justified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper claims DSP 'delivers gains' over baselines. The study design holds the LM (GPT-3.5) and RM (ColBERTv2) constant across all conditions, varying only the pipeline architecture. This controlled single-variable manipulation is adequate for claiming the pipeline design causes different performance."
    134       },
    135       "generalization_bounded": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The title claims 'knowledge-intensive NLP' broadly but results are limited to three QA-style tasks with a single LM (GPT-3.5) and single RM (ColBERTv2). The paper acknowledges 'Future versions of this report will include additional test tasks and LM choices' but the title and abstract generalize beyond the tested settings."
    139       },
    140       "alternative_explanations_discussed": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper does not discuss alternative explanations for the gains. For instance, DSP's self-consistency uses n=20 samples, consuming much more compute than baselines — the gains could partly reflect this compute gap rather than pipeline design. No robustness checks or alternative factors are discussed."
    144       },
    145       "proxy_outcome_distinction": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "The paper measures EM and F1 on three QA benchmarks but frames contributions as advancing 'knowledge-intensive NLP' broadly. No discussion acknowledges that QA metrics are a narrow proxy for the broader capabilities claimed."
    149       }
    150     },
    151     "setup_transparency": {
    152       "model_versions_specified": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "§3.2 specifies 'GPT-3.5 (text-davinci-002; Brown et al. 2020; Ouyang et al. 2022)' and 'ColBERTv2 (Santhanam et al., 2022b)' with precise model identifiers."
    156       },
    157       "prompts_provided": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "§2.4 and §2.5 provide actual prompt text (e.g., 'My task is to write a simple query that gathers information...', 'My task is to answer questions using Web documents...'). The full code is also released at the GitHub repository where all prompt templates are available."
    161       },
    162       "hyperparameters_reported": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "§3.1–3.4 report key hyperparameters: greedy decoding for n=1, temperature t=0.7 for n>1, k=7 passages for open-domain QA, k=5 for multi-hop, n=20 for self-consistency, n=10 for query fusion, 16-shot training examples."
    166       },
    167       "scaffolding_described": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The entire paper describes the DSP pipeline architecture in detail: the three-stage framework (Demonstrate, Search, Predict), data flow between components, control flow with code snippets, and specific configurations for each task. §2.1–2.5 provide comprehensive descriptions."
    171       },
    172       "data_preprocessing_documented": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "§3.1 documents evaluation subsampling (1000 questions, 5 seeds, 200 per seed). §3.5 describes HotPotQA filtering to 'hard' examples and 90/10 train/val splits. QReCC filtering criteria are explicitly stated (removing empty answers, short conversations, and 'other interesting' questions)."
    176       }
    177     },
    178     "limitations_and_scope": {
    179       "limitations_section_present": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion discusses the framework's potential but does not substantively address limitations."
    183       },
    184       "threats_to_validity_specific": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No specific threats to validity are discussed anywhere in the paper."
    188       },
    189       "scope_boundaries_stated": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No explicit scope boundaries are stated. The paper mentions 'Future versions of this report will include additional test tasks and LM choices' but does not state what the current results do NOT show or what claims are NOT being made."
    193       }
    194     },
    195     "data_integrity": {
    196       "raw_data_available": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "All benchmarks used are publicly available: SQuAD, HotPotQA, and QReCC datasets and their associated Wikipedia corpus dumps are downloadable."
    200       },
    201       "data_collection_described": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "§3.1 and §3.5 describe which datasets were used, which splits, which Wikipedia corpus versions (Dec 2016, Nov 2017 abstracts, Dec 2018), and how evaluation was conducted across seeds."
    205       },
    206       "recruitment_methods_described": {
    207         "applies": false,
    208         "answer": false,
    209         "justification": "No human participants. All data comes from standard public benchmarks (SQuAD, HotPotQA, QReCC)."
    210       },
    211       "data_pipeline_documented": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The pipeline from data to results is documented: dataset selection, corpus alignment, subsampling strategy, seed-based evaluation split, and QReCC filtering criteria with explicit markers (§3.1, §3.5)."
    215       }
    216     },
    217     "conflicts_of_interest": {
    218       "funding_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Acknowledgements section lists: IBM (HAI founding member), Stanford DAWN project supported by Ant Financial, Facebook, Google, VMware, plus Cisco, SAP, and NSF CAREER grant CNS-1651570."
    222       },
    223       "affiliations_disclosed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "All seven authors are listed as Stanford University. The paper evaluates GPT-3.5 (OpenAI) and ColBERTv2 (developed by some of the same authors at Stanford), and affiliations are transparent."
    227       },
    228       "funder_independent_of_outcome": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Funding comes from academic (NSF, Stanford HAI) and corporate (IBM, Google, Facebook, etc.) sources supporting general AI research. None of the funders have a specific financial interest in DSP outperforming the baselines tested."
    232       },
    233       "financial_interests_declared": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No competing interests or financial interests statement is present. Some authors developed ColBERTv2, which is used as the RM, and this potential interest is not formally declared."
    237       }
    238     },
    239     "contamination": {
    240       "training_cutoff_stated": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "GPT-3.5 (text-davinci-002) is used but its training data cutoff date is not stated anywhere in the paper."
    244       },
    245       "train_test_overlap_discussed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "No discussion of whether SQuAD (2016), HotPotQA (2018), or QReCC (2020) examples appeared in GPT-3.5's training data, despite all being publicly available before the model's training."
    249       },
    250       "benchmark_contamination_addressed": {
    251         "applies": true,
    252         "answer": false,
    253         "justification": "All three benchmarks (SQuAD 2016, HotPotQA 2018, QReCC 2020) were published and publicly available online well before GPT-3.5's training cutoff. No contamination risk discussion is provided."
    254       }
    255     },
    256     "human_studies": {
    257       "pre_registered": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study. All evaluation is automated on NLP benchmarks."
    261       },
    262       "irb_or_ethics_approval": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "demographics_reported": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "inclusion_exclusion_criteria": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "randomization_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "blinding_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       },
    287       "attrition_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants."
    291       }
    292     },
    293     "cost_and_practicality": {
    294       "inference_cost_reported": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "DSP programs call GPT-3.5 many times per example (e.g., n=20 self-consistency samples, multi-hop search with n=10 query fusion, plus demonstration bootstrapping) but no API costs, token counts, or latency measurements are reported."
    298       },
    299       "compute_budget_stated": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "§3.1 mentions controlling 'the language model API spending budget' by splitting evaluation across seeds, but the actual budget, total API spend, or compute resources used are never quantified."
    303       }
    304     },
    305     "experimental_rigor": {
    306       "seed_sensitivity_reported": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "§3.1 states results are averaged across 5 seeds, but per-seed results are not reported and no sensitivity analysis shows how much results vary across seeds."
    310       },
    311       "number_of_runs_stated": {
    312         "applies": true,
    313         "answer": true,
    314         "justification": "§3.1 clearly states: 'report average quality across five seeds where each seed fixes a single k-shot training set of examples.'"
    315       },
    316       "hyperparameter_search_budget": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "Various hyperparameters are set (k=7 passages, n=20 for self-consistency, k=3 demonstrations, etc.) but no search budget, search method, or justification for these choices is provided."
    320       },
    321       "best_config_selection_justified": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "Hyperparameter choices (k values for retrieval, n values for generation, number of demonstrations) appear tuned but no selection criterion or validation-based justification is described."
    325       },
    326       "multiple_comparison_correction": {
    327         "applies": false,
    328         "answer": false,
    329         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    330       },
    331       "self_comparison_bias_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The authors implement their own baselines (vanilla LM, retrieve-then-read) and compare against their own DSP framework. They also modified the self-ask prompt for comparison. No acknowledgment of author-evaluation bias per Lucic et al. (2018)."
    335       },
    336       "compute_budget_vs_performance": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "DSP uses dramatically more compute than baselines: multi-hop retrieval with n=10 query fusion, n=20 self-consistency, plus demonstration bootstrapping. The vanilla LM uses a single LM call. This compute gap is never discussed or controlled for."
    340       },
    341       "benchmark_construct_validity": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "The paper uses SQuAD, HotPotQA, and QReCC with EM/F1 metrics to evaluate 'knowledge-intensive NLP' capabilities without discussing whether these benchmarks and metrics adequately measure the claimed construct."
    345       },
    346       "scaffold_confound_addressed": {
    347         "applies": true,
    348         "answer": true,
    349         "justification": "The paper explicitly controls for the scaffold confound by holding the LM (GPT-3.5 text-davinci-002) and RM (ColBERTv2) constant across all pipeline variants. Differences are attributable to pipeline design, which is the stated variable under study."
    350       }
    351     },
    352     "data_leakage": {
    353       "temporal_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "GPT-3.5 was trained on data that may include SQuAD (2016), HotPotQA (2018), and QReCC (2020) questions and answers. No temporal leakage discussion is present."
    357       },
    358       "feature_leakage_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether the evaluation setup introduces information leakage (e.g., whether retrieved passages or demonstration bootstrapping provide unfair hints)."
    362       },
    363       "non_independence_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of train/test independence. The 16-shot training examples are sampled from the same distribution as test examples, and the bootstrapping process uses the same retrieval corpus."
    367       },
    368       "leakage_detection_method": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination)."
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "DSP programs deliver 37–120% relative EM gains over vanilla GPT-3.5 across three knowledge-intensive QA tasks.",
    378       "evidence": "Table 1: Open-SQuAD 16.2→36.6 EM (126% gain), HotPotQA 28.3→51.4 EM (82% gain). All using GPT-3.5 and ColBERTv2 with 16-shot training examples.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "DSP programs deliver 8–39% relative EM gains over a standard retrieve-then-read pipeline.",
    383       "evidence": "Table 1: Open-SQuAD 33.8→36.6 EM (8% gain), HotPotQA 36.9→51.4 EM (39% gain).",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "DSP programs deliver 80–290% relative EM gains over the self-ask pipeline.",
    388       "evidence": "Table 1: HotPotQA 28.6→51.4 EM (80% gain), Open-SQuAD 9.0→36.6 EM (307% gain). The self-ask pipeline was modified with refinements for fairer comparison.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "The DEMONSTRATE stage can bootstrap pipeline-aware demonstrations from end-task labels without hand-labeling intermediate transformations.",
    393       "evidence": "§2.3 describes the annotate primitive and §3.4 shows its use in all three programs. No ablation isolates the DEMONSTRATE stage's contribution to the final numbers.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "Self-ask suffers from a 'self-distraction' problem where the LM decomposes questions tangentially.",
    398       "evidence": "§3.5 provides one concrete example (Edgardo Mortara question), and self-ask's low Open-SQuAD score (9.3% EM) supports the claim. However, this is a single illustrative example without systematic error analysis.",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "DSP in-context learning results are competitive with fine-tuned systems like Fusion-in-Decoder on Open-SQuAD.",
    403       "evidence": "§3.5 compares DSP's 36.6% EM with FiD-base at ~36% EM (with 5 passages), though FiD reaches 48% EM with 100 passages. Comparison is qualitative and on different evaluation conditions.",
    404       "supported": "weak"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "No error bars despite multiple seeds",
    410       "detail": "Results are averaged across 5 seeds but no variance, standard deviation, or confidence intervals are reported. The reader cannot assess whether the differences between DSP and baselines are within seed-to-seed noise."
    411     },
    412     {
    413       "flag": "No ablation study",
    414       "detail": "DSP has multiple components (Demonstrate, Search with multi-hop and query fusion, Predict with self-consistency) but no ablation isolates their individual contributions. The gains could be driven primarily by self-consistency (n=20 samples) rather than the novel architectural contributions."
    415     },
    416     {
    417       "flag": "Massive compute gap not addressed",
    418       "detail": "DSP calls GPT-3.5 dozens of times per example (demonstration bootstrapping, multi-hop with n=10 query fusion, n=20 self-consistency) while the vanilla LM uses 1 call and retrieve-then-read uses 1–2 calls. The cost difference is never reported or controlled for."
    419     },
    420     {
    421       "flag": "Benchmark contamination risk",
    422       "detail": "SQuAD (2016), HotPotQA (2018), and QReCC (2020) were all publicly available before GPT-3.5's training data cutoff. The model may have memorized answers, and this is never discussed."
    423     },
    424     {
    425       "flag": "No limitations section",
    426       "detail": "The paper has no dedicated limitations, threats to validity, or scope boundaries section despite making broad claims about 'knowledge-intensive NLP' from three QA tasks with one LM."
    427     },
    428     {
    429       "flag": "Baseline modifications may disadvantage comparison",
    430       "detail": "The self-ask baseline was modified from its original form (different passages, added training examples, prompt changes). These changes are presented as improvements, but they mean the baseline being compared against is not the original self-ask configuration. Separately, the authors note that some DSP primitives (branch, knn, crossval) are 'work-in-progress', raising questions about the completeness of the DSP implementation tested."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Language models are few-shot learners",
    436       "authors": [
    437         "T. Brown",
    438         "B. Mann",
    439         "N. Ryder"
    440       ],
    441       "year": 2020,
    442       "relevance": "Foundational in-context learning paper establishing the few-shot prompting paradigm that DSP builds upon."
    443     },
    444     {
    445       "title": "Chain of thought prompting elicits reasoning in large language models",
    446       "authors": [
    447         "J. Wei",
    448         "X. Wang",
    449         "D. Schuurmans"
    450       ],
    451       "year": 2022,
    452       "arxiv_id": "2201.11903",
    453       "relevance": "Introduces chain-of-thought prompting for LLM reasoning, a key component used in DSP's PREDICT stage."
    454     },
    455     {
    456       "title": "Self-consistency improves chain of thought reasoning in language models",
    457       "authors": [
    458         "X. Wang",
    459         "J. Wei",
    460         "D. Schuurmans"
    461       ],
    462       "year": 2022,
    463       "arxiv_id": "2203.11171",
    464       "relevance": "Self-consistency voting method used in DSP's PREDICT stage for selecting among multiple generated candidates."
    465     },
    466     {
    467       "title": "Measuring and narrowing the compositionality gap in language models",
    468       "authors": [
    469         "O. Press",
    470         "M. Zhang",
    471         "S. Min"
    472       ],
    473       "year": 2022,
    474       "arxiv_id": "2210.03350",
    475       "relevance": "Introduces self-ask pipeline, the primary comparison baseline, representing LLM self-decomposition for multi-hop QA."
    476     },
    477     {
    478       "title": "ReAct: Synergizing reasoning and acting in language models",
    479       "authors": [
    480         "S. Yao",
    481         "J. Zhao",
    482         "D. Yu"
    483       ],
    484       "year": 2022,
    485       "arxiv_id": "2210.03629",
    486       "relevance": "Contemporaneous framework for combining LLM reasoning with tool use (Wikipedia API search), achieving 35.1% EM on HotPotQA."
    487     },
    488     {
    489       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    490       "authors": [
    491         "P. Lewis",
    492         "E. Perez",
    493         "A. Piktus"
    494       ],
    495       "year": 2020,
    496       "relevance": "Foundational RAG paper combining retrieval and generation for knowledge-intensive tasks, directly motivating DSP's architecture."
    497     },
    498     {
    499       "title": "Language model cascades",
    500       "authors": [
    501         "D. Dohan",
    502         "W. Xu",
    503         "A. Lewkowycz"
    504       ],
    505       "year": 2022,
    506       "arxiv_id": "2207.10342",
    507       "relevance": "Theoretical framework for composing language model calls in cascaded pipelines, conceptually related to DSP's composition approach."
    508     },
    509     {
    510       "title": "Decomposed prompting: A modular approach for solving complex tasks",
    511       "authors": [
    512         "T. Khot",
    513         "H. Trivedi",
    514         "M. Finlayson"
    515       ],
    516       "year": 2022,
    517       "arxiv_id": "2210.02406",
    518       "relevance": "Modular decomposition approach for complex tasks using multiple LM calls, closely related to DSP's multi-step pipeline design."
    519     },
    520     {
    521       "title": "STaR: Bootstrapping reasoning with reasoning",
    522       "authors": [
    523         "E. Zelikman",
    524         "Y. Wu",
    525         "N. Goodman"
    526       ],
    527       "year": 2022,
    528       "arxiv_id": "2203.14465",
    529       "relevance": "Self-bootstrapping approach for LLM rationale generation, generalized by DSP's DEMONSTRATE stage for pipeline-aware annotation."
    530     },
    531     {
    532       "title": "ColBERTv2: Effective and efficient retrieval via lightweight late interaction",
    533       "authors": [
    534         "K. Santhanam",
    535         "O. Khattab",
    536         "J. Saad-Falcon"
    537       ],
    538       "year": 2022,
    539       "doi": "10.18653/v1/2022.naacl-main.272",
    540       "relevance": "The retrieval model used in all DSP experiments; represents state-of-the-art dense retrieval for knowledge-intensive tasks."
    541     },
    542     {
    543       "title": "Few-shot learning with retrieval augmented language models",
    544       "authors": [
    545         "G. Izacard",
    546         "P. Lewis",
    547         "M. Lomeli"
    548       ],
    549       "year": 2022,
    550       "arxiv_id": "2208.03299",
    551       "relevance": "Concurrent work on retrieval-augmented few-shot learning, representing the retrieve-then-read paradigm that DSP aims to improve upon."
    552     },
    553     {
    554       "title": "Large language models can self-improve",
    555       "authors": [
    556         "J. Huang",
    557         "S. Gu",
    558         "L. Hou"
    559       ],
    560       "year": 2022,
    561       "arxiv_id": "2210.11610",
    562       "relevance": "LLM self-improvement via self-generated rationales, related to DSP's DEMONSTRATE bootstrapping approach."
    563     }
    564   ],
    565   "engagement_factors": {
    566     "practical_relevance": {
    567       "score": 3,
    568       "justification": "DSP is released as a pip-installable Python library (stanfordnlp/dsp) that practitioners can directly use to build RAG pipelines for knowledge-intensive tasks."
    569     },
    570     "surprise_contrarian": {
    571       "score": 1,
    572       "justification": "The 'self-distraction' failure mode of self-ask is a minor novel observation, but the overall finding that structured pipelines beat naive prompting is expected."
    573     },
    574     "fear_safety": {
    575       "score": 0,
    576       "justification": "No safety, security, or risk concerns are raised or relevant to the work."
    577     },
    578     "drama_conflict": {
    579       "score": 1,
    580       "justification": "The 80–290% gains over self-ask and the 'self-distraction' critique mildly challenge that popular prompting approach, but without strong controversy."
    581     },
    582     "demo_ability": {
    583       "score": 2,
    584       "justification": "The GitHub repo is public with code examples, but requires API keys for GPT-3.5 and a ColBERTv2 index setup, making it moderate-effort to reproduce."
    585     },
    586     "brand_recognition": {
    587       "score": 2,
    588       "justification": "From Stanford NLP (Percy Liang, Matei Zaharia, Omar Khattab) — well-known in the NLP/ML community, and this became the foundation for the widely-used DSPy framework."
    589     }
    590   }
    591 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs