scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33500B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Demonstrate-Search-Predict: Composing retrieval and language models for knowledge-intensive NLP",
      6     "authors": [
      7       "O. Khattab",
      8       "Keshav Santhanam",
      9       "Xiang Lisa Li",
     10       "David Hall",
     11       "Percy Liang",
     12       "Christopher Potts",
     13       "Matei Zaharia"
     14     ],
     15     "year": 2022,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2212.14024",
     18     "doi": "10.48550/arXiv.2212.14024"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Abstract claims of '37–120%, 8–39%, and 80–290% relative gains' are supported by Table 1 results. The 'new state-of-the-art in-context learning results' claim is supported by comparison with concurrent work in §3.5.",
     26         "source": "opus"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper claims DSP 'delivers gains' over baselines. The study design holds the LM (GPT-3.5) and RM (ColBERTv2) constant across all conditions, varying only the pipeline architecture. This controlled single-variable manipulation is adequate for claiming the pipeline design causes different performance.",
     32         "source": "opus"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The title claims 'knowledge-intensive NLP' broadly but results are limited to three QA-style tasks with a single LM (GPT-3.5) and single RM (ColBERTv2). The paper acknowledges 'Future versions of this report will include additional test tasks and LM choices' but the title and abstract generalize beyond the tested settings.",
     38         "source": "opus"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not discuss alternative explanations for the gains. For instance, DSP's self-consistency uses n=20 samples, consuming much more compute than baselines — the gains could partly reflect this compute gap rather than pipeline design. No robustness checks or alternative factors are discussed.",
     44         "source": "opus"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper measures EM and F1 on three QA benchmarks but frames contributions as advancing 'knowledge-intensive NLP' broadly. No discussion acknowledges that QA metrics are a narrow proxy for the broader capabilities claimed.",
     50         "source": "opus"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion discusses the framework's potential but does not substantively address limitations.",
     58         "source": "opus"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No specific threats to validity are discussed anywhere in the paper.",
     64         "source": "opus"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No explicit scope boundaries are stated. The paper mentions 'Future versions of this report will include additional test tasks and LM choices' but does not state what the current results do NOT show or what claims are NOT being made.",
     70         "source": "opus"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Acknowledgements section lists: IBM (HAI founding member), Stanford DAWN project supported by Ant Financial, Facebook, Google, VMware, plus Cisco, SAP, and NSF CAREER grant CNS-1651570.",
     78         "source": "opus"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All seven authors are listed as Stanford University. The paper evaluates GPT-3.5 (OpenAI) and ColBERTv2 (developed by some of the same authors at Stanford), and affiliations are transparent.",
     84         "source": "opus"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Funding comes from academic (NSF, Stanford HAI) and corporate (IBM, Google, Facebook, etc.) sources supporting general AI research. None of the funders have a specific financial interest in DSP outperforming the baselines tested.",
     90         "source": "opus"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests or financial interests statement is present. Some authors developed ColBERTv2, which is used as the RM, and this potential interest is not formally declared.",
     96         "source": "opus"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "'Knowledge-intensive tasks', 'in-context learning', 'demonstrations', and each DSP stage (Demonstrate, Search, Predict) are defined with conceptual descriptions and code examples.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper explicitly lists four contributions: arguing for task-aware strategies, showing DSP reduces their burden, bootstrapping demonstrations as an emergent capacity, and new SOTA in-context learning results on three tasks.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Extensive engagement with retrieval-augmented NLP (RAG, DPR, FiD, Baleen), in-context learning (Brown et al. 2020), chain-of-thought, self-ask, and multi-hop QA, explicitly positioning DSP relative to each.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "The abstract and §1 state 'We release DSP at https://github.com/stanfordnlp/dsp' with a working URL to the public repository.",
    127           "source": "opus"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "All evaluation uses publicly available benchmarks: Open-SQuAD (Rajpurkar et al., 2016), HotPotQA (Yang et al., 2018), and QReCC (Anantha et al., 2020), with public Wikipedia corpus dumps from Chen et al. (2017), Yang et al. (2018), and Karpukhin et al. (2020).",
    133           "source": "opus"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No requirements.txt, Dockerfile, or environment specification is provided in the paper. The paper mentions Python implementation but provides no dependency or version details.",
    139           "source": "opus"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No step-by-step reproduction instructions are included in the paper. Code is released but the paper itself contains no 'Reproducing Results' section or commands to replicate experiments.",
    145           "source": "opus"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Table 1 reports only point estimates (e.g., 36.6 EM, 49.0 F1). Despite averaging across 5 seeds (§3.1), no confidence intervals, error bars, or ± notation are provided.",
    153           "source": "opus"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Claims like 'outperforms the vanilla LM baseline by 126% EM relative gains' are made by comparing point estimates in Table 1 without any statistical significance test.",
    159           "source": "opus"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "The paper reports relative gains with baseline context: '37–120%, 8–39%, and 80–290% relative gains' (abstract). Table 1 provides absolute numbers for all systems, allowing readers to compute effect magnitudes (e.g., 16.2→36.6 EM for Open-SQuAD).",
    165           "source": "opus"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "The evaluation uses 1000 validation questions subsampled across 5 seeds (200 per seed), and 16-shot training examples. No justification is given for these choices and no power analysis is discussed.",
    171           "source": "opus"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "§3.1 states results are averaged across 5 seeds, but no standard deviation, IQR, or any spread measure is reported in Table 1 or the text. The reader cannot assess result stability across seeds.",
    177           "source": "opus"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Table 1 compares against vanilla LM, retrieve-then-read, self-ask (two configurations), and reports concurrent in-context learning SoTA results from Si et al. (2022), Sun et al. (2022), Wang et al. (2022b), and Yao et al. (2022).",
    185           "source": "opus"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Baselines include contemporaneous work: self-ask (Press et al., 2022), ReAct (Yao et al., 2022), recite-and-answer (Sun et al., 2022), and self-consistency prompting (Wang et al., 2022b), all from 2022.",
    191           "source": "opus"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": false,
    196           "justification": "DSP has three stages (Demonstrate, Search, Predict) with multiple design choices (self-consistency, query fusion, multi-hop), but no systematic ablation study removes individual components to measure their contribution. The baselines are different systems, not ablated versions of DSP.",
    197           "source": "opus"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Table 1 reports both EM and F1 for Open-SQuAD and HotPotQA, and both F1 and novel-F1 (nF1) for QReCC.",
    203           "source": "opus"
    204         },
    205         "human_evaluation": {
    206           "applies": true,
    207           "answer": false,
    208           "justification": "Evaluation is entirely automated using EM and F1 metrics. No human evaluation of answer quality, groundedness, or coherence is conducted.",
    209           "source": "opus"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": false,
    214           "justification": "§3.1 explicitly states: 'We report the validation set accuracy on all three datasets.' Held-out test datasets (Open-NaturalQuestions, FEVER) are mentioned but deferred to 'a future version of this report.'",
    215           "source": "opus"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": false,
    220           "justification": "Table 1 reports only aggregate scores per dataset. No per-category breakdown (e.g., by question type, difficulty, number of hops) is provided.",
    221           "source": "opus"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "§3.5 discusses the 'self-distraction' failure mode of self-ask with a concrete example (Edgardo Mortara question producing tangential decomposition). The paper also discusses limitations of retrieve-then-read in Figure 1.",
    227           "source": "opus"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "Every experiment shows DSP outperforming all baselines. No configurations, design choices, or approaches that were tried and failed are reported. The paper also notes that branch, knn, and crossval primitives are 'work-in-progress' but doesn't discuss attempted approaches that didn't work.",
    233           "source": "opus"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "§3.2 specifies 'GPT-3.5 (text-davinci-002; Brown et al. 2020; Ouyang et al. 2022)' and 'ColBERTv2 (Santhanam et al., 2022b)' with precise model identifiers.",
    241           "source": "opus"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "§2.4 and §2.5 provide actual prompt text (e.g., 'My task is to write a simple query that gathers information...', 'My task is to answer questions using Web documents...'). The full code is also released at the GitHub repository where all prompt templates are available.",
    247           "source": "opus"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "§3.1–3.4 report key hyperparameters: greedy decoding for n=1, temperature t=0.7 for n>1, k=7 passages for open-domain QA, k=5 for multi-hop, n=20 for self-consistency, n=10 for query fusion, 16-shot training examples.",
    253           "source": "opus"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "The entire paper describes the DSP pipeline architecture in detail: the three-stage framework (Demonstrate, Search, Predict), data flow between components, control flow with code snippets, and specific configurations for each task. §2.1–2.5 provide comprehensive descriptions.",
    259           "source": "opus"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "§3.1 documents evaluation subsampling (1000 questions, 5 seeds, 200 per seed). §3.5 describes HotPotQA filtering to 'hard' examples and 90/10 train/val splits. QReCC filtering criteria are explicitly stated (removing empty answers, short conversations, and 'other interesting' questions).",
    265           "source": "opus"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "All benchmarks used are publicly available: SQuAD, HotPotQA, and QReCC datasets and their associated Wikipedia corpus dumps are downloadable.",
    273           "source": "opus"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "§3.1 and §3.5 describe which datasets were used, which splits, which Wikipedia corpus versions (Dec 2016, Nov 2017 abstracts, Dec 2018), and how evaluation was conducted across seeds.",
    279           "source": "opus"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No human participants. All data comes from standard public benchmarks (SQuAD, HotPotQA, QReCC).",
    285           "source": "opus"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The pipeline from data to results is documented: dataset selection, corpus alignment, subsampling strategy, seed-based evaluation split, and QReCC filtering criteria with explicit markers (§3.1, §3.5).",
    291           "source": "opus"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "GPT-3.5 (text-davinci-002) is used but its training data cutoff date is not stated anywhere in the paper.",
    299           "source": "opus"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No discussion of whether SQuAD (2016), HotPotQA (2018), or QReCC (2020) examples appeared in GPT-3.5's training data, despite all being publicly available before the model's training.",
    305           "source": "opus"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "All three benchmarks (SQuAD 2016, HotPotQA 2018, QReCC 2020) were published and publicly available online well before GPT-3.5's training cutoff. No contamination risk discussion is provided.",
    311           "source": "opus"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study. All evaluation is automated on NLP benchmarks.",
    319           "source": "opus"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants.",
    325           "source": "opus"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants.",
    331           "source": "opus"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants.",
    337           "source": "opus"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants.",
    343           "source": "opus"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants.",
    349           "source": "opus"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants.",
    355           "source": "opus"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "DSP programs call GPT-3.5 many times per example (e.g., n=20 self-consistency samples, multi-hop search with n=10 query fusion, plus demonstration bootstrapping) but no API costs, token counts, or latency measurements are reported.",
    363           "source": "opus"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "§3.1 mentions controlling 'the language model API spending budget' by splitting evaluation across seeds, but the actual budget, total API spend, or compute resources used are never quantified.",
    369           "source": "opus"
    370         }
    371       },
    372       "experimental_rigor": {
    373         "seed_sensitivity_reported": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "§3.1 states results are averaged across 5 seeds, but per-seed results are not reported and no sensitivity analysis shows how much results vary across seeds.",
    377           "source": "opus"
    378         },
    379         "number_of_runs_stated": {
    380           "applies": true,
    381           "answer": true,
    382           "justification": "§3.1 clearly states: 'report average quality across five seeds where each seed fixes a single k-shot training set of examples.'",
    383           "source": "opus"
    384         },
    385         "hyperparameter_search_budget": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "Various hyperparameters are set (k=7 passages, n=20 for self-consistency, k=3 demonstrations, etc.) but no search budget, search method, or justification for these choices is provided.",
    389           "source": "opus"
    390         },
    391         "best_config_selection_justified": {
    392           "applies": true,
    393           "answer": false,
    394           "justification": "Hyperparameter choices (k values for retrieval, n values for generation, number of demonstrations) appear tuned but no selection criterion or validation-based justification is described.",
    395           "source": "opus"
    396         },
    397         "multiple_comparison_correction": {
    398           "applies": false,
    399           "answer": false,
    400           "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable.",
    401           "source": "opus"
    402         },
    403         "self_comparison_bias_addressed": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "The authors implement their own baselines (vanilla LM, retrieve-then-read) and compare against their own DSP framework. They also modified the self-ask prompt for comparison. No acknowledgment of author-evaluation bias per Lucic et al. (2018).",
    407           "source": "opus"
    408         },
    409         "compute_budget_vs_performance": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "DSP uses dramatically more compute than baselines: multi-hop retrieval with n=10 query fusion, n=20 self-consistency, plus demonstration bootstrapping. The vanilla LM uses a single LM call. This compute gap is never discussed or controlled for.",
    413           "source": "opus"
    414         },
    415         "benchmark_construct_validity": {
    416           "applies": true,
    417           "answer": false,
    418           "justification": "The paper uses SQuAD, HotPotQA, and QReCC with EM/F1 metrics to evaluate 'knowledge-intensive NLP' capabilities without discussing whether these benchmarks and metrics adequately measure the claimed construct.",
    419           "source": "opus"
    420         },
    421         "scaffold_confound_addressed": {
    422           "applies": true,
    423           "answer": true,
    424           "justification": "The paper explicitly controls for the scaffold confound by holding the LM (GPT-3.5 text-davinci-002) and RM (ColBERTv2) constant across all pipeline variants. Differences are attributable to pipeline design, which is the stated variable under study.",
    425           "source": "opus"
    426         }
    427       },
    428       "data_leakage": {
    429         "temporal_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "GPT-3.5 was trained on data that may include SQuAD (2016), HotPotQA (2018), and QReCC (2020) questions and answers. No temporal leakage discussion is present.",
    433           "source": "opus"
    434         },
    435         "feature_leakage_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of whether the evaluation setup introduces information leakage (e.g., whether retrieved passages or demonstration bootstrapping provide unfair hints).",
    439           "source": "opus"
    440         },
    441         "non_independence_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No discussion of train/test independence. The 16-shot training examples are sampled from the same distribution as test examples, and the bootstrapping process uses the same retrieval corpus.",
    445           "source": "opus"
    446         },
    447         "leakage_detection_method": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "No concrete leakage detection or prevention method is applied (no canary strings, membership inference, n-gram overlap analysis, or decontamination).",
    451           "source": "opus"
    452         }
    453       }
    454     }
    455   },
    456   "claims": [
    457     {
    458       "claim": "DSP programs achieve 37–120% relative gains over vanilla LM across three knowledge-intensive QA tasks (EM on Open-SQuAD and HotPotQA, F1 on QReCC).",
    459       "evidence": "Table 1: Open-SQuAD 36.6% vs 16.2% EM (126% gain), HotPotQA 51.4% vs 28.3% EM (82% gain), QReCC 35.0 vs 29.8 F1 (17% gain).",
    460       "supported": "strong"
    461     },
    462     {
    463       "claim": "DSP achieves 8–39% relative gains over standard retrieve-then-read pipelines.",
    464       "evidence": "Table 1: Open-SQuAD 36.6% vs 33.8% EM (8%), HotPotQA 51.4% vs 36.9% EM (39%), QReCC 35.0 vs 31.6 F1 (11%).",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "DSP achieves 80–290% relative gains over the self-ask pipeline with the same ColBERTv2 retriever.",
    469       "evidence": "Table 1: HotPotQA 51.4% vs 25.2% EM (104%), Open-SQuAD 36.6% vs 9.3% EM (294%).",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "DSP's in-context learning results are competitive with fine-tuned systems such as DPR and FiD-base.",
    474       "evidence": "Open-SQuAD DSP 36.6% EM vs DPR 29.8% and FiD-base ~36%; FiD with 100 passages reaches 48% EM, which DSP does not match.",
    475       "supported": "moderate"
    476     },
    477     {
    478       "claim": "The DEMONSTRATE stage can automatically bootstrap intermediate annotations from end-task labels without hand-labeling intermediate steps.",
    479       "evidence": "The annotate mechanism is described and illustrated with code; no ablation is provided comparing bootstrapped vs hand-labeled demonstrations to verify the claim of equivalence.",
    480       "supported": "moderate"
    481     },
    482     {
    483       "claim": "Self-ask suffers from a 'self-distraction' problem that DSP programs avoid.",
    484       "evidence": "A single qualitative example in §3.5 shows self-ask misdecomposing one question; no systematic analysis of failure rates or categories is provided.",
    485       "supported": "weak"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval",
    490     "case-study"
    491   ],
    492   "key_findings": "DSP introduces a programmatic framework for composing frozen language and retrieval models through three stages — Demonstrate (bootstrap demonstrations via weak supervision), Search (multi-hop retrieval), and Predict (aggregate with self-consistency) — expressed as short Python programs. Evaluated on Open-SQuAD, HotPotQA, and QReCC using GPT-3.5 and ColBERTv2 without fine-tuning, DSP substantially outperforms vanilla in-context learning and retrieve-then-read baselines, with relative EM gains over the vanilla LM of 126% on Open-SQuAD and 82% on HotPotQA, and the largest gain over retrieve-then-read on multi-hop QA (39% EM on HotPotQA). The framework's automatic demonstration bootstrapping eliminates the need for hand-labeled intermediate steps, enabling rapid prototyping of complex retrieval-augmented pipelines. All results are on development sets only; test evaluation and additional LM choices are deferred.",
    493   "red_flags": [
    494     {
    495       "flag": "Development-set evaluation only",
    496       "detail": "All main results in Table 1 are on validation/development sets used during program development. Test sets are explicitly deferred to 'future versions of this report', raising the risk of inadvertent over-tuning to the dev sets."
    497     },
    498     {
    499       "flag": "No variance or significance reported",
    500       "detail": "Results are averaged across 5 seeds but no standard deviation, confidence intervals, or significance tests appear anywhere, making it impossible to assess whether the reported gains are statistically reliable."
    501     },
    502     {
    503       "flag": "Benchmark contamination unaddressed",
    504       "detail": "GPT-3.5 (text-davinci-002) was almost certainly trained on SQuAD (2016), HotPotQA (2018), and QReCC (2020). The training cutoff is never stated and contamination risk is never acknowledged."
    505     },
    506     {
    507       "flag": "No ablation study",
    508       "detail": "Component-level causal claims (bootstrapped demonstrations, multi-hop search, self-consistency each contribute to gains) are made but never tested; only full system vs external baselines is compared."
    509     },
    510     {
    511       "flag": "Self-evaluating own retriever",
    512       "detail": "ColBERTv2 is the authors' own prior work and is used as the sole retriever throughout, without comparing to alternatives to verify retriever-agnosticism."
    513     },
    514     {
    515       "flag": "Self-described as preliminary",
    516       "detail": "The paper itself repeatedly flags 'preliminary results' and 'early evaluations', yet makes broad contribution claims inconsistent with the scope of what was actually evaluated."
    517     }
    518   ],
    519   "cited_papers": [
    520     {
    521       "title": "Language models are few-shot learners (GPT-3)",
    522       "relevance": "Foundation for in-context learning that DSP extends; used as vanilla LM baseline methodology."
    523     },
    524     {
    525       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks (RAG)",
    526       "relevance": "Core prior work on retrieval-augmented LMs that DSP directly supersedes."
    527     },
    528     {
    529       "title": "ColBERTv2: Effective and efficient retrieval via lightweight late interaction",
    530       "relevance": "The retrieval model used in all DSP experiments; authors' own prior work."
    531     },
    532     {
    533       "title": "Chain of thought prompting elicits reasoning in large language models",
    534       "relevance": "Chain-of-thought reasoning integrated into DSP's PREDICT stage."
    535     },
    536     {
    537       "title": "Self-consistency improves chain of thought reasoning in language models",
    538       "relevance": "Self-consistency decoding strategy used in DSP's PREDICT stage for answer selection."
    539     },
    540     {
    541       "title": "Measuring and narrowing the compositionality gap in language models (self-ask)",
    542       "relevance": "Direct contemporaneous comparison baseline; DSP's 80–290% gains are measured against this."
    543     },
    544     {
    545       "title": "HotpotQA: A dataset for diverse, explainable multi-hop question answering",
    546       "relevance": "Primary multi-hop QA evaluation dataset; DSP shows its largest gains here."
    547     },
    548     {
    549       "title": "Open-domain question answering goes conversational via question rewriting (QReCC)",
    550       "relevance": "Conversational QA evaluation dataset used in this paper."
    551     }
    552   ],
    553   "engagement_factors": {
    554     "practical_relevance": {
    555       "score": 3,
    556       "justification": "DSP is released as a pip-installable Python library (stanfordnlp/dsp) that practitioners can directly use to build RAG pipelines for knowledge-intensive tasks."
    557     },
    558     "surprise_contrarian": {
    559       "score": 1,
    560       "justification": "The 'self-distraction' failure mode of self-ask is a minor novel observation, but the overall finding that structured pipelines beat naive prompting is expected."
    561     },
    562     "fear_safety": {
    563       "score": 0,
    564       "justification": "No safety, security, or risk concerns are raised or relevant to the work."
    565     },
    566     "drama_conflict": {
    567       "score": 1,
    568       "justification": "The 80–290% gains over self-ask and the 'self-distraction' critique mildly challenge that popular prompting approach, but without strong controversy."
    569     },
    570     "demo_ability": {
    571       "score": 2,
    572       "justification": "The GitHub repo is public with code examples, but requires API keys for GPT-3.5 and a ColBERTv2 index setup, making it moderate-effort to reproduce."
    573     },
    574     "brand_recognition": {
    575       "score": 2,
    576       "justification": "From Stanford NLP (Percy Liang, Matei Zaharia, Omar Khattab) — well-known in the NLP/ML community, and this became the foundation for the widely-used DSPy framework."
    577     }
    578   },
    579   "hn_data": {
    580     "threads": [
    581       {
    582         "hn_id": "34178437",
    583         "title": "Cramming: Training a Language Model on a Single GPU in One Day",
    584         "points": 6,
    585         "comments": 0,
    586         "url": "https://news.ycombinator.com/item?id=34178437",
    587         "created_at": "2022-12-29T21:44:44Z"
    588       },
    589       {
    590         "hn_id": "42209577",
    591         "title": "Cramming: Training a Language Model on a Single GPU in One Day",
    592         "points": 3,
    593         "comments": 0,
    594         "url": "https://news.ycombinator.com/item?id=42209577",
    595         "created_at": "2024-11-21T23:01:03Z"
    596       },
    597       {
    598         "hn_id": "34232125",
    599         "title": "Cramming: Training a Language Model on a Single GPU in One Day",
    600         "points": 3,
    601         "comments": 0,
    602         "url": "https://news.ycombinator.com/item?id=34232125",
    603         "created_at": "2023-01-03T14:57:13Z"
    604       },
    605       {
    606         "hn_id": "34570488",
    607         "title": "Training a Language Model on a Single GPU in One Day",
    608         "points": 2,
    609         "comments": 1,
    610         "url": "https://news.ycombinator.com/item?id=34570488",
    611         "created_at": "2023-01-29T17:40:13Z"
    612       },
    613       {
    614         "hn_id": "39968113",
    615         "title": "Cramming: Training a Language Model on a Single GPU in One Day (2022)",
    616         "points": 2,
    617         "comments": 0,
    618         "url": "https://news.ycombinator.com/item?id=39968113",
    619         "created_at": "2024-04-08T10:09:08Z"
    620       },
    621       {
    622         "hn_id": "34338363",
    623         "title": "Cramming: Training a Language Model on a Single GPU in One Day",
    624         "points": 2,
    625         "comments": 0,
    626         "url": "https://news.ycombinator.com/item?id=34338363",
    627         "created_at": "2023-01-11T13:47:58Z"
    628       },
    629       {
    630         "hn_id": "42656632",
    631         "title": "Show HN: We collected detailed annotations for text-to-image generation",
    632         "points": 2,
    633         "comments": 0,
    634         "url": "https://news.ycombinator.com/item?id=42656632",
    635         "created_at": "2025-01-10T15:47:29Z"
    636       },
    637       {
    638         "hn_id": "33522332",
    639         "title": "Championship Simulator: Architectural Simulation for Education and Competition",
    640         "points": 1,
    641         "comments": 0,
    642         "url": "https://news.ycombinator.com/item?id=33522332",
    643         "created_at": "2022-11-08T18:21:42Z"
    644       }
    645     ],
    646     "top_points": 6,
    647     "total_points": 21,
    648     "total_comments": 1
    649   }
    650 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs