scan.json (26130B)
1 { 2 "paper": { 3 "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines", 4 "authors": ["Omar Khattab", "Arnav Singhvi", "Paridhi Maheshwari", "Zhiyuan Zhang", "Keshav Santhanam", "Sri Vardhamanan", "Saiful Haq", "Ashutosh Sharma", "Thomas T. Joshi", "Hanna Moazam", "Heather Miller", "Matei Zaharia", "Christopher Potts"], 5 "year": 2023, 6 "venue": "arXiv", 7 "arxiv_id": "2310.03714" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "DSPy introduces a programming model that replaces hand-crafted prompt engineering with declarative modules and automatic compilation (teleprompters). On GSM8K, compiled DSPy programs improve GPT-3.5 from 25% to 82% and llama2-13b-chat from 9% to 47%. On HotPotQA, multi-hop programs compiled with bootstrapping achieve 45.6% answer EM with GPT-3.5 and 41.0% with llama2-13b-chat, competitive with much larger models. Small models like T5-Large (770M) can be fine-tuned via DSPy to be competitive with GPT-3.5.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The paper provides a GitHub link: https://github.com/stanfordnlp/dspy, mentioned in the abstract and Section 1." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper uses publicly available benchmarks: GSM8K (Cobbe et al., 2021) and HotPotQA (Yang et al., 2018), both standard public datasets." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No requirements.txt, Dockerfile, or detailed environment setup is described in the paper. Only model names are mentioned." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "While code is released, the paper does not include step-by-step reproduction instructions or scripts to replicate experiments." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "Tables 1 and 2 report point estimates only (e.g., '81.6', '46.9') with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims DSPy programs outperform baselines but provides no statistical significance tests — only raw accuracy comparisons." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports percentage improvements with baseline context throughout, e.g., 'from 33% to 82%' (Sec 1), 'from 32% to 46%' (Sec 1), and absolute numbers in Tables 1 and 2." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "Training set sizes (200, 300 examples) and dev/test splits are stated but not justified. No power analysis or rationale for why these sizes are adequate." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "Section 6 mentions 'average of 3–5 runs' for the fewshot setting with random sampling, but no standard deviations or variance measures are reported in the results tables." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Multiple baselines are included: zero-shot (none), few-shot with random demos, few-shot with human CoT demos, and bootstrapped variants. Also informal comparisons to prior work (Zhang et al., Wang et al., Touvron et al.)." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Comparisons include contemporary work: Wang et al. (2022b) self-consistency with PaLM-540B, Zhao et al. (2023b) with gpt-3.5-turbo, Yao et al. (2022) ReAct with PaLM-540B, and Trivedi et al. (2022)." 
73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Tables 1 and 2 systematically vary programs (vanilla, CoT, reflection, multihop) and compilation strategies (none, fewshot, bootstrap, ensemble), effectively serving as an ablation study across modules and teleprompters." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "GSM8K uses accuracy; HotPotQA uses both answer exact match (Ans) and passage retrieval accuracy (Psg) in Table 2." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation of system outputs is included. All evaluation is automated (accuracy, EM, passage accuracy)." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "GSM8K uses the official 1.3k test set separate from train/dev (Sec 6). HotPotQA reserves the official validation set for testing (Sec 7). Dev results are reported separately." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by program type, compilation strategy, and LM in Tables 1 and 2. Dev vs test results are also separated." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": false, 102 "justification": "No error analysis or failure case discussion. The paper shows only aggregate accuracy numbers without examining where or why the system fails." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Some configurations show no gain or worse performance: adding the ensemble yields no meaningful improvement for vanilla on GPT-3.5 test (61.9 with ensemble vs 61.7 for bootstrap×2), ReAct with bootstrap underperforms fewshot+human in Table 2, and zero-shot results are very poor (4-20%). These are honestly reported." 
108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims of 25%/65% improvements and 5-46%/16-40% over expert demos are supported by Tables 1 and 2. The claim that T5 and llama2-13b are competitive with GPT-3.5 is supported by HotPotQA results." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper claims compiling modules improves performance. The ablation-style evaluation (Tables 1-2) systematically varies one factor at a time (program type or compilation strategy), providing adequate evidence for these causal claims." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title says 'Self-Improving Pipelines' generally, but results are on only two tasks (GSM8K, HotPotQA) with only a few LMs. The paper acknowledges in Sec 8 that it leaves 'reporting on such tasks under controlled experimental conditions to future work' but the abstract and title overstate the generality." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No discussion of alternative explanations for the improvements. For example, the bootstrapped demonstrations might simply be better few-shot examples rather than demonstrating the value of the modular programming model itself." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper measures accuracy/EM on specific benchmarks and reports these directly without inflating them into broader claims about general AI capability. Claims match measurement granularity." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "The paper uses 'GPT-3.5' and 'llama2-13b-chat' and 'T5-Large' without specifying exact API versions or snapshot dates. 
No version like 'gpt-3.5-turbo-0613' is given." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Appendix F (Figures 9-11) provides the actual automatically generated prompts used in the experiments, including full demonstration text." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "Key hyperparameters like temperature, top-p, and max tokens are not reported. Some parameters like k=8 for few-shot and num_attempts=5 for reflection are stated in code, but LM API settings are missing." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The DSPy framework itself is the scaffolding, and it is described in extensive detail in Sections 3-4, with pseudocode in Appendices D and E for modules and teleprompters." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 6 describes sampling 200/300 examples from GSM8K training set. Section 7 describes HotPotQA splits (70/30 train/val, keeping only 'hard' examples, sampling 200/300). The retrieval index is specified as ColBERTv2 over Wikipedia 2017 abstracts." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations section. The conclusion (Sec 8) briefly mentions leaving other tasks to future work but does not discuss limitations substantively." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity are discussed. There is no consideration of whether the improvements generalize beyond the two tested tasks or whether the compilation cost is prohibitive." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what the results do NOT show. 
Section 5 states evaluation goals (H1-H3) but does not bound the scope of claims." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "No raw experimental data (individual predictions, per-example results) is made available. Only aggregate accuracy numbers are reported." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Data sources are well described: GSM8K official train/test splits (Sec 6), HotPotQA official splits with the fullwiki setting (Sec 7), with specific sample sizes." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. The paper uses standard public benchmarks." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline from training examples through compilation (bootstrapping, random search, ensembling) to evaluation is documented in Sections 4, 6, and 7 with code examples." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "The Acknowledgments section discloses funding from IBM, Oracle, Virtusa, Cigna Healthcare, HAI Azure compute grant, Stanford DAWN project (Facebook, Google, VMware), and NSF CAREER grant CNS-1651570. Omar Khattab's Apple fellowship is also disclosed." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed: Stanford, UC Berkeley, CMU, Amazon Alexa AI, Dashworks, IIT Bombay, Calera Capital, Microsoft, Two Sigma." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": true, 217 "justification": "The funders (IBM, Oracle, NSF, etc.) are not evaluated in the paper. 
The paper evaluates open models and OpenAI's GPT-3.5; none of the funders have a direct stake in DSPy's performance claims." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is included. Some authors have industry affiliations (Amazon, Microsoft, Two Sigma, Calera Capital) that are not discussed as potential conflicts." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff dates are stated for GPT-3.5 or llama2-13b-chat. The paper notes GPT-4 was 'pre-trained on a subset of GSM8K's training set' (Sec 6) but does not address this for GPT-3.5." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether GSM8K or HotPotQA data appeared in the training data of GPT-3.5 or Llama2 models." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "GSM8K (2021) and HotPotQA (2018) were both available online before GPT-3.5 and Llama2 training. The paper only mentions GPT-4's known contamination with GSM8K training data but does not address contamination for the models actually used." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 
262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference costs reported. The reflection program calls the LM 5 times per example, and bootstrap compilation runs the program thousands of times, but costs are not quantified." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "The paper says compilation runs 'on the order of minutes (or tens of minutes)' (Sec 6) but provides no specific compute budget, GPU hours, or API costs." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No seed sensitivity analysis. The paper mentions 'average of 3–5 runs' for the fewshot setting but does not report variance across seeds for bootstrapped results." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "Section 6 states 'We report the average of 3–5 runs (depending on the setting) when applying such random sampling' for the fewshot compiler. However, it's unclear how many runs other compilation strategies use." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "BootstrapFewShotWithRandomSearch is used but the total number of trials, configurations explored, and compute spent on search is not reported in the main experiments." 
306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": true, 310 "justification": "Section 6 states 'We report extensive comparisons on the development set to avoid overfitting on test.' Best configurations are selected on dev before test evaluation. The random search optimizes over a validation set (Appendix E.2)." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "Many comparisons are made across programs, compilers, and LMs without any correction for multiple comparisons." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors implement DSPy and evaluate it against their own implementations. No acknowledgment of self-comparison bias per Lucic et al. (2018)." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "Bootstrap and ensemble methods use substantially more compute than simple few-shot, but performance is not reported as a function of compute budget. The reflection program calls the LM 5x more per example than vanilla." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether GSM8K and HotPotQA actually measure the capabilities DSPy claims to improve. The paper takes benchmark validity for granted." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": true, 335 "justification": "DSPy IS the scaffold being evaluated. The paper systematically compares the same scaffold (DSPy programs) across different LMs (GPT-3.5, Llama2-13b, T5), isolating the model variable. When comparing compilation strategies, the same model is held constant." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "GSM8K (2021) and HotPotQA (2018) predate GPT-3.5 and Llama2 training. 
The paper mentions GPT-4's GSM8K contamination but does not address temporal leakage for the models actually evaluated." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup leaks information. The bootstrapped demonstrations come from the same distribution as the test set, but this is not addressed." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of train/test independence or whether bootstrapped demonstrations could overlap with or be similar to test examples." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention methods are used." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "DSPy programs compiled with bootstrapping outperform standard few-shot prompting by over 25% for GPT-3.5 and 65% for llama2-13b-chat", 364 "evidence": "Table 1 shows GSM8K vanilla fewshot at 33.1% vs bootstrap×2 at 61.7% for GPT-3.5 (28.6pp gain), and 4.3% vs 36.5% for Llama2 (32.2pp gain). CoT bootstrap reaches 72.9% vs fewshot 63.0% for GPT-3.5.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "DSPy can match or outperform expert-created human reasoning chain demonstrations", 369 "evidence": "Table 1, GSM8K CoT: bootstrap (72.9% test) vs human CoT fewshot (72.4% test) for GPT-3.5. Table 2, HotPotQA react: bootstrap×2 (40.0% dev) vs human demos (28.3% dev) for Llama2.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "Small open LMs compiled with DSPy are competitive with proprietary GPT-3.5 using expert prompts", 374 "evidence": "Table 2 HotPotQA: Llama2-13b multihop ensemble achieves 41.0% vs GPT-3.5 multihop ensemble 45.6%. 
T5-Large (770M) achieves 39.3% dev via finetuning (Sec 7).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "A few lines of DSPy code can express sophisticated multi-stage NLP pipelines", 379 "evidence": "Section 6 shows the ThoughtReflection program in ~10 lines, and Section 7 shows BasicMultiHop in ~15 lines. Both achieve strong results when compiled.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "DSPy compilation improves programs from 4-20% to 49-88% accuracy on GSM8K", 384 "evidence": "Table 1: vanilla none (24.0/7.0 dev) to reflection bootstrap+ensemble (86.7/49.0 dev) for GPT-3.5/Llama2. The 88.3% figure is CoT ensemble on GPT-3.5 dev.", 385 "supported": "strong" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "No variance or uncertainty quantification", 391 "detail": "Despite averaging over 3-5 runs for some settings, no standard deviations, confidence intervals, or error bars are reported. It's impossible to assess whether differences between methods are meaningful or within noise." 392 }, 393 { 394 "flag": "Benchmark contamination unaddressed", 395 "detail": "GSM8K and HotPotQA were publicly available years before GPT-3.5 and Llama2 training. The paper even notes GPT-4 trained on GSM8K data but doesn't address this for the models it actually evaluates." 396 }, 397 { 398 "flag": "Self-evaluation without acknowledgment", 399 "detail": "The authors designed DSPy and evaluate it. All baselines (including zero-shot and few-shot variants) are run within the DSPy framework, meaning the framework authors control both the system and the evaluation setup." 400 }, 401 { 402 "flag": "Informal comparisons to prior work", 403 "detail": "Sections 6 and 7 compare informally to prior work ('we can informally compare') with different models, evaluation splits, and settings. These comparisons are not controlled and could be misleading." 
404 }, 405 { 406 "flag": "No limitations section", 407 "detail": "The paper has no dedicated limitations section discussing failure modes, scalability concerns, or settings where DSPy would not work well." 408 }, 409 { 410 "flag": "Missing cost analysis", 411 "detail": "Bootstrapping runs programs thousands of times, and ensemble/reflection multiply inference costs. No cost analysis is provided despite this being a major practical concern." 412 } 413 ], 414 "cited_papers": [ 415 { 416 "title": "Chain of thought prompting elicits reasoning in large language models", 417 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 418 "year": 2022, 419 "arxiv_id": "2201.11903", 420 "relevance": "Foundational prompting technique that DSPy modularizes; key baseline for comparison." 421 }, 422 { 423 "title": "ReAct: Synergizing reasoning and acting in language models", 424 "authors": ["Shunyu Yao", "Jeffrey Zhao"], 425 "year": 2022, 426 "arxiv_id": "2210.03629", 427 "relevance": "Agent framework that DSPy implements as a module; directly compared in HotPotQA experiments." 428 }, 429 { 430 "title": "Self-consistency improves chain of thought reasoning in language models", 431 "authors": ["Xuezhi Wang", "Jason Wei"], 432 "year": 2022, 433 "arxiv_id": "2203.11171", 434 "relevance": "Prompting technique related to DSPy's ensemble strategy; compared on GSM8K." 435 }, 436 { 437 "title": "Language models are few-shot learners", 438 "authors": ["Tom Brown"], 439 "year": 2020, 440 "relevance": "Foundational in-context learning work that DSPy builds upon and seeks to automate." 441 }, 442 { 443 "title": "Retrieval-augmented generation for knowledge-intensive nlp tasks", 444 "authors": ["Patrick Lewis"], 445 "year": 2020, 446 "relevance": "RAG paradigm that DSPy implements as a composable module pattern." 
447 }, 448 { 449 "title": "Automatic chain of thought prompting in large language models", 450 "authors": ["Zhuosheng Zhang"], 451 "year": 2022, 452 "arxiv_id": "2210.03493", 453 "relevance": "Automatic CoT method compared on GSM8K; related approach to DSPy's bootstrapping." 454 }, 455 { 456 "title": "Self-refine: Iterative refinement with self-feedback", 457 "authors": ["Aman Madaan"], 458 "year": 2023, 459 "arxiv_id": "2303.17651", 460 "relevance": "Self-improvement technique for LMs that DSPy generalizes through its module and compiler system." 461 }, 462 { 463 "title": "Toolformer: Language models can teach themselves to use tools", 464 "authors": ["Timo Schick"], 465 "year": 2023, 466 "arxiv_id": "2302.04761", 467 "relevance": "Tool-use approach for LMs relevant to DSPy's tool integration capabilities." 468 }, 469 { 470 "title": "Demonstrate-search-predict: Composing retrieval and language models for knowledge-intensive nlp", 471 "authors": ["Omar Khattab"], 472 "year": 2022, 473 "arxiv_id": "2212.14024", 474 "relevance": "DSPy's predecessor framework (DSP); directly evolved into the DSPy programming model." 475 }, 476 { 477 "title": "Automatic prompt optimization with gradient descent and beam search", 478 "authors": ["Reid Pryzant"], 479 "year": 2023, 480 "arxiv_id": "2305.03495", 481 "relevance": "Prompt optimization technique related to DSPy's teleprompter concept." 482 }, 483 { 484 "title": "Large language models as optimizers", 485 "authors": ["Chengrun Yang"], 486 "year": 2023, 487 "arxiv_id": "2309.03409", 488 "relevance": "LLM-based optimization approach relevant to DSPy's automated prompt tuning." 489 }, 490 { 491 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 492 "authors": ["Lingjiao Chen"], 493 "year": 2023, 494 "arxiv_id": "2305.05176", 495 "relevance": "Cost-aware LLM usage relevant to DSPy's model selection and compilation capabilities." 496 } 497 ] 498 }