scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32344B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
      6     "authors": [
      7       "Omar Khattab",
      8       "Arnav Singhvi",
      9       "Paridhi Maheshwari",
     10       "Zhiyuan Zhang",
     11       "Keshav Santhanam",
     12       "Sri Vardhamanan",
     13       "Saiful Haq",
     14       "Ashutosh Sharma",
     15       "Thomas T. Joshi",
     16       "Hanna Moazam",
     17       "Heather Miller",
     18       "Matei Zaharia",
     19       "Christopher Potts"
     20     ],
     21     "year": 2023,
     22     "venue": "arXiv.org",
     23     "arxiv_id": "2310.03714",
     24     "doi": null
     25   },
     26   "checklist": {
     27     "claims_and_evidence": {
     28       "abstract_claims_supported": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Abstract claims of 25%/65% improvements and 5-46%/16-40% over expert demos are supported by Tables 1 and 2. The claim that T5 and llama2-13b are competitive with GPT-3.5 is supported by HotPotQA results.",
     32         "source": "opus"
     33       },
     34       "causal_claims_justified": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "The paper claims compiling modules improves performance. The ablation-style evaluation (Tables 1-2) systematically varies one factor at a time (program type or compilation strategy), providing adequate evidence for these causal claims.",
     38         "source": "opus"
     39       },
     40       "generalization_bounded": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The title says 'Self-Improving Pipelines' generally, but results are on only two tasks (GSM8K, HotPotQA) with only a few LMs. The paper acknowledges in Sec 8 that it leaves 'reporting on such tasks under controlled experimental conditions to future work' but the abstract and title overstate the generality.",
     44         "source": "opus"
     45       },
     46       "alternative_explanations_discussed": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "No discussion of alternative explanations for the improvements. For example, the bootstrapped demonstrations might simply be better few-shot examples rather than demonstrating the value of the modular programming model itself.",
     50         "source": "opus"
     51       },
     52       "proxy_outcome_distinction": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper measures accuracy/EM on specific benchmarks and reports these directly without inflating them into broader claims about general AI capability. Claims match measurement granularity.",
     56         "source": "opus"
     57       }
     58     },
     59     "limitations_and_scope": {
     60       "limitations_section_present": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "There is no dedicated limitations section. The conclusion (Sec 8) briefly mentions leaving other tasks to future work but does not discuss limitations substantively.",
     64         "source": "opus"
     65       },
     66       "threats_to_validity_specific": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No threats to validity are discussed. There is no consideration of whether the improvements generalize beyond the two tested tasks or whether the compilation cost is prohibitive.",
     70         "source": "opus"
     71       },
     72       "scope_boundaries_stated": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The paper does not explicitly state what the results do NOT show. Section 5 states evaluation goals (H1-H3) but does not bound the scope of claims.",
     76         "source": "opus"
     77       }
     78     },
     79     "conflicts_of_interest": {
     80       "funding_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The Acknowledgments section discloses funding from IBM, Oracle, Virtusa, Cigna Healthcare, HAI Azure compute grant, Stanford DAWN project (Facebook, Google, VMware), and NSF CAREER grant CNS-1651570. Omar Khattab's Apple fellowship is also disclosed.",
     84         "source": "opus"
     85       },
     86       "affiliations_disclosed": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Author affiliations are listed: Stanford, UC Berkeley, CMU, Amazon Alexa AI, Dashworks, IIT Bombay, Calera Capital, Microsoft, Two Sigma.",
     90         "source": "opus"
     91       },
     92       "funder_independent_of_outcome": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "The funders (IBM, Oracle, NSF, etc.) are not evaluated in the paper. The paper evaluates open models and OpenAI's GPT-3.5; none of the funders have a direct stake in DSPy's performance claims.",
     96         "source": "opus"
     97       },
     98       "financial_interests_declared": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No competing interests or financial interests statement is included. Some authors have industry affiliations (Amazon, Microsoft, Two Sigma, Calera Capital) that are not discussed as potential conflicts.",
    102         "source": "opus"
    103       }
    104     },
    105     "scope_and_framing": {
    106       "key_terms_defined": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Key DSPy-specific terms (signatures, modules, teleprompters, text transformation graph) are precisely defined in Section 3 with formal descriptions and code examples.",
    110         "source": "haiku"
    111       },
    112       "intended_contribution_clear": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper explicitly states its contribution: 'the first programming model that translates prompting techniques into parameterized declarative modules and introduces an effective compiler with general optimization strategies.'",
    116         "source": "haiku"
    117       },
    118       "engagement_with_prior_work": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 2 situates DSPy relative to deep learning frameworks (PyTorch, Theano), in-context learning literature, tool use pipelines, and existing toolkits (LangChain, LlamaIndex), showing how it extends and differs from each.",
    122         "source": "haiku"
    123       }
    124     }
    125   },
    126   "type_checklist": {
    127     "empirical": {
    128       "artifacts": {
    129         "code_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "The paper provides a GitHub link: https://github.com/stanfordnlp/dspy, mentioned in the abstract and Section 1.",
    133           "source": "opus"
    134         },
    135         "data_released": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "The paper uses publicly available benchmarks: GSM8K (Cobbe et al., 2021) and HotPotQA (Yang et al., 2018), both standard public datasets.",
    139           "source": "opus"
    140         },
    141         "environment_specified": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No requirements.txt, Dockerfile, or detailed environment setup is described in the paper. Only model names are mentioned.",
    145           "source": "opus"
    146         },
    147         "reproduction_instructions": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "While code is released, the paper does not include step-by-step reproduction instructions or scripts to replicate experiments.",
    151           "source": "opus"
    152         }
    153       },
    154       "statistical_methodology": {
    155         "confidence_intervals_or_error_bars": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "Tables 1 and 2 report point estimates only (e.g., '81.6', '46.9') with no confidence intervals or error bars.",
    159           "source": "opus"
    160         },
    161         "significance_tests": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "The paper claims DSPy programs outperform baselines but provides no statistical significance tests — only raw accuracy comparisons.",
    165           "source": "opus"
    166         },
    167         "effect_sizes_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "The paper reports percentage improvements with baseline context throughout, e.g., 'from 33% to 82%' (Sec 1), 'from 32% to 46%' (Sec 1), and absolute numbers in Tables 1 and 2.",
    171           "source": "opus"
    172         },
    173         "sample_size_justified": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Training set sizes (200, 300 examples) and dev/test splits are stated but not justified. No power analysis or rationale for why these sizes are adequate.",
    177           "source": "opus"
    178         },
    179         "variance_reported": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "Section 6 mentions 'average of 3–5 runs' for the fewshot setting with random sampling, but no standard deviations or variance measures are reported in the results tables.",
    183           "source": "opus"
    184         }
    185       },
    186       "evaluation_design": {
    187         "baselines_included": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Multiple baselines are included: zero-shot (none), few-shot with random demos, few-shot with human CoT demos, and bootstrapped variants. Also informal comparisons to prior work (Zhang et al., Wang et al., Touvron et al.).",
    191           "source": "opus"
    192         },
    193         "baselines_contemporary": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Comparisons include contemporary work: Wang et al. (2022b) self-consistency with PaLM-540B, Zhao et al. (2023b) with gpt-3.5-turbo, Yao et al. (2022) ReAct with PaLM-540B, and Trivedi et al. (2022).",
    197           "source": "opus"
    198         },
    199         "ablation_study": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Tables 1 and 2 systematically vary programs (vanilla, CoT, reflection, multihop) and compilation strategies (none, fewshot, bootstrap, ensemble), effectively serving as an ablation study across modules and teleprompters.",
    203           "source": "opus"
    204         },
    205         "multiple_metrics": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "GSM8K uses accuracy; HotPotQA uses both answer exact match (Ans) and passage retrieval accuracy (Psg) in Table 2.",
    209           "source": "opus"
    210         },
    211         "human_evaluation": {
    212           "applies": true,
    213           "answer": false,
    214           "justification": "No human evaluation of system outputs is included. All evaluation is automated (accuracy, EM, passage accuracy).",
    215           "source": "opus"
    216         },
    217         "held_out_test_set": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "GSM8K uses the official 1.3k test set separate from train/dev (Sec 6). HotPotQA reserves the official validation set for testing (Sec 7). Dev results are reported separately.",
    221           "source": "opus"
    222         },
    223         "per_category_breakdown": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Results are broken down by program type, compilation strategy, and LM in Tables 1 and 2. Dev vs test results are also separated.",
    227           "source": "opus"
    228         },
    229         "failure_cases_discussed": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "No error analysis or failure case discussion. The paper shows only aggregate accuracy numbers without examining where or why the system fails.",
    233           "source": "opus"
    234         },
    235         "negative_results_reported": {
    236           "applies": true,
    237           "answer": true,
     238           "justification": "Some configurations show flat or worse performance: ensembling gives essentially no change for vanilla on GPT-3.5 test (61.9 vs 61.7), ReAct with bootstrap underperforms fewshot+human in Table 2, and zero-shot results are very poor (4-20%). These are honestly reported.",
    239           "source": "opus"
    240         }
    241       },
    242       "setup_transparency": {
    243         "model_versions_specified": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "The paper uses 'GPT-3.5' and 'llama2-13b-chat' and 'T5-Large' without specifying exact API versions or snapshot dates. No version like 'gpt-3.5-turbo-0613' is given.",
    247           "source": "opus"
    248         },
    249         "prompts_provided": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Appendix F (Figures 9-11) provides the actual automatically generated prompts used in the experiments, including full demonstration text.",
    253           "source": "opus"
    254         },
    255         "hyperparameters_reported": {
    256           "applies": true,
    257           "answer": false,
    258           "justification": "Key hyperparameters like temperature, top-p, and max tokens are not reported. Some parameters like k=8 for few-shot and num_attempts=5 for reflection are stated in code, but LM API settings are missing.",
    259           "source": "opus"
    260         },
    261         "scaffolding_described": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "The DSPy framework itself is the scaffolding, and it is described in extensive detail in Sections 3-4, with pseudocode in Appendices D and E for modules and teleprompters.",
    265           "source": "opus"
    266         },
    267         "data_preprocessing_documented": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Section 6 describes sampling 200/300 examples from GSM8K training set. Section 7 describes HotPotQA splits (70/30 train/val, keeping only 'hard' examples, sampling 200/300). The retrieval index is specified as ColBERTv2 over Wikipedia 2017 abstracts.",
    271           "source": "opus"
    272         }
    273       },
    274       "data_integrity": {
    275         "raw_data_available": {
    276           "applies": true,
    277           "answer": false,
    278           "justification": "No raw experimental data (individual predictions, per-example results) is made available. Only aggregate accuracy numbers are reported.",
    279           "source": "opus"
    280         },
    281         "data_collection_described": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Data sources are well described: GSM8K official train/test splits (Sec 6), HotPotQA official splits with the fullwiki setting (Sec 7), with specific sample sizes.",
    285           "source": "opus"
    286         },
    287         "recruitment_methods_described": {
    288           "applies": false,
    289           "answer": false,
    290           "justification": "No human participants. The paper uses standard public benchmarks.",
    291           "source": "opus"
    292         },
    293         "data_pipeline_documented": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "The pipeline from training examples through compilation (bootstrapping, random search, ensembling) to evaluation is documented in Sections 4, 6, and 7 with code examples.",
    297           "source": "opus"
    298         }
    299       },
    300       "contamination": {
    301         "training_cutoff_stated": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "No training data cutoff dates are stated for GPT-3.5 or llama2-13b-chat. The paper notes GPT-4 was 'pre-trained on a subset of GSM8K's training set' (Sec 6) but does not address this for GPT-3.5.",
    305           "source": "opus"
    306         },
    307         "train_test_overlap_discussed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "No discussion of whether GSM8K or HotPotQA data appeared in the training data of GPT-3.5 or Llama2 models.",
    311           "source": "opus"
    312         },
    313         "benchmark_contamination_addressed": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "GSM8K (2021) and HotPotQA (2018) were both available online before GPT-3.5 and Llama2 training. The paper only mentions GPT-4's known contamination with GSM8K training data but does not address contamination for the models actually used.",
    317           "source": "opus"
    318         }
    319       },
    320       "human_studies": {
    321         "pre_registered": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "opus"
    326         },
    327         "irb_or_ethics_approval": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "opus"
    332         },
    333         "demographics_reported": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "inclusion_exclusion_criteria": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "randomization_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         },
    351         "blinding_described": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "opus"
    356         },
    357         "attrition_reported": {
    358           "applies": false,
    359           "answer": false,
    360           "justification": "No human participants in this study.",
    361           "source": "opus"
    362         }
    363       },
    364       "cost_and_practicality": {
    365         "inference_cost_reported": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No inference costs reported. The reflection program calls the LM 5 times per example, and bootstrap compilation runs the program thousands of times, but costs are not quantified.",
    369           "source": "opus"
    370         },
    371         "compute_budget_stated": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "The paper says compilation runs 'on the order of minutes (or tens of minutes)' (Sec 6) but provides no specific compute budget, GPU hours, or API costs.",
    375           "source": "opus"
    376         }
    377       },
    378       "experimental_rigor": {
    379         "seed_sensitivity_reported": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "No seed sensitivity analysis. The paper mentions 'average of 3–5 runs' for the fewshot setting but does not report variance across seeds for bootstrapped results.",
    383           "source": "opus"
    384         },
    385         "number_of_runs_stated": {
    386           "applies": true,
    387           "answer": true,
    388           "justification": "Section 6 states 'We report the average of 3–5 runs (depending on the setting) when applying such random sampling' for the fewshot compiler. However, it's unclear how many runs other compilation strategies use.",
    389           "source": "opus"
    390         },
    391         "hyperparameter_search_budget": {
    392           "applies": true,
    393           "answer": false,
    394           "justification": "BootstrapFewShotWithRandomSearch is used but the total number of trials, configurations explored, and compute spent on search is not reported in the main experiments.",
    395           "source": "opus"
    396         },
    397         "best_config_selection_justified": {
    398           "applies": true,
    399           "answer": true,
    400           "justification": "Section 6 states 'We report extensive comparisons on the development set to avoid overfitting on test.' Best configurations are selected on dev before test evaluation. The random search optimizes over a validation set (Appendix E.2).",
    401           "source": "opus"
    402         },
    403         "multiple_comparison_correction": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "Many comparisons are made across programs, compilers, and LMs without any correction for multiple comparisons.",
    407           "source": "opus"
    408         },
    409         "self_comparison_bias_addressed": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "The authors implement DSPy and evaluate it against their own implementations. No acknowledgment of self-comparison bias per Lucic et al. (2018).",
    413           "source": "opus"
    414         },
    415         "compute_budget_vs_performance": {
    416           "applies": true,
    417           "answer": false,
    418           "justification": "Bootstrap and ensemble methods use substantially more compute than simple few-shot, but performance is not reported as a function of compute budget. The reflection program calls the LM 5x more per example than vanilla.",
    419           "source": "opus"
    420         },
    421         "benchmark_construct_validity": {
    422           "applies": true,
    423           "answer": false,
    424           "justification": "No discussion of whether GSM8K and HotPotQA actually measure the capabilities DSPy claims to improve. The paper takes benchmark validity for granted.",
    425           "source": "opus"
    426         },
    427         "scaffold_confound_addressed": {
    428           "applies": true,
    429           "answer": true,
    430           "justification": "DSPy IS the scaffold being evaluated. The paper systematically compares the same scaffold (DSPy programs) across different LMs (GPT-3.5, Llama2-13b, T5), isolating the model variable. When comparing compilation strategies, the same model is held constant.",
    431           "source": "opus"
    432         }
    433       },
    434       "data_leakage": {
    435         "temporal_leakage_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "GSM8K (2021) and HotPotQA (2018) predate GPT-3.5 and Llama2 training. The paper mentions GPT-4's GSM8K contamination but does not address temporal leakage for the models actually evaluated.",
    439           "source": "opus"
    440         },
    441         "feature_leakage_addressed": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No discussion of whether the evaluation setup leaks information. The bootstrapped demonstrations come from the same distribution as the test set, but this is not addressed.",
    445           "source": "opus"
    446         },
    447         "non_independence_addressed": {
    448           "applies": true,
    449           "answer": false,
    450           "justification": "No discussion of train/test independence or whether bootstrapped demonstrations could overlap with or be similar to test examples.",
    451           "source": "opus"
    452         },
    453         "leakage_detection_method": {
    454           "applies": true,
    455           "answer": false,
    456           "justification": "No leakage detection or prevention methods are used.",
    457           "source": "opus"
    458         }
    459       }
    460     }
    461   },
    462   "claims": [
    463     {
    464       "claim": "DSPy bootstrapped programs outperform expert-crafted human CoT prompts on GSM8K by 5-46% (GPT-3.5) and 16-40% (Llama2-13b-chat)",
    465       "evidence": "Table 1: CoT bootstrap 80.3% vs human CoT fewshot 78.6% for GPT-3.5; reflection bootstrap 76.0% vs human CoT 72.4% test for GPT-3.5",
    466       "supported": "moderate"
    467     },
    468     {
    469       "claim": "DSPy compilation raises GPT-3.5 on GSM8K from 4-20% (zero-shot/few-shot) to 49-88% accuracy",
    470       "evidence": "Table 1: vanilla none 24.0% dev → CoT+ensemble bootstrap 88.3% dev; test results confirm 81.6%",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "Llama2-13b-chat compiled with DSPy is competitive with Llama2-34b on GSM8K without human reasoning chains",
    475       "evidence": "Llama2-13b-chat reflection ensemble reaches 46.9% test, compared to Llama2-34b's 42.2% reported in Touvron et al. (2023)",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "T5-Large (770M parameters) compiled via DSPy achieves competitive HotPotQA performance vs larger proprietary models",
    480       "evidence": "multihop_t5 scores 39.3% answer EM and 46.0% passage accuracy on dev using only 200 labeled + 800 unlabeled inputs",
    481       "supported": "moderate"
    482     },
    483     {
    484       "claim": "DSPy multihop program achieves 45.6% answer EM on HotPotQA test with GPT-3.5 (ensemble)",
    485       "evidence": "Table 2: multihop ensemble GPT-3.5 test 45.6% EM, evaluated on 50% of test set due to cost",
    486       "supported": "weak"
    487     },
    488     {
    489       "claim": "LangChain relies on 50+ hard-coded prompt strings exceeding 1000 characters each; DSPy contains none",
    490       "evidence": "Appendix B: informal analysis of LangChain codebase in late September 2023 counting 50 strings >1000 chars and 12/42 prompts.py files",
    491       "supported": "weak"
    492     }
    493   ],
    494   "methodology_tags": [
    495     "benchmark-eval",
    496     "case-study"
    497   ],
     498   "key_findings": "DSPy introduces a programming model for LM pipelines that replaces hand-crafted prompts with parameterized declarative modules compiled via bootstrapped few-shot demonstrations. Across two case studies (GSM8K and HotPotQA), compilation consistently outperforms zero-shot, random few-shot, and expert-crafted human prompts, improving GPT-3.5 from 33% to 82% on GSM8K math problems. Smaller open-source models (Llama2-13b-chat, T5-Large 770M) compiled with DSPy can match or exceed the performance of uncompiled larger or proprietary models. The bootstrapping approach is task-agnostic and requires only a few training examples with optional final-output labels.",
    499   "red_flags": [
    500     {
    501       "flag": "No statistical testing",
    502       "detail": "Despite reporting numerical comparisons across many conditions and models, the paper uses no significance tests and reports no confidence intervals, making it impossible to determine if improvements are reliable."
    503     },
    504     {
    505       "flag": "No variance reported",
    506       "detail": "Only means of 3-5 runs are reported without standard deviation; given LM stochasticity, the spread could be substantial relative to some reported differences."
    507     },
    508     {
    509       "flag": "Two-task generalization",
    510       "detail": "Broad claims about DSPy as a general LM pipeline paradigm rest on only two NLP benchmark tasks (math word problems and multi-hop QA), limiting generalizability."
    511     },
    512     {
    513       "flag": "Model version underspecified",
    514       "detail": "GPT-3.5 is used without a snapshot date; GPT-3.5-turbo capabilities vary significantly by deployment date, making exact reproduction difficult."
    515     },
    516     {
    517       "flag": "Benchmark contamination unaddressed",
    518       "detail": "GSM8K and HotPotQA were available before training cutoffs of both GPT-3.5 and Llama2; the paper does not discuss or assess potential contamination effects on results."
    519     },
    520     {
    521       "flag": "No limitations section",
    522       "detail": "The paper has no dedicated limitations or threats-to-validity section; failure modes, scope limits, and task sensitivity of DSPy are not analyzed."
    523     },
    524     {
    525       "flag": "Partial test set evaluation",
    526       "detail": "The best HotPotQA result (multihop ensemble) is evaluated on only 50% of the test set 'due to cost', weakening the reliability of the headline result."
    527     }
    528   ],
    529   "cited_papers": [
    530     {
    531       "title": "Chain of Thought Prompting Elicits Reasoning in Large Language Models",
    532       "relevance": "Foundational prompting technique that DSPy abstracts into a parameterized module (ChainOfThought)"
    533     },
    534     {
    535       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    536       "relevance": "Agent loop prompting technique implemented as a built-in DSPy module and used as baseline in HotPotQA evaluation"
    537     },
    538     {
    539       "title": "HotPotQA: A Dataset for Diverse, Explainable Multi-Hop Question Answering",
    540       "relevance": "Primary evaluation benchmark for multi-hop retrieval case study"
    541     },
    542     {
    543       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    544       "relevance": "Primary evaluation benchmark for math word problem case study"
    545     },
    546     {
    547       "title": "Demonstrate-Search-Predict: Composing Retrieval and Language Models for Knowledge-Intensive NLP",
    548       "relevance": "Direct predecessor to DSPy from the same authors; DSPy is the second iteration of the DSP framework"
    549     },
    550     {
    551       "title": "Language Models are Few-Shot Learners (GPT-3)",
    552       "relevance": "Foundation for in-context learning paradigm that DSPy's compilation optimizes"
    553     },
    554     {
    555       "title": "Training Language Models to Follow Instructions with Human Feedback (InstructGPT)",
    556       "relevance": "Instruction tuning context that enables the prompting techniques DSPy abstracts"
    557     },
    558     {
    559       "title": "Interleaving Retrieval with Chain-of-Thought Reasoning for Knowledge-Intensive Multi-Step Questions (IRCoT)",
    560       "relevance": "Related multi-hop reasoning approach used for comparison in HotPotQA evaluation"
    561     }
    562   ],
    563   "engagement_factors": {
    564     "practical_relevance": {
    565       "score": 3,
    566       "justification": "DSPy is publicly available and actively used by practitioners to build LM pipelines without manual prompt engineering."
    567     },
    568     "surprise_contrarian": {
    569       "score": 2,
    570       "justification": "Challenges the prevailing paradigm that expert-crafted prompts are necessary, showing automated compilation can match or exceed human-written prompts."
    571     },
    572     "fear_safety": {
    573       "score": 0,
    574       "justification": "No AI safety, risk, or security concerns are raised; the paper is purely about pipeline optimization."
    575     },
    576     "drama_conflict": {
    577       "score": 1,
    578       "justification": "Includes a pointed informal critique of LangChain and LlamaIndex as prompt-engineering-dependent, positioning DSPy competitively."
    579     },
    580     "demo_ability": {
    581       "score": 3,
    582       "justification": "Code is publicly available at github.com/stanfordnlp/dspy with examples that can be run immediately; the paper includes complete executable code snippets."
    583     },
    584     "brand_recognition": {
    585       "score": 2,
    586       "justification": "Stanford and UC Berkeley authors with established names in the retrieval/NLP space (Khattab, Zaharia, Potts), but not a big lab release."
    587     }
    588   },
    589   "hn_data": {
    590     "threads": [
    591       {
    592         "hn_id": "42168997",
    593         "title": "It's time to replace TCP in the datacenter (2023)",
    594         "points": 189,
    595         "comments": 156,
    596         "url": "https://news.ycombinator.com/item?id=42168997",
    597         "created_at": "2024-11-18T01:42:41Z"
    598       },
    599       {
    600         "hn_id": "34337707",
    601         "title": "“A Handbook of Integer Sequences” Fifty Years Later",
    602         "points": 139,
    603         "comments": 45,
    604         "url": "https://news.ycombinator.com/item?id=34337707",
    605         "created_at": "2023-01-11T12:37:58Z"
    606       },
    607       {
    608         "hn_id": "33088928",
    609         "title": "It's time to replace TCP in the Datacenter",
    610         "points": 6,
    611         "comments": 1,
    612         "url": "https://news.ycombinator.com/item?id=33088928",
    613         "created_at": "2022-10-04T23:56:57Z"
    614       },
    615       {
    616         "hn_id": "37805651",
    617         "title": "Agent Instructs Large Language Models to Be General Zero-Shot Reasoners",
    618         "points": 5,
    619         "comments": 0,
    620         "url": "https://news.ycombinator.com/item?id=37805651",
    621         "created_at": "2023-10-07T21:17:40Z"
    622       },
    623       {
    624         "hn_id": "38561645",
    625         "title": "Relightable Gaussian Codec Avatars",
    626         "points": 4,
    627         "comments": 0,
    628         "url": "https://news.ycombinator.com/item?id=38561645",
    629         "created_at": "2023-12-07T20:50:41Z"
    630       },
    631       {
    632         "hn_id": "33151628",
    633         "title": "Integration of Skyline Queries into Spark SQL",
    634         "points": 3,
    635         "comments": 1,
    636         "url": "https://news.ycombinator.com/item?id=33151628",
    637         "created_at": "2022-10-10T14:10:30Z"
    638       },
    639       {
    640         "hn_id": "24766804",
    641         "title": "Abductive Knowledge Induction from Raw Data",
    642         "points": 3,
    643         "comments": 0,
    644         "url": "https://news.ycombinator.com/item?id=24766804",
    645         "created_at": "2020-10-13T15:59:02Z"
    646       },
    647       {
    648         "hn_id": "41820840",
    649         "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
    650         "points": 2,
    651         "comments": 0,
    652         "url": "https://news.ycombinator.com/item?id=41820840",
    653         "created_at": "2024-10-12T17:30:26Z"
    654       },
    655       {
    656         "hn_id": "37776712",
    657         "title": "Large Language Models as Analogical Reasoners",
    658         "points": 2,
    659         "comments": 1,
    660         "url": "https://news.ycombinator.com/item?id=37776712",
    661         "created_at": "2023-10-05T10:04:39Z"
    662       },
    663       {
    664         "hn_id": "34364348",
    665         "title": "Exoshuffle-CloudSort",
    666         "points": 2,
    667         "comments": 1,
    668         "url": "https://news.ycombinator.com/item?id=34364348",
    669         "created_at": "2023-01-13T05:40:48Z"
    670       }
    671     ],
    672     "top_points": 189,
    673     "total_points": 355,
    674     "total_comments": 205
    675   }
    676 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs