scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (30055B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
      6     "authors": [
      7       "Omar Khattab",
      8       "Arnav Singhvi",
      9       "Paridhi Maheshwari",
     10       "Zhiyuan Zhang",
     11       "Keshav Santhanam",
     12       "Sri Vardhamanan",
     13       "Saiful Haq",
     14       "Ashutosh Sharma",
     15       "Thomas T. Joshi",
     16       "Hanna Moazam",
     17       "Heather Miller",
     18       "Matei Zaharia",
     19       "Christopher Potts"
     20     ],
     21     "year": 2023,
     22     "venue": "arXiv.org",
     23     "arxiv_id": "2310.03714",
     24     "doi": null
     25   },
     26   "checklist": {
     27     "claims_and_evidence": {
     28       "abstract_claims_supported": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The abstract claims DSPy outperforms few-shot prompting 'generally by over 25% and 65%' for GPT-3.5 and Llama2 respectively, but the '65%' figure for Llama2 cannot be verified — the best Llama2 improvement over fewshot in Table 1 is ~33pp (vanilla bootstrap×2 dev 37.3% vs fewshot 4.3%), not 65pp.",
     32         "source": "haiku"
     33       },
     34       "causal_claims_justified": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "Causal claims about compilation improving performance are supported through controlled comparisons across multiple compilation strategies (none, fewshot, bootstrap, bootstrap×2, ensemble) on fixed benchmarks with the same programs.",
     38         "source": "haiku"
     39       },
     40       "generalization_bounded": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The conclusion asserts DSPy has been applied to 'a large number of programs spanning tasks from information extraction to low-resource synthetic data generation,' but these results are not reported; formal evaluation covers only GSM8K and HotPotQA.",
     44         "source": "haiku"
     45       },
     46       "alternative_explanations_discussed": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "The paper does not address whether gains stem from the DSPy programming model itself or simply from having better bootstrapped few-shot demonstrations that any prompting framework could use; no comparison to standard few-shot with equivalently bootstrapped demonstrations is provided.",
     50         "source": "haiku"
     51       },
     52       "proxy_outcome_distinction": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Claims are framed in terms of exact match accuracy on the specific benchmarks tested; the paper does not conflate benchmark accuracy with broader productivity or real-world impact claims.",
     56         "source": "haiku"
     57       }
     58     },
     59     "limitations_and_scope": {
     60       "limitations_section_present": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion mentions leaving broader reporting to 'future work' but does not systematically acknowledge limitations of the present evaluation.",
     64         "source": "haiku"
     65       },
     66       "threats_to_validity_specific": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No specific threats to validity are discussed — evaluation scope (2 tasks, 2 LMs), potential benchmark contamination, variance across runs, and GPT-3.5 version instability are not addressed.",
     70         "source": "haiku"
     71       },
     72       "scope_boundaries_stated": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "The paper makes broad claims about DSPy as a general programming model without explicitly stating what the two-task, two-LM evaluation does not demonstrate.",
     76         "source": "haiku"
     77       }
     78     },
     79     "conflicts_of_interest": {
     80       "funding_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Funding from IBM/Stanford HAI, Oracle, Virtusa, Cigna Healthcare, HAI Azure compute grant, NSF CAREER grant CNS-1651570, and Apple Scholars fellowship is explicitly disclosed in the acknowledgments.",
     84         "source": "haiku"
     85       },
     86       "affiliations_disclosed": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "All author affiliations are disclosed on the title page, including Stanford, UC Berkeley, CMU, Amazon Alexa AI, IIT Bombay, Calera Capital, Microsoft, and Two Sigma.",
     90         "source": "haiku"
     91       },
     92       "funder_independent_of_outcome": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Primary funders (NSF, Stanford HAI, Azure compute) are independent of DSPy's commercial outcome; while Amazon Alexa AI is one author's affiliation, DSPy is not an Amazon product and the evaluation does not favor Amazon systems.",
     96         "source": "haiku"
     97       },
     98       "financial_interests_declared": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "No competing interests or financial interests statement is included; authors at industry affiliations (Amazon Alexa AI, Microsoft, Calera Capital) have undisclosed potential interests.",
    102         "source": "haiku"
    103       }
    104     },
    105     "scope_and_framing": {
    106       "key_terms_defined": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Core DSPy concepts (signature, module, teleprompter, compilation, text transformation graph) are defined precisely in Sections 3 and 4 with formal definitions and code examples.",
    110         "source": "haiku"
    111       },
    112       "intended_contribution_clear": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 1 explicitly states three contributions: signatures (abstracting prompts), modules (abstracting prompting techniques), and teleprompters (optimizing arbitrary pipelines), framed as the first programming model of this kind.",
    116         "source": "haiku"
    117       },
    118       "engagement_with_prior_work": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Section 2 engages substantively with deep learning frameworks (PyTorch, Theano), in-context learning literature, LangChain/LlamaIndex, and prompt optimization work; Appendix B provides a detailed quantitative comparison with LangChain.",
    122         "source": "haiku"
    123       }
    124     }
    125   },
    126   "type_checklist": {
    127     "empirical": {
    128       "artifacts": {
    129         "code_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "DSPy is released at https://github.com/stanfordnlp/dspy; the paper notes the open-source version has been maintained for 'close to a year' at time of writing.",
    133           "source": "haiku"
    134         },
    135         "data_released": {
    136           "applies": true,
    137           "answer": true,
    138           "justification": "Both evaluation benchmarks (GSM8K and HotPotQA) are standard public datasets; the Wikipedia 2017 abstracts dump and ColBERTv2 retriever used for HotPotQA are also publicly available.",
    139           "source": "haiku"
    140         },
    141         "environment_specified": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "No requirements.txt, Dockerfile, or explicit environment specification is provided in the paper; only high-level component names (ColBERTv2, specific LMs) are mentioned.",
    145           "source": "haiku"
    146         },
    147         "reproduction_instructions": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "The appendix provides pseudocode for teleprompters and sample generated prompts, but no step-by-step instructions for reproducing the reported GSM8K or HotPotQA experiments are given.",
    151           "source": "haiku"
    152         }
    153       },
    154       "statistical_methodology": {
    155         "confidence_intervals_or_error_bars": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "All results in Tables 1 and 2 are reported as point estimates with no confidence intervals or error bars, even for LabeledFewShot settings where 3-5 runs are averaged.",
    159           "source": "haiku"
    160         },
    161         "significance_tests": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No statistical significance tests are applied to any comparative claims; all comparisons between compilation strategies are informal.",
    165           "source": "haiku"
    166         },
    167         "effect_sizes_reported": {
    168           "applies": true,
    169           "answer": true,
    170           "justification": "Absolute accuracy figures and baseline comparisons are reported throughout (e.g., GSM8K GPT-3.5 vanilla 25.2% → CoT+ensemble 81.6% test), allowing readers to assess effect magnitudes.",
    171           "source": "haiku"
    172         },
    173         "sample_size_justified": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Training (200) and development (300) set sizes are stated but not statistically justified; no power analysis or discussion of sufficiency is provided.",
    177           "source": "haiku"
    178         },
    179         "variance_reported": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "For LabeledFewShot only, 'average of 3-5 runs' is mentioned, but no standard deviations or variance estimates are reported for any results in the tables.",
    183           "source": "haiku"
    184         }
    185       },
    186       "evaluation_design": {
    187         "baselines_included": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Tables 1 and 2 include the vanilla program under the 'none' (zero-shot, uncompiled) and fewshot compilation settings as baselines, as well as human-crafted demonstrations for selected settings.",
    191           "source": "haiku"
    192         },
    193         "baselines_contemporary": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Within-paper baselines use GPT-3.5 and Llama2-13b-chat (both current at publication); informal comparisons reference recent results from PaLM-540B, codex, and text-davinci-002.",
    197           "source": "haiku"
    198         },
    199         "ablation_study": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "The compilation strategy grid (none, fewshot, bootstrap, bootstrap×2, ensemble) effectively ablates the contribution of each optimization step for both tasks across both LMs.",
    203           "source": "haiku"
    204         },
    205         "multiple_metrics": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "HotPotQA results include both answer exact match (Ans) and pair-retrieval accuracy (Psg); GSM8K uses final numerical answer accuracy appropriate for the task.",
    209           "source": "haiku"
    210         },
    211         "human_evaluation": {
    212           "applies": false,
    213           "answer": false,
    214           "justification": "Human evaluation is not applicable; the paper evaluates LM pipeline performance on automated NLP benchmarks with well-defined ground-truth answers.",
    215           "source": "haiku"
    216         },
    217         "held_out_test_set": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "GSM8K reports results on the 1.3K official test set; HotPotQA uses the official validation set as test (the test set is hidden) with internal train/dev splits from the training data.",
    221           "source": "haiku"
    222         },
    223         "per_category_breakdown": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "No per-category, per-difficulty, or per-question-type breakdowns are provided; all results are aggregate accuracy across all examples.",
    227           "source": "haiku"
    228         },
    229         "failure_cases_discussed": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "No failure cases are shown or analyzed; the paper does not examine when or why DSPy compilation fails to improve performance.",
    233           "source": "haiku"
    234         },
    235         "negative_results_reported": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "ReAct with bootstrap (31.0% dev) underperforms fewshot+human demonstrations (33.0%) on HotPotQA for GPT-3.5; ensemble sometimes underperforms bootstrap (e.g., Table 1 vanilla ensemble 62.7% vs bootstrap×2 64.7% for GPT-3.5 dev).",
    239           "source": "haiku"
    240         }
    241       },
    242       "setup_transparency": {
    243         "model_versions_specified": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "GPT-3.5 is referenced without a specific API snapshot version or date; Llama2-13b-chat and T5-Large are specified by name and size but no checkpoint versions are given.",
    247           "source": "haiku"
    248         },
    249         "prompts_provided": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Appendix F shows the actual prompts automatically generated by DSPy for GSM8K and HotPotQA experiments; initial signatures like 'question -> answer' are shown throughout the paper.",
    253           "source": "haiku"
    254         },
    255         "hyperparameters_reported": {
    256           "applies": true,
    257           "answer": false,
    258           "justification": "Temperature for LM calls is described qualitatively ('high temperature' during bootstrapping) but not specified numerically; k=8 for fewshot is mentioned but other key hyperparameters (number of trials, max demonstrations) are only partially described.",
    259           "source": "haiku"
    260         },
    261         "scaffolding_described": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "The compilation pipeline (candidate generation, parameter optimization, higher-order optimization) is described in detail in Section 4, with pseudocode for BootstrapFewShot, BootstrapFewShotWithRandomSearch, and BootstrapFewShotWithOptuna in appendices.",
    265           "source": "haiku"
    266         },
    267         "data_preprocessing_documented": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Data sampling procedures are described (200 training / 300 dev for both tasks; 70/30 train/val for HotPotQA; 'hard' examples only for HotPotQA training); ColBERTv2 search over Wikipedia 2017 abstracts dump is specified.",
    271           "source": "haiku"
    272         }
    273       },
    274       "data_integrity": {
    275         "raw_data_available": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "Both benchmarks (GSM8K and HotPotQA) are publicly available with official train/test splits; the Wikipedia 2017 dump used for retrieval is also public.",
    279           "source": "haiku"
    280         },
    281         "data_collection_described": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Data sources and sampling procedures are described: official GSM8K and HotPotQA datasets with specified example counts and split criteria for each experimental condition.",
    285           "source": "haiku"
    286         },
    287         "recruitment_methods_described": {
    288           "applies": false,
    289           "answer": false,
    290           "justification": "No human participants — standard NLP benchmarks with existing annotations are used.",
    291           "source": "haiku"
    292         },
    293         "data_pipeline_documented": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "The pipeline from raw data to evaluation is described: benchmark sampling, ColBERTv2 retrieval indexing, compilation on training split, and evaluation on dev/test splits.",
    297           "source": "haiku"
    298         }
    299       },
    300       "contamination": {
    301         "training_cutoff_stated": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "Neither GPT-3.5's nor Llama2's training data cutoff is stated; the paper does not specify model snapshot dates used in experiments.",
    305           "source": "haiku"
    306         },
    307         "train_test_overlap_discussed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "The paper notes GPT-4 was 'pre-trained on a subset of GSM8K's training set' but does not address whether GPT-3.5 or Llama2 similarly saw GSM8K or HotPotQA data during pretraining.",
    311           "source": "haiku"
    312         },
    313         "benchmark_contamination_addressed": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "GSM8K (2021) and HotPotQA (2018) predate both models' training cutoffs; contamination of test examples into model training is not addressed despite the paper acknowledging this issue for GPT-4.",
    317           "source": "haiku"
    318         }
    319       },
    320       "human_studies": {
    321         "pre_registered": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "haiku"
    326         },
    327         "irb_or_ethics_approval": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "haiku"
    332         },
    333         "demographics_reported": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "haiku"
    338         },
    339         "inclusion_exclusion_criteria": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "haiku"
    344         },
    345         "randomization_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "haiku"
    350         },
    351         "blinding_described": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "haiku"
    356         },
    357         "attrition_reported": {
    358           "applies": false,
    359           "answer": false,
    360           "justification": "No human participants in this study.",
    361           "source": "haiku"
    362         }
    363       },
    364       "cost_and_practicality": {
    365         "inference_cost_reported": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "Compilation time is described qualitatively ('minutes to tens of minutes') and T5-Large is said to have 'orders of magnitude lower costs for inference,' but no specific figures (API calls, dollar costs, latency) are reported.",
    369           "source": "haiku"
    370         },
    371         "compute_budget_stated": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "The paper mentions compilation requires 'running the program a few thousand times,' but no specific GPU hours, API call counts, or dollar costs are stated.",
    375           "source": "haiku"
    376         }
    377       }
    378     }
    379   },
    380   "claims": [
    381     {
    382       "claim": "DSPy bootstrap compilation raises GPT-3.5 accuracy on GSM8K from ~25% (vanilla) to over 80%",
    383       "evidence": "Table 1: vanilla test 25.2%, CoT bootstrap test 72.9%, CoT+ensemble test 81.6%, reflection bootstrap test 76.0%",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Bootstrap compilation outperforms expert-written human reasoning chains for GPT-3.5 on GSM8K",
    388       "evidence": "Table 1: CoT bootstrap dev (80.3%) outperforms CoT with +human CoT dev (78.6%); bootstrap test 72.9% vs human CoT test 72.4%",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "Llama2-13b-chat compiled with DSPy is competitive with expert-prompt GPT-3.5 pipelines on both tasks",
    393       "evidence": "Table 1: Llama2 reflection+ensemble dev 46.9% vs GPT-3.5 CoT+human dev 78.6% — not close; Table 2: Llama2 multihop bootstrap dev 42.0% vs GPT-3.5 multihop bootstrap dev 48.7% — closer but not equal",
    394       "supported": "weak"
    395     },
    396     {
    397       "claim": "T5-Large (770M parameters) compiled via DSPy achieves 39.3% EM on HotPotQA using only 200 labeled inputs",
    398       "evidence": "Section 7: 'This program scores 39.3% answer EM and 46.0% passage accuracy on the dev set, using only 200 labeled inputs and 800 unlabeled questions'",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "DSPy eliminates the need for hand-crafted prompt strings without sacrificing performance relative to expert-engineered systems",
    403       "evidence": "Programs using only generic signatures and modules (no task-specific prompts) outperform zero-shot and few-shot baselines on GSM8K and HotPotQA; Appendix B shows DSPy has zero hand-written prompt demonstrations vs 50 long strings in LangChain",
    404       "supported": "moderate"
    405     },
    406     {
    407       "claim": "DSPy outperforms standard few-shot prompting generally by over 25% for GPT-3.5 and 65% for Llama2",
    408       "evidence": "Tables 1-2 show improvements exceeding 25pp for GPT-3.5 in several settings (e.g., vanilla bootstrap×2 dev +40.7pp), but no Llama2 improvement over fewshot reaches 65pp — best is ~33pp",
    409       "supported": "weak"
    410     }
    411   ],
    412   "methodology_tags": [
    413     "benchmark-eval",
    414     "case-study"
    415   ],
    416   "key_findings": "DSPy introduces a programming model replacing hand-crafted LM prompt templates with parameterized declarative modules compiled by 'teleprompters' that automatically bootstrap few-shot demonstrations through a rejection-sampling-like process. On GSM8K and HotPotQA, compiled DSPy programs consistently outperform zero-shot and few-shot baselines, with GPT-3.5 reaching 88.3% dev / 81.6% test accuracy on GSM8K (from 25.2% vanilla test) and 54.7% dev on HotPotQA. A T5-Large (770M parameters) fine-tuned via DSPy achieves 39.3% EM on HotPotQA using only 200 labeled examples, demonstrating that systematic compilation can enable small models to approach larger model performance. However, the evaluation scope is limited to two tasks and two LMs, no statistical tests are performed, and the headline '65% improvement' claim for Llama2 is not clearly supported by the reported tables.",
    417   "red_flags": [
    418     {
    419       "flag": "Unsupported 65% claim",
    420       "detail": "The abstract claims DSPy outperforms standard few-shot prompting 'generally by over 25% and 65%' for GPT-3.5 and Llama2-13b-chat respectively, but no Llama2 result in Tables 1-2 reaches the 65% threshold — the best identifiable improvement is ~33pp."
    421     },
    422     {
    423       "flag": "No variance or error bars",
    424       "detail": "All main results are point estimates; standard deviations are absent despite some settings averaging 3-5 runs, making reliability of small differences unclear."
    425     },
    426     {
    427       "flag": "No statistical significance tests",
    428       "detail": "All comparative claims are made without significance testing; improvements of a few percentage points over baselines may not be statistically meaningful."
    429     },
    430     {
    431       "flag": "GPT-3.5 version unspecified",
    432       "detail": "GPT-3.5 is used without a specific model snapshot or API version date, making results potentially irreproducible as the underlying model changes over time."
    433     },
    434     {
    435       "flag": "No limitations section",
    436       "detail": "The paper has no dedicated limitations or threats-to-validity section despite evaluating on only 2 tasks with 2 LMs and making broad claims about a general programming model."
    437     },
    438     {
    439       "flag": "Contamination unaddressed",
    440       "detail": "GSM8K (2021) and HotPotQA (2018) predate LLM training cutoffs; the paper explicitly acknowledges GPT-4's GSM8K contamination but ignores the same issue for GPT-3.5 and Llama2."
    441     },
    442     {
    443       "flag": "Overgeneralized conclusions",
    444       "detail": "The conclusion claims DSPy has been applied to tasks 'from information extraction to low-resource synthetic data generation' but explicitly defers reporting these under 'controlled experimental conditions to future work.'"
    445     }
    446   ],
    447   "cited_papers": [
    448     {
    449       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    450       "relevance": "Core prompting technique abstracted into DSPy's ChainOfThought module; motivates the need for parameterized, compilable versions of prompting techniques"
    451     },
    452     {
    453       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    454       "relevance": "Agent prompting technique implemented as a built-in DSPy module and evaluated on HotPotQA as a baseline"
    455     },
    456     {
    457       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    458       "relevance": "Primary evaluation benchmark for the math reasoning case study"
    459     },
    460     {
    461       "title": "HotPotQA: A Dataset for Diverse, Explainable Multi-Hop Question Answering",
    462       "relevance": "Primary evaluation benchmark for the multi-hop QA case study"
    463     },
    464     {
    465       "title": "PyTorch: An Imperative Style, High-Performance Deep Learning Library",
    466       "relevance": "Direct inspiration for DSPy's define-by-run computation graph abstraction and parameterized module design"
    467     },
    468     {
    469       "title": "Demonstrate-Search-Predict: Composing Retrieval and Language Models for Knowledge-Intensive NLP",
    470       "relevance": "Direct predecessor framework to DSPy from the same research group; DSPy is introduced as its second iteration"
    471     },
    472     {
    473       "title": "ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction",
    474       "relevance": "Retrieval model used as the backbone for all HotPotQA experiments"
    475     },
    476     {
    477       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    478       "relevance": "Related prompting technique compared against in HotPotQA evaluation; DSPy's ensemble approach generalizes this idea"
    479     },
    480     {
    481       "title": "Connecting Large Language Models with Evolutionary Algorithms Yields Powerful Prompt Optimizers",
    482       "relevance": "Related single-step prompt optimization work that DSPy generalizes to multi-stage arbitrary pipelines"
    483     },
    484     {
    485       "title": "Decomposed Prompting: A Modular Approach for Solving Complex Tasks",
    486       "relevance": "Related modular prompting approach; DSPy claims to generalize its ideas through parameterized compilable modules"
    487     }
    488   ],
    489   "engagement_factors": {
    490     "practical_relevance": {
    491       "score": 3,
    492       "justification": "DSPy is a publicly released, pip-installable framework that eliminates the most painful part of LM system development (manual prompt engineering), with immediate use cases and a 1-year open-source track record."
    493     },
    494     "surprise_contrarian": {
    495       "score": 2,
    496       "justification": "Challenges the prevailing convention that effective LM pipelines require careful manual prompt engineering, demonstrating that automated compilation can match or exceed expert-crafted prompts."
    497     },
    498     "fear_safety": {
    499       "score": 0,
    500       "justification": "No AI safety, risk, or harm concerns are raised; the paper is entirely focused on engineering methodology for LM pipelines."
    501     },
    502     "drama_conflict": {
    503       "score": 1,
    504       "justification": "Appendix B implicitly critiques LangChain and LlamaIndex by quantifying their reliance on hard-coded prompts (50 strings over 1000 chars) vs. DSPy's zero, creating mild competitive framing."
    505     },
    506     "demo_ability": {
    507       "score": 3,
    508       "justification": "DSPy is available on GitHub, installable, and the paper includes 1-3 line runnable code snippets that produce working RAG and multi-hop systems anyone can reproduce immediately."
    509     },
    510     "brand_recognition": {
    511       "score": 3,
    512       "justification": "Stanford NLP (Khattab, Potts) and Berkeley (Zaharia, Singhvi) are highly recognized institutions; DSPy has since become one of the most widely cited LM programming frameworks."
    513     }
    514   },
    515   "hn_data": {
    516     "threads": [
    517       {
    518         "hn_id": "42168997",
    519         "title": "It's time to replace TCP in the datacenter (2023)",
    520         "points": 189,
    521         "comments": 156,
    522         "url": "https://news.ycombinator.com/item?id=42168997",
    523         "created_at": "2024-11-18T01:42:41Z"
    524       },
    525       {
    526         "hn_id": "34337707",
    527         "title": "“A Handbook of Integer Sequences” Fifty Years Later",
    528         "points": 139,
    529         "comments": 45,
    530         "url": "https://news.ycombinator.com/item?id=34337707",
    531         "created_at": "2023-01-11T12:37:58Z"
    532       },
    533       {
    534         "hn_id": "33088928",
    535         "title": "It's time to replace TCP in the Datacenter",
    536         "points": 6,
    537         "comments": 1,
    538         "url": "https://news.ycombinator.com/item?id=33088928",
    539         "created_at": "2022-10-04T23:56:57Z"
    540       },
    541       {
    542         "hn_id": "37805651",
    543         "title": "Agent Instructs Large Language Models to Be General Zero-Shot Reasoners",
    544         "points": 5,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=37805651",
    547         "created_at": "2023-10-07T21:17:40Z"
    548       },
    549       {
    550         "hn_id": "38561645",
    551         "title": "Relightable Gaussian Codec Avatars",
    552         "points": 4,
    553         "comments": 0,
    554         "url": "https://news.ycombinator.com/item?id=38561645",
    555         "created_at": "2023-12-07T20:50:41Z"
    556       },
    557       {
    558         "hn_id": "33151628",
    559         "title": "Integration of Skyline Queries into Spark SQL",
    560         "points": 3,
    561         "comments": 1,
    562         "url": "https://news.ycombinator.com/item?id=33151628",
    563         "created_at": "2022-10-10T14:10:30Z"
    564       },
    565       {
    566         "hn_id": "24766804",
    567         "title": "Abductive Knowledge Induction from Raw Data",
    568         "points": 3,
    569         "comments": 0,
    570         "url": "https://news.ycombinator.com/item?id=24766804",
    571         "created_at": "2020-10-13T15:59:02Z"
    572       },
    573       {
    574         "hn_id": "41820840",
    575         "title": "DSPy: Compiling Declarative Language Model Calls into Self-Improving Pipelines",
    576         "points": 2,
    577         "comments": 0,
    578         "url": "https://news.ycombinator.com/item?id=41820840",
    579         "created_at": "2024-10-12T17:30:26Z"
    580       },
    581       {
    582         "hn_id": "37776712",
    583         "title": "Large Language Models as Analogical Reasoners",
    584         "points": 2,
    585         "comments": 1,
    586         "url": "https://news.ycombinator.com/item?id=37776712",
    587         "created_at": "2023-10-05T10:04:39Z"
    588       },
    589       {
    590         "hn_id": "34364348",
    591         "title": "Exoshuffle-CloudSort",
    592         "points": 2,
    593         "comments": 1,
    594         "url": "https://news.ycombinator.com/item?id=34364348",
    595         "created_at": "2023-01-13T05:40:48Z"
    596       }
    597     ],
    598     "top_points": 189,
    599     "total_points": 355,
    600     "total_comments": 205
    601   }
    602 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs