scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27027B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Demonstrate-Search-Predict: Composing retrieval and language models for knowledge-intensive NLP",
      6     "authors": [
      7       "O. Khattab",
      8       "Keshav Santhanam",
      9       "Xiang Lisa Li",
     10       "David Hall",
     11       "Percy Liang",
     12       "Christopher Potts",
     13       "Matei Zaharia"
     14     ],
     15     "year": 2022,
     16     "venue": "arXiv.org",
     17     "arxiv_id": "2212.14024",
     18     "doi": "10.48550/arXiv.2212.14024"
     19   },
     20   "checklist": {
     21     "claims_and_evidence": {
     22       "abstract_claims_supported": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The reported gain ranges (37-120% vs vanilla LM, 8-39% vs retrieve-then-read, 80-290% vs self-ask) are verifiable from Table 1, though the ranges are selectively drawn across metrics and tasks.",
     26         "source": "haiku"
     27       },
     28       "causal_claims_justified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper attributes gains causally to DSP components (DEMONSTRATE, SEARCH, PREDICT) but conducts no systematic ablation isolating individual components' contributions.",
     32         "source": "haiku"
     33       },
     34       "generalization_bounded": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The title and framing claim DSP addresses 'knowledge-intensive NLP' broadly, but evaluation is limited to three QA datasets using one LM (GPT-3.5) and one RM (ColBERTv2).",
     38         "source": "haiku"
     39       },
     40       "alternative_explanations_discussed": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper does not discuss alternative explanations for gains, such as increased computational budget (more LM API calls per query), corpus alignment advantages, or prompt engineering effects independent of the DSP abstraction.",
     44         "source": "haiku"
     45       },
     46       "proxy_outcome_distinction": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper measures EM and F1 on QA benchmarks and claims improvements in QA performance; the metrics align with stated claims without overreaching to broader capabilities.",
     50         "source": "haiku"
     51       }
     52     },
     53     "limitations_and_scope": {
     54       "limitations_section_present": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion briefly promises 'additional test tasks and LM choices' in future work, which does not constitute a limitations section.",
     58         "source": "haiku"
     59       },
     60       "threats_to_validity_specific": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper briefly acknowledges that comparisons with concurrent work are 'not generally apples-to-apples,' but enumerates no specific threats to the validity of its own results.",
     64         "source": "haiku"
     65       },
     66       "scope_boundaries_stated": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "The paper does not explicitly state what the results do NOT show; future extensions are promised rather than current scope being bounded.",
     70         "source": "haiku"
     71       }
     72     },
     73     "conflicts_of_interest": {
     74       "funding_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Funding is disclosed from IBM, Stanford HAI affiliates (Ant Financial, Facebook, Google, VMware), Cisco, SAP, and NSF grant CNS-1651570.",
     78         "source": "haiku"
     79       },
     80       "affiliations_disclosed": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "All seven authors are affiliated with Stanford University, clearly disclosed on the title page.",
     84         "source": "haiku"
     85       },
     86       "funder_independent_of_outcome": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "ColBERTv2, the retrieval model central to every experiment, was developed by the same Stanford group — Khattab, Santhanam, and Zaharia co-author both this paper and ColBERTv2, creating a direct conflict of interest.",
     90         "source": "haiku"
     91       },
     92       "financial_interests_declared": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) is present in the paper.",
     96         "source": "haiku"
     97       }
     98     },
     99     "scope_and_framing": {
    100       "key_terms_defined": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Key terms including 'in-context learning,' 'retrieval-augmented,' 'frozen LM/RM,' and each DSP stage (DEMONSTRATE, SEARCH, PREDICT) are defined precisely in §2.",
    104         "source": "haiku"
    105       },
    106       "intended_contribution_clear": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper explicitly enumerates four contributions: arguing for task-aware strategies, showing they can be expressed as short programs, demonstrating the power of composability, and establishing state-of-the-art in-context learning results on three tasks.",
    110         "source": "haiku"
    111       },
    112       "engagement_with_prior_work": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper extensively engages with prior work on retrieval-augmented NLP, multi-hop QA, and bootstrapping methods throughout the text, situating DSP relative to specific systems rather than merely listing citations.",
    116         "source": "haiku"
    117       }
    118     }
    119   },
    120   "type_checklist": {
    121     "empirical": {
    122       "artifacts": {
    123         "code_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "Code is released at https://github.com/stanfordnlp/dsp as stated in the abstract.",
    127           "source": "haiku"
    128         },
    129         "data_released": {
    130           "applies": true,
    131           "answer": true,
    132           "justification": "All three evaluation datasets (Open-SQuAD, HotPotQA, QReCC) are standard publicly available benchmarks used unmodified.",
    133           "source": "haiku"
    134         },
    135         "environment_specified": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No requirements.txt, Dockerfile, or specific dependency specifications are provided; the implementation language (Python) is implied by code snippets but no environment is specified.",
    139           "source": "haiku"
    140         },
    141         "reproduction_instructions": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "Code examples are provided as pseudocode in the paper, but step-by-step instructions for reproducing the exact experimental results (indices, seeds, scripts) are not included.",
    145           "source": "haiku"
    146         }
    147       },
    148       "statistical_methodology": {
    149         "confidence_intervals_or_error_bars": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "Table 1 reports single values with no confidence intervals or error bars, despite results being averages over five seeds.",
    153           "source": "haiku"
    154         },
    155         "significance_tests": {
    156           "applies": true,
    157           "answer": false,
    158           "justification": "No statistical significance tests are performed for any of the comparative claims made throughout the paper.",
    159           "source": "haiku"
    160         },
    161         "effect_sizes_reported": {
    162           "applies": true,
    163           "answer": true,
    164           "justification": "Relative percentage gains are reported (37-120% vs vanilla LM, etc.) with baseline context provided, which constitutes effect size reporting.",
    165           "source": "haiku"
    166         },
    167         "sample_size_justified": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "The choice of 1000 questions (or 400 conversations) subsampled across 5 seeds is stated but not justified or supported by power analysis.",
    171           "source": "haiku"
    172         },
    173         "variance_reported": {
    174           "applies": true,
    175           "answer": false,
    176           "justification": "Only mean values across five seeds appear in Table 1; no standard deviations or variance measures are reported.",
    177           "source": "haiku"
    178         }
    179       },
    180       "evaluation_design": {
    181         "baselines_included": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Three baselines are compared: vanilla LM, retrieve-then-read, and self-ask, plus SoTA results from concurrent work cited from related papers.",
    185           "source": "haiku"
    186         },
    187         "baselines_contemporary": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Baselines include contemporaneous systems (self-ask from Press et al. 2022, Si et al. 2022, Yao et al. 2022 ReAct) collected as of mid-December 2022.",
    191           "source": "haiku"
    192         },
    193         "ablation_study": {
    194           "applies": true,
    195           "answer": false,
    196           "justification": "No systematic ablation study is conducted to isolate the contribution of individual DSP components (DEMONSTRATE vs. SEARCH vs. PREDICT); only full-system comparisons are provided.",
    197           "source": "haiku"
    198         },
    199         "multiple_metrics": {
    200           "applies": true,
    201           "answer": true,
    202           "justification": "Both EM and F1 are reported for Open-SQuAD and HotPotQA; F1 and novel-F1 (nF1) are reported for QReCC.",
    203           "source": "haiku"
    204         },
    205         "human_evaluation": {
    206           "applies": false,
    207           "answer": false,
    208           "justification": "Human evaluation is not applicable; the paper evaluates on automatic QA metrics (EM, F1) on standard benchmarks.",
    209           "source": "haiku"
    210         },
    211         "held_out_test_set": {
    212           "applies": true,
    213           "answer": false,
    214           "justification": "The paper explicitly states 'we report the validation set accuracy on all three datasets'; test set evaluation is deferred to 'a future version of this report.'",
    215           "source": "haiku"
    216         },
    217         "per_category_breakdown": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "Results are broken down across three task types (open-domain QA, multi-hop QA, conversational QA), providing per-task performance analysis.",
    221           "source": "haiku"
    222         },
    223         "failure_cases_discussed": {
    224           "applies": true,
    225           "answer": false,
    226           "justification": "The paper discusses self-ask's 'self-distraction' failure mode with one example, but does not systematically discuss DSP's own failure cases or error analysis.",
    227           "source": "haiku"
    228         },
    229         "negative_results_reported": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "The paper does not report negative results about DSP programs; all DSP results are presented positively relative to baselines, with no cases where DSP fails or underperforms reported.",
    233           "source": "haiku"
    234         }
    235       },
    236       "setup_transparency": {
    237         "model_versions_specified": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "GPT-3.5 is identified as text-davinci-002 and ColBERTv2 is specified with its source paper; Wikipedia corpus versions are dated (Dec 2016, Nov 2017, Dec 2018 dumps).",
    241           "source": "haiku"
    242         },
    243         "prompts_provided": {
    244           "applies": true,
    245           "answer": false,
    246           "justification": "The paper shows prompt structures with placeholders (e.g., '{Task demonstrations from x.demos, if any}') but does not provide the actual verbatim prompt templates used in experiments.",
    247           "source": "haiku"
    248         },
    249         "hyperparameters_reported": {
    250           "applies": true,
    251           "answer": true,
    252           "justification": "Key hyperparameters are reported: temperature t=0.7 for n>1, greedy decoding for n=1, k=7 passages for open-domain QA, n=20 candidates for self-consistency, k=5 for multi-hop retrieval.",
    253           "source": "haiku"
    254         },
    255         "scaffolding_described": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "The DSP framework scaffolding is described in detail with code snippets showing DEMONSTRATE, SEARCH, and PREDICT stages, their APIs, and interactions.",
    259           "source": "haiku"
    260         },
    261         "data_preprocessing_documented": {
    262           "applies": true,
    263           "answer": true,
    264           "justification": "Preprocessing is documented: specific Wikipedia corpus dates are used, QReCC filtering criteria are stated (removing empty answers, short conversations, 'other interesting' keyword conversations), and HotPotQA 'hard' example filtering is described.",
    265           "source": "haiku"
    266         }
    267       },
    268       "data_integrity": {
    269         "raw_data_available": {
    270           "applies": true,
    271           "answer": false,
    272           "justification": "Raw experimental outputs (model responses, retrieved passages, intermediate predictions across seeds) are not released; only the DSP code library is made available.",
    273           "source": "haiku"
    274         },
    275         "data_collection_described": {
    276           "applies": true,
    277           "answer": true,
    278           "justification": "The paper describes which datasets and Wikipedia corpora are used with specific splits, corpus dates, and the 5-seed, 200-questions-per-seed evaluation protocol.",
    279           "source": "haiku"
    280         },
    281         "recruitment_methods_described": {
    282           "applies": false,
    283           "answer": false,
    284           "justification": "No participant recruitment is applicable; the paper uses standard public benchmarks.",
    285           "source": "haiku"
    286         },
    287         "data_pipeline_documented": {
    288           "applies": true,
    289           "answer": true,
    290           "justification": "The pipeline from input question to final answer is documented through code examples, and the evaluation protocol (5 seeds, 200 questions per seed, averaging) is described.",
    291           "source": "haiku"
    292         }
    293       },
    294       "contamination": {
    295         "training_cutoff_stated": {
    296           "applies": true,
    297           "answer": false,
    298           "justification": "The training data cutoff for GPT-3.5 (text-davinci-002) is not stated, despite evaluating on benchmarks like SQuAD (2016) and HotPotQA (2018) that predate the model.",
    299           "source": "haiku"
    300         },
    301         "train_test_overlap_discussed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "Potential contamination of GPT-3.5's training data with evaluation benchmark content (SQuAD, HotPotQA, QReCC) is not discussed anywhere in the paper.",
    305           "source": "haiku"
    306         },
    307         "benchmark_contamination_addressed": {
    308           "applies": true,
    309           "answer": false,
    310           "justification": "The evaluation benchmarks (SQuAD 2016, HotPotQA 2018) predate GPT-3.5's likely training cutoff and probably appear in its training data, but this is not acknowledged or addressed.",
    311           "source": "haiku"
    312         }
    313       },
    314       "human_studies": {
    315         "pre_registered": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "haiku"
    320         },
    321         "irb_or_ethics_approval": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "haiku"
    326         },
    327         "demographics_reported": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "haiku"
    332         },
    333         "inclusion_exclusion_criteria": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "haiku"
    338         },
    339         "randomization_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "haiku"
    344         },
    345         "blinding_described": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "haiku"
    350         },
    351         "attrition_reported": {
    352           "applies": false,
    353           "answer": false,
    354           "justification": "No human participants in this study.",
    355           "source": "haiku"
    356         }
    357       },
    358       "cost_and_practicality": {
    359         "inference_cost_reported": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "The paper mentions controlling 'language model API spending budget' but reports no actual inference costs, API call counts, or latency figures.",
    363           "source": "haiku"
    364         },
    365         "compute_budget_stated": {
    366           "applies": true,
    367           "answer": false,
    368           "justification": "No total computational budget (compute hours, API costs, number of API calls) is reported.",
    369           "source": "haiku"
    370         }
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "DSP programs achieve 37-120% relative gains over vanilla LM baselines across three knowledge-intensive QA tasks",
    377       "evidence": "Table 1: DSP at 36.6/49.0 vs 16.2/25.6 EM/F1 on Open-SQuAD; 51.4/62.9 vs 28.3/36.4 EM/F1 on HotPotQA; 35.0/25.3 vs 29.8/18.4 F1/nF1 on QReCC",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "DSP programs achieve 8-39% relative gains over retrieve-then-read pipelines",
    382       "evidence": "Table 1: DSP outperforms retrieve-then-read on all three tasks (36.6 vs 33.8 EM on SQuAD; 51.4 vs 36.9 EM on HotPotQA; 35.0 vs 31.6 F1 on QReCC)",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "DSP programs achieve 80-290% relative gains over the self-ask pipeline",
    387       "evidence": "Table 1: self-ask achieves 9.3/17.2 EM/F1 on Open-SQuAD (vs DSP 36.6/49.0) and 28.6/37.3 on HotPotQA (vs DSP 51.4/62.9)",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "DSP achieves state-of-the-art in-context learning on HotPotQA at 51.4% EM as of December 2022",
    392       "evidence": "Table 1 comparison with concurrent work: Wang et al. 33.8%, Sun et al. 26.5%, Yao et al. (ReAct) 35.1% — all below DSP's 51.4%",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "DSP matches competitive fine-tuned systems on Open-SQuAD without any fine-tuning",
    397       "evidence": "DSP achieves 36.6% EM vs DPR 29.8% and FiD-base ~36% (with 5 passages); however FiD with 100 passages reaches 48%, substantially higher",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "DEMONSTRATE stage enables automatic bootstrapping of pipeline annotations from end-task labels without labeling intermediate steps",
    402       "evidence": "The annotate function is described with code but no ablation is conducted to show contribution vs. using fixed hand-written demonstrations",
    403       "supported": "weak"
    404     }
    405   ],
    406   "methodology_tags": [
    407     "benchmark-eval"
    408   ],
    409   "key_findings": "The DSP framework for retrieval-augmented in-context learning substantially outperforms vanilla LMs and retrieve-then-read pipelines on three QA benchmarks using frozen GPT-3.5 and ColBERTv2, achieving 51.4% EM on HotPotQA versus 35.1% for the next-best contemporary approach. The key innovation is composable programmatic pipelines that bootstrap demonstrations automatically, execute multi-hop retrieval, and aggregate evidence across retrieved passages. However, all results are on validation sets only, no ablations isolate component contributions, no variance is reported across the five evaluation seeds, and the most dramatic gain claims (80-290% over self-ask) rely on a baseline that performs surprisingly poorly even at its intended task.",
    410   "red_flags": [
    411     {
    412       "flag": "Validation-only evaluation",
    413       "detail": "All results are on validation sets; test set evaluation is explicitly deferred to 'a future version of this report,' creating risk of implicit overfitting during program development."
    414     },
    415     {
    416       "flag": "Self-evaluation of own retrieval system",
    417       "detail": "ColBERTv2, the only RM used throughout all experiments, is co-authored by Khattab, Santhanam, and Zaharia — the same team — creating a direct conflict of interest that the paper does not disclose as such."
    418     },
    419     {
    420       "flag": "No ablation study",
    421       "detail": "Three interacting components are proposed (DEMONSTRATE, SEARCH, PREDICT) but are never ablated individually; gains cannot be attributed to any specific innovation."
    422     },
    423     {
    424       "flag": "No variance or significance testing",
    425       "detail": "Results averaged over five seeds are reported as single-point numbers with no standard deviations, confidence intervals, or significance tests despite the stochastic nature of sampling-based generation."
    426     },
    427     {
    428       "flag": "Weak self-ask baseline inflating headline gains",
    429       "detail": "Self-ask achieves only 9.3% EM on Open-SQuAD — below both the vanilla LM (16.2%) and retrieve-then-read (33.8%) — suggesting it is not a valid baseline for that task, making the 80-290% relative gain claim misleading."
    430     },
    431     {
    432       "flag": "Benchmark contamination unaddressed",
    433       "detail": "SQuAD (2016) and HotPotQA (2018) predate GPT-3.5's likely training cutoff and may appear in its training data; no contamination analysis or acknowledgment is provided."
    434     }
    435   ],
    436   "cited_papers": [
    437     {
    438       "title": "Language Models are Few-Shot Learners",
    439       "relevance": "Foundational GPT-3 paper establishing in-context learning paradigm that DSP extends with retrieval and programmatic pipelines"
    440     },
    441     {
    442       "title": "ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction",
    443       "relevance": "The retrieval model used throughout all DSP experiments; co-authored by the same team"
    444     },
    445     {
    446       "title": "Measuring and Narrowing the Compositionality Gap in Language Models (self-ask)",
    447       "relevance": "Primary contemporaneous baseline that DSP outperforms significantly; motivates the multi-hop retrieval design"
    448     },
    449     {
    450       "title": "Self-consistency improves chain of thought reasoning in language models",
    451       "relevance": "Self-consistency technique directly incorporated into DSP's PREDICT stage for answer selection"
    452     },
    453     {
    454       "title": "Chain of thought prompting elicits reasoning in large language models",
    455       "relevance": "Chain-of-thought reasoning integrated into DSP's PREDICT stage prompts"
    456     },
    457     {
    458       "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks",
    459       "relevance": "Foundational RAG paper that DSP extends with more sophisticated programmatic LM-RM interaction"
    460     },
    461     {
    462       "title": "STaR: Bootstrapping Reasoning with Reasoning",
    463       "relevance": "Related bootstrapping approach that DSP generalizes to complex multi-stage pipelines"
    464     },
    465     {
    466       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    467       "relevance": "Contemporaneous approach combining LM reasoning with retrieval actions; cited as concurrent SoTA on HotPotQA"
    468     },
    469     {
    470       "title": "Baleen: Robust Multi-Hop Reasoning at Scale via Condensed Retrieval",
    471       "relevance": "Prior fine-tuned multi-hop system by the same first author that DSP aims to replicate with in-context learning"
    472     }
    473   ],
    474   "engagement_factors": {
    475     "practical_relevance": {
    476       "score": 3,
    477       "justification": "DSP directly evolved into the DSPy library, now widely used by practitioners for building and auto-optimizing LM pipelines."
    478     },
    479     "surprise_contrarian": {
    480       "score": 2,
    481       "justification": "Challenges the dominant 'retrieve-then-read' paradigm by showing that programmatic LM-RM coordination with bootstrapped demonstrations substantially outperforms simple retrieval augmentation."
    482     },
    483     "fear_safety": {
    484       "score": 0,
    485       "justification": "No AI safety or risk concerns are raised; the paper focuses purely on QA accuracy improvements."
    486     },
    487     "drama_conflict": {
    488       "score": 1,
    489       "justification": "Directly positions itself against self-ask, arguing DSP's modularity avoids the 'self-distraction' problem of LM-controlled pipelines."
    490     },
    491     "demo_ability": {
    492       "score": 3,
    493       "justification": "Code released at github.com/stanfordnlp/dsp with clear API examples and code snippets in the paper; practitioners can immediately build their own DSP programs."
    494     },
    495     "brand_recognition": {
    496       "score": 3,
    497       "justification": "Stanford DAWN project; authors include Percy Liang and Matei Zaharia (prominent in ML and systems communities) and Omar Khattab (ColBERT, DSPy author)."
    498     }
    499   },
    500   "hn_data": {
    501     "threads": [
    502       {
    503         "hn_id": "34178437",
    504         "title": "Cramming: Training a Language Model on a Single GPU in One Day",
    505         "points": 6,
    506         "comments": 0,
    507         "url": "https://news.ycombinator.com/item?id=34178437",
    508         "created_at": "2022-12-29T21:44:44Z"
    509       },
    510       {
    511         "hn_id": "42209577",
    512         "title": "Cramming: Training a Language Model on a Single GPU in One Day",
    513         "points": 3,
    514         "comments": 0,
    515         "url": "https://news.ycombinator.com/item?id=42209577",
    516         "created_at": "2024-11-21T23:01:03Z"
    517       },
    518       {
    519         "hn_id": "34232125",
    520         "title": "Cramming: Training a Language Model on a Single GPU in One Day",
    521         "points": 3,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=34232125",
    524         "created_at": "2023-01-03T14:57:13Z"
    525       },
    526       {
    527         "hn_id": "34570488",
    528         "title": "Training a Language Model on a Single GPU in One Day",
    529         "points": 2,
    530         "comments": 1,
    531         "url": "https://news.ycombinator.com/item?id=34570488",
    532         "created_at": "2023-01-29T17:40:13Z"
    533       },
    534       {
    535         "hn_id": "39968113",
    536         "title": "Cramming: Training a Language Model on a Single GPU in One Day (2022)",
    537         "points": 2,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=39968113",
    540         "created_at": "2024-04-08T10:09:08Z"
    541       },
    542       {
    543         "hn_id": "34338363",
    544         "title": "Cramming: Training a Language Model on a Single GPU in One Day",
    545         "points": 2,
    546         "comments": 0,
    547         "url": "https://news.ycombinator.com/item?id=34338363",
    548         "created_at": "2023-01-11T13:47:58Z"
    549       },
    550       {
    551         "hn_id": "42656632",
    552         "title": "Show HN: We collected detailed annotations for text-to-image generation",
    553         "points": 2,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=42656632",
    556         "created_at": "2025-01-10T15:47:29Z"
    557       },
    558       {
    559         "hn_id": "33522332",
    560         "title": "Championship Simulator: Architectural Simulation for Education and Competition",
    561         "points": 1,
    562         "comments": 0,
    563         "url": "https://news.ycombinator.com/item?id=33522332",
    564         "created_at": "2022-11-08T18:21:42Z"
    565       }
    566     ],
    567     "top_points": 6,
    568     "total_points": 21,
    569     "total_comments": 1
    570   }
    571 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs