scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (28003B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence",
      6     "authors": [
      7       "DeepSeek-AI"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv",
     11     "arxiv_id": "2406.11931",
     12     "doi": null
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The abstract claims 'superior performance compared to closed-source models such as GPT4-Turbo' but results are mixed: DeepSeek-Coder-V2 scores below GPT-4-Turbo-0409 on LiveCodeBench (43.4% vs 45.7%) and SWE-Bench (12.7% vs 18.3%), while exceeding it on HumanEval and MBPP.",
     20         "source": "haiku"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Table 1 ablates the new code corpus against DeepSeek-Coder's corpus using a controlled 1B model; Figure 3 compares reward model signal vs raw compiler signal in RL training. These ablations partially justify the causal claims made.",
     26         "source": "haiku"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The title claims 'Breaking the Barrier of Closed-Source Models in Code Intelligence' — a broad generalization. Results show competitive performance on specific benchmarks but the paper does not qualify claims to tested settings or note the benchmarks where closed-source models still lead.",
     32         "source": "haiku"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No alternative explanations are considered for the performance improvements — whether gains are primarily from scale, the new data corpus, the DeepSeek-V2 initialization, or the RL phase is not systematically disentangled.",
     38         "source": "haiku"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "Benchmark scores (HumanEval, MBPP) are used to claim code intelligence parity with closed-source models without discussing the gap between these proxies and real-world coding ability.",
     44         "source": "haiku"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No dedicated limitations section. The only limitation mentioned is a single sentence in the conclusion about an 'instruction-following gap compared to GPT-4 Turbo'.",
     52         "source": "haiku"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No threats-to-validity section. Contamination is discussed for specific benchmarks but other threats (self-evaluation bias, benchmark saturation, single-run results without variance) are not addressed.",
     58         "source": "haiku"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No explicit scope boundaries are stated. The paper does not specify what tasks or domains the results do not apply to.",
     64         "source": "haiku"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding disclosure anywhere in the paper. This is a company paper from DeepSeek-AI but no funding source or financial support is mentioned.",
     72         "source": "haiku"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "All authors are listed as DeepSeek-AI employees and the affiliation is clearly stated on the paper.",
     78         "source": "haiku"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "All authors are DeepSeek-AI employees evaluating their own model. There is no independent funder or third-party evaluator.",
     84         "source": "haiku"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests or financial interests statement is included anywhere in the paper.",
     90         "source": "haiku"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "'Code intelligence' is used in the title and throughout without definition. 'Performance comparable' is not quantified with specific thresholds.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 1.1 explicitly lists three contributions: the 16B/236B MoE models, the first open-source hundred-billion-parameter code model matching closed-source frontier, and public release under permissive license.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper directly compares against StarCoder, CodeLlama, DeepSeek-Coder, Codestral, GPT-4, Claude 3, and Gemini 1.5, explaining how each relates to DeepSeek-Coder-V2 in approach and performance.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "The GitHub repository https://github.com/deepseek-ai/DeepSeek-Coder-V2 is linked in the abstract, and the paper states models are released under a permissive license for research and commercial use.",
    121           "source": "haiku"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": true,
    126           "justification": "All evaluation benchmarks used (HumanEval, MBPP, LiveCodeBench, SWE-Bench, CruxEval, GSM8K, MATH) are standard publicly available benchmarks used unmodified.",
    127           "source": "haiku"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "No requirements.txt, Dockerfile, or equivalent environment specification is provided in the paper. Hardware and software environment for training or evaluation are not specified.",
    133           "source": "haiku"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No step-by-step reproduction instructions are provided in the paper for reproducing training or evaluation results.",
    139           "source": "haiku"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "All results are reported as single-point estimates with no confidence intervals or error bars anywhere in the paper.",
    147           "source": "haiku"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "No statistical significance tests are performed for any comparative claims, despite numerous model comparisons being made across many benchmarks.",
    153           "source": "haiku"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Percentage improvements with baselines are reported throughout (e.g., ablation table shows +6.7pp HumanEval with new corpus vs old; Figure 3 shows explicit pass@1 curves with magnitude visible).",
    159           "source": "haiku"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "Benchmark sizes are mentioned (HumanEval: 164 problems, AIME: 30 problems) but no sample size justification or power analysis is provided for any comparisons.",
    165           "source": "haiku"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No variance, standard deviation, or run-to-run variance is reported. All results appear to be single-run point estimates.",
    171           "source": "haiku"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Multiple baselines included: closed-source (GPT-4-Turbo-0409, GPT-4o-0513, Claude-3-Opus, Gemini-1.5-Pro) and open-source (StarCoder2, CodeLlama, DeepSeek-Coder-33B, Codestral, Llama3-70B).",
    179           "source": "haiku"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Baselines include GPT-4-Turbo-0409, GPT-4o-0513, Claude-3-Opus, and Gemini-1.5-Pro — all state-of-the-art models at the time of publication (June 2024).",
    185           "source": "haiku"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Table 1 ablates new vs old code corpus using a controlled 1B model; Figure 3 ablates reward model signal vs compiler signal in RL training showing reward model outperforms.",
    191           "source": "haiku"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Evaluation spans HumanEval, MBPP+, multilingual coding (13 languages), LiveCodeBench (by difficulty), USACO, RepoBench, FIM tasks, Defects4J, SWE-Bench, Aider, CruxEval, GSM8K, MATH, AIME, and NL benchmarks.",
    197           "source": "haiku"
    198         },
    199         "human_evaluation": {
    200           "applies": false,
    201           "answer": false,
    202           "justification": "Automated benchmarks with test cases serve as ground truth throughout; human evaluation of model outputs is not applicable to this code benchmark evaluation.",
    203           "source": "haiku"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Standard held-out test splits are used for all benchmarks; RepoBench uses only the December 2023 subset not present in training data.",
    209           "source": "haiku"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Table 3 breaks results down by 13 programming languages; Table 4 breaks LiveCodeBench by difficulty (Easy 82, Medium 87, Hard 57); Table 10 breaks NL benchmarks by domain.",
    215           "source": "haiku"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": false,
    220           "justification": "No systematic discussion of failure cases. Only a brief acknowledgment in the conclusion that SWE-bench performance lags and instruction-following has gaps relative to GPT-4-Turbo.",
    221           "source": "haiku"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "The paper reports that the Lite model underperforms Codestral on code completion (Table 5), CruxEval scores fall behind GPT-4o (70.0% vs 77.4%), and knowledge-intensive benchmark scores (TriviaQA) decline vs DeepSeek-V2.",
    227           "source": "haiku"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Most baselines specify version: GPT-4-Turbo-0409, GPT-4o-0513, GPT-4-1106; Claude-3-Opus and Gemini-1.5-Pro lack snapshot dates but are identified by specific model family names used at evaluation time.",
    235           "source": "haiku"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "HumanEval instruction prompt is given in footnote 4; math CoT prompt given in footnote 9. Key prompts for primary benchmarks are provided, though SWE-bench prompts are absent.",
    241           "source": "haiku"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "AdamW with β1=0.9, β2=0.95, weight decay 0.1; cosine decay with 2000 warm-up steps; SFT uses lr=5e-6, batch size=1M tokens; FIM rate 0.5. Some parameters deferred to DeepSeek-V2 paper.",
    247           "source": "haiku"
    248         },
    249         "scaffolding_described": {
    250           "applies": true,
    251           "answer": false,
    252           "justification": "SWE-Bench evaluation requires repository navigation and patch generation scaffolding that is not described in the paper. The paper only states 'whole format' for Aider without explaining the agent scaffolding.",
    253           "source": "haiku"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Section 2 documents filtering rules in detail: line length limits (avg>100, max>1000), alphabetic ratio <25%, XML/HTML/JSON/YAML-specific rules, near-deduplication, and three-iteration fastText classification pipeline.",
    259           "source": "haiku"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "Raw model outputs for each benchmark problem are not released. Only aggregated accuracy scores are reported in the paper's tables.",
    267           "source": "haiku"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
    272           "justification": "Section 2 describes collection in detail: GitHub repos before November 2023, CommonCrawl via fastText with three-iteration seed expansion, manual seed corpus construction, and final 1,170B token composition.",
    273           "source": "haiku"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants in this study; standard benchmarks and automated evaluation are used throughout.",
    279           "source": "haiku"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "Section 2 documents the full pipeline: seed corpus construction, fastText classifier training with BPE tokenizer, iterative URL collection, domain classification at 10% threshold, filtering, deduplication, and final composition (60% code / 10% math / 30% NL).",
    285           "source": "haiku"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "The paper explicitly states: 'We collect public repositories created before November 2023 on GitHub,' establishing a clear training data cutoff.",
    293           "source": "haiku"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "Section 4.2.1 explicitly discusses potential overlap for RepoBench (uses only December 2023 subset to avoid leakage); LiveCodeBench is chosen for contamination-free evaluation with problems from December 2023 - June 2024.",
    299           "source": "haiku"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "Contamination is not addressed for the primary benchmarks HumanEval (2021) and MBPP (2021), which substantially predate the November 2023 training cutoff and are very likely present in the GitHub/CommonCrawl training corpus.",
    305           "source": "haiku"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants in this study.",
    313           "source": "haiku"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "haiku"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "haiku"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "haiku"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "haiku"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "haiku"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "haiku"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "The MoE architecture with 2.4B/21B active parameters implies inference efficiency, but no concrete cost, latency, or throughput figures are reported.",
    357           "source": "haiku"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "The paper mentions training on 6 trillion additional tokens but does not report the compute budget in GPU-hours, FLOPs, or wall-clock training time.",
    363           "source": "haiku"
    364         }
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "DeepSeek-Coder-V2 achieves superior performance to GPT-4-Turbo, Claude 3 Opus, and Gemini 1.5 Pro on coding and math benchmarks",
    371       "evidence": "HumanEval: 90.2% vs 88.2% (GPT-4-Turbo); MBPP+: 76.2% vs 72.2%; MATH: 75.7% vs 73.4%; Aider: 73.7% vs 63.9%; but inferior on LiveCodeBench (43.4% vs 45.7%) and SWE-Bench (12.7% vs 18.3%)",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "The new code corpus is superior to the DeepSeek-Coder corpus, improving HumanEval by 6.7pp and MBPP by 9.4pp",
    376       "evidence": "Table 1: 1B model with new corpus at 2T tokens achieves 37.2% HumanEval vs 30.5% baseline and 54.0% MBPP vs 44.6% baseline in controlled ablation",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Using a reward model for RL training signal outperforms raw compiler feedback on Leetcode pass@1",
    381       "evidence": "Figure 3 shows reward model signal achieving higher pass@1 than compiler signal on both LeetCode and LeetCode-zh across all 600 training steps shown",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "DeepSeek-Coder-V2 is the first open-source model to exceed 10% on SWE-Bench",
    386       "evidence": "Table 7 shows DeepSeek-Coder-V2-Instruct achieves 12.7% on SWE-Bench; all other listed open-source models score 0.0-2.7%",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Continued pre-training on code data maintains comparable general language performance to DeepSeek-V2",
    391       "evidence": "Table 10 shows Coder-V2 leads on reasoning benchmarks (BBH: 83.9 vs 79.7, Arena-Hard: 65.0 vs 41.6) but lags on knowledge tasks (TriviaQA: 82.3 vs 86.7), which is a mixed result",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "DeepSeek-Coder-V2 handles 128K context reliably as shown by Needle-in-a-Haystack tests",
    396       "evidence": "Figure 2 shows high scores across all context lengths up to 128K in NIAH test, though individual score values cannot be read precisely from the heatmap",
    397       "supported": "moderate"
    398     }
    399   ],
    400   "methodology_tags": [
    401     "benchmark-eval"
    402   ],
    403   "key_findings": "DeepSeek-Coder-V2 (236B MoE with 21B active parameters) achieves competitive performance with GPT-4-Turbo on code and math benchmarks, becoming the first open-source model to exceed 10% on SWE-Bench (12.7%) and scoring 90.2% on HumanEval. The model demonstrates that continued pre-training from a strong general foundation (DeepSeek-V2) with 6 trillion code/math tokens can match closed-source frontier models on many benchmarks, though gaps remain on instruction-following-heavy tasks like SWE-Bench and LiveCodeBench where GPT-4-Turbo leads. Key technical contributions include a new 1,170B-token code corpus covering 338 languages, GRPO reinforcement learning with a reward model trained on compiler feedback, and 128K context extension via YaRN.",
    404   "red_flags": [
    405     {
    406       "flag": "No confidence intervals or variance",
    407       "detail": "All benchmark results are single-point estimates with no variance, standard deviation, or confidence intervals, making it impossible to assess whether differences between models are statistically meaningful."
    408     },
    409     {
    410       "flag": "Self-evaluation only",
    411       "detail": "All authors are DeepSeek-AI employees evaluating their own model with no independent verification or third-party replication. No competing interests declaration is made."
    412     },
    413     {
    414       "flag": "HumanEval/MBPP contamination unaddressed",
    415       "detail": "The primary benchmarks HumanEval (2021) and MBPP (2021) substantially predate the November 2023 training cutoff and are very likely present in the GitHub training corpus. Contamination is acknowledged only for RepoBench and LiveCodeBench."
    416     },
    417     {
    418       "flag": "SWE-Bench scaffolding undescribed",
    419       "detail": "SWE-Bench requires repository navigation and patch generation scaffolding that is never described in the paper, making the 12.7% result unreproducible."
    420     },
    421     {
    422       "flag": "Overclaiming in title and abstract",
    423       "detail": "The title claims 'Breaking the Barrier of Closed-Source Models' but DeepSeek-Coder-V2 scores below GPT-4-Turbo-0409 on LiveCodeBench (43.4% vs 45.7%) and SWE-Bench (12.7% vs 18.3%)."
    424     },
    425     {
    426       "flag": "No compute budget",
    427       "detail": "No GPU-hours, FLOPs, or training time is reported, making cost-benefit analysis impossible and fair comparison with other approaches difficult."
    428     }
    429   ],
    430   "cited_papers": [
    431     {
    432       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    433       "relevance": "Primary code generation benchmark used throughout; 164 Python problems evaluated in zero-shot setting"
    434     },
    435     {
    436       "title": "Program Synthesis with Large Language Models (MBPP)",
    437       "relevance": "Code synthesis benchmark; MBPP+ (EvalPlus) version used for stricter automated evaluation"
    438     },
    439     {
    440       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    441       "relevance": "Real-world software engineering benchmark; DeepSeek-Coder-V2 is first open-source model to exceed 10%"
    442     },
    443     {
    444       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    445       "relevance": "Contamination-free code benchmark chosen specifically because problems post-date training cutoff"
    446     },
    447     {
    448       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    449       "relevance": "Direct predecessor model; provides training data pipeline, instruction data, and architecture baseline"
    450     },
    451     {
    452       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    453       "relevance": "Math corpus collection pipeline and GRPO RL algorithm reused directly in this work"
    454     },
    455     {
    456       "title": "StarCoder 2 and the Stack V2: The Next Generation",
    457       "relevance": "Competing open-source code model used as baseline across code generation and completion evaluations"
    458     },
    459     {
    460       "title": "Code Llama: Open Foundation Models for Code",
    461       "relevance": "Competing open-source code model baseline across all code benchmarks"
    462     },
    463     {
    464       "title": "Measuring Mathematical Problem Solving with the MATH Dataset",
    465       "relevance": "Advanced math reasoning benchmark where DeepSeek-Coder-V2 achieves 75.7% matching GPT-4o"
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 3,
    471       "justification": "Open-source model weights released under permissive license; practitioners can directly deploy 16B and 236B models for code generation, completion, and fixing tasks."
    472     },
    473     "surprise_contrarian": {
    474       "score": 2,
    475       "justification": "Challenges the assumption that closed-source models dominate frontier code intelligence by demonstrating an open-source model can match GPT-4-Turbo on many benchmarks."
    476     },
    477     "fear_safety": {
    478       "score": 0,
    479       "justification": "No AI safety or risk concerns raised; the paper focuses entirely on capability benchmarks."
    480     },
    481     "drama_conflict": {
    482       "score": 1,
    483       "justification": "Mild open-source vs. closed-source narrative framed as 'breaking the barrier,' but without adversarial tone or controversy."
    484     },
    485     "demo_ability": {
    486       "score": 3,
    487       "justification": "Model weights publicly released on GitHub under permissive license; anyone can download and test immediately."
    488     },
    489     "brand_recognition": {
    490       "score": 2,
    491       "justification": "DeepSeek is a well-known AI lab noted for efficient open-source models; the name carries recognition in the open-source ML community."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [
    496       {
    497         "hn_id": "45222339",
    498         "title": "Analog In-Memory Computing Attention Mechanism for Fast LLMs",
    499         "points": 4,
    500         "comments": 0,
    501         "url": "https://news.ycombinator.com/item?id=45222339",
    502         "created_at": "2025-09-12T14:09:56Z"
    503       },
    504       {
    505         "hn_id": "40761106",
    506         "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models",
    507         "points": 3,
    508         "comments": 0,
    509         "url": "https://news.ycombinator.com/item?id=40761106",
    510         "created_at": "2024-06-22T18:34:13Z"
    511       },
    512       {
    513         "hn_id": "40834241",
    514         "title": "A Critical Study of What Code-LLMs (Do Not) Learn",
    515         "points": 2,
    516         "comments": 0,
    517         "url": "https://news.ycombinator.com/item?id=40834241",
    518         "created_at": "2024-06-30T00:15:06Z"
    519       },
    520       {
    521         "hn_id": "39441274",
    522         "title": "Speculative Streaming: Fast LLM Inference Without Auxiliary Models",
    523         "points": 2,
    524         "comments": 1,
    525         "url": "https://news.ycombinator.com/item?id=39441274",
    526         "created_at": "2024-02-20T13:55:45Z"
    527       },
    528       {
    529         "hn_id": "39461525",
    530         "title": "Speculative Streaming: Fast LLM Inference Without Auxiliary Models",
    531         "points": 2,
    532         "comments": 0,
    533         "url": "https://news.ycombinator.com/item?id=39461525",
    534         "created_at": "2024-02-22T00:24:15Z"
    535       },
    536       {
    537         "hn_id": "40442724",
    538         "title": "Analogical Reasoning-Augmented Interactive Data Annotation",
    539         "points": 1,
    540         "comments": 0,
    541         "url": "https://news.ycombinator.com/item?id=40442724",
    542         "created_at": "2024-05-22T16:16:38Z"
    543       },
    544       {
    545         "hn_id": "40111141",
    546         "title": "Lossless Acceleration of Long Sequence Generation",
    547         "points": 1,
    548         "comments": 0,
    549         "url": "https://news.ycombinator.com/item?id=40111141",
    550         "created_at": "2024-04-22T03:10:54Z"
    551       },
    552       {
    553         "hn_id": "37234305",
    554         "title": "Opportunities and Risks of LLMs for Scalable Deliberation with Polis",
    555         "points": 1,
    556         "comments": 0,
    557         "url": "https://news.ycombinator.com/item?id=37234305",
    558         "created_at": "2023-08-23T11:30:32Z"
    559       },
    560       {
    561         "hn_id": "37191375",
    562         "title": "Opportunities and Risks of LLMs for Scalable Deliberation with Polis",
    563         "points": 1,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=37191375",
    566         "created_at": "2023-08-19T18:00:10Z"
    567       }
    568     ],
    569     "top_points": 4,
    570     "total_points": 17,
    571     "total_comments": 1
    572   }
    573 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs