scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32660B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence",
      6     "authors": [
      7       "DeepSeek-AI"
      8     ],
      9     "year": 2024,
     10     "venue": "arXiv",
     11     "arxiv_id": "2406.11931",
     12     "doi": null
     13   },
     14   "checklist": {
     15     "claims_and_evidence": {
     16       "abstract_claims_supported": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The abstract claims 'performance comparable to GPT4-Turbo in code-specific tasks' and this is supported by Tables 3-9 showing competitive or superior results on most benchmarks.",
     20         "source": "opus"
     21       },
     22       "causal_claims_justified": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "Causal claims are limited and backed by controlled experiments. The corpus ablation (Table 1) uses the same 1B model architecture with different corpora. The reward model vs. compiler signal comparison (Figure 3) is a controlled comparison. Claims like 'continued pre-training enhances coding capabilities' are supported by before/after comparisons on the same architecture.",
     26         "source": "opus"
     27       },
     28       "generalization_bounded": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The title claims 'Breaking the Barrier of Closed-Source Models in Code Intelligence' — a very broad framing. While results show competitive performance on specific benchmarks, the paper acknowledges 'a significant gap in instruction-following capabilities' and poor SWE-bench performance. The title overclaims relative to the evidence.",
     32         "source": "opus"
     33       },
     34       "alternative_explanations_discussed": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No discussion of alternative explanations for the results. For example, improved performance could partly be due to benchmark contamination (training data includes GitHub repos that may contain benchmark solutions), but this is not discussed.",
     38         "source": "opus"
     39       },
     40       "proxy_outcome_distinction": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper frames benchmark performance (HumanEval, MBPP pass@1) as 'code intelligence' without discussing what code intelligence actually entails or whether these benchmarks adequately measure it. The gap between 'pass@1 on function-level benchmarks' and 'code intelligence' is not acknowledged.",
     44         "source": "opus"
     45       }
     46     },
     47     "limitations_and_scope": {
     48       "limitations_section_present": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No dedicated limitations section. The conclusion briefly mentions the instruction-following gap but this is more of a future work direction than a limitations discussion.",
     52         "source": "opus"
     53       },
     54       "threats_to_validity_specific": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No threats to validity discussed. No mention of evaluation methodology concerns, benchmark contamination risks, or limitations of the comparison methodology.",
     58         "source": "opus"
     59       },
     60       "scope_boundaries_stated": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No explicit scope boundaries stated. The paper does not state what the results do NOT show or what settings are excluded from the claims.",
     64         "source": "opus"
     65       }
     66     },
     67     "conflicts_of_interest": {
     68       "funding_disclosed": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "No funding or acknowledgments section. DeepSeek-AI is the sole affiliation but no funding sources are disclosed.",
     72         "source": "opus"
     73       },
     74       "affiliations_disclosed": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "All authors are listed under 'DeepSeek-AI'. The affiliation is clear.",
     78         "source": "opus"
     79       },
     80       "funder_independent_of_outcome": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "DeepSeek-AI is both the developer and evaluator of the model. The company has a direct commercial interest in the model performing well on benchmarks.",
     84         "source": "opus"
     85       },
     86       "financial_interests_declared": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "No competing interests statement. Authors presumably have financial interest in DeepSeek-AI's products but this is not declared.",
     90         "source": "opus"
     91       }
     92     },
     93     "scope_and_framing": {
     94       "key_terms_defined": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "'Code intelligence' is used throughout without definition; 'comparable' and 'superior' performance are not operationalized, and the threshold for 'breaking the barrier' is never specified.",
     98         "source": "haiku"
     99       },
    100       "intended_contribution_clear": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 1.1 explicitly enumerates three contributions: the 16B/236B MoE models, first open-source 100B+ code model exceeding closed-source, and permissive public release.",
    104         "source": "haiku"
    105       },
    106       "engagement_with_prior_work": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper situates itself explicitly against StarCoder, StarCoder2, CodeLlama, DeepSeek-Coder, and Codestral, describing how each differs in size, data, and performance, and contextualizes the gap to closed-source models it aims to close.",
    110         "source": "haiku"
    111       }
    112     }
    113   },
    114   "type_checklist": {
    115     "empirical": {
    116       "artifacts": {
    117         "code_released": {
    118           "applies": true,
    119           "answer": true,
    120           "justification": "GitHub repository URL provided in the paper header: https://github.com/deepseek-ai/DeepSeek-Coder-V2. Model weights released under permissive license.",
    121           "source": "opus"
    122         },
    123         "data_released": {
    124           "applies": true,
    125           "answer": false,
    126           "justification": "The training data (6T tokens from GitHub, CommonCrawl) is not released. Standard evaluation benchmarks (HumanEval, MBPP, etc.) are public, but the proprietary training corpus is not available.",
    127           "source": "opus"
    128         },
    129         "environment_specified": {
    130           "applies": true,
    131           "answer": false,
    132           "justification": "No environment specifications, requirements.txt, or dependency details are provided in the paper.",
    133           "source": "opus"
    134         },
    135         "reproduction_instructions": {
    136           "applies": true,
    137           "answer": false,
    138           "justification": "No step-by-step reproduction instructions for the benchmark evaluations are provided in the paper. The paper mentions using 'identical scripts and environments' for baselines but does not provide these.",
    139           "source": "opus"
    140         }
    141       },
    142       "statistical_methodology": {
    143         "confidence_intervals_or_error_bars": {
    144           "applies": true,
    145           "answer": false,
    146           "justification": "All results are reported as single point estimates (e.g., '90.2% on HumanEval') with no confidence intervals or error bars.",
    147           "source": "opus"
    148         },
    149         "significance_tests": {
    150           "applies": true,
    151           "answer": false,
    152           "justification": "The paper claims superiority over various models based solely on comparing raw numbers. No statistical significance tests are performed.",
    153           "source": "opus"
    154         },
    155         "effect_sizes_reported": {
    156           "applies": true,
    157           "answer": true,
    158           "justification": "Results are reported with baseline context throughout — e.g., ablation study shows improvement from 30.5% to 37.2% on HumanEval. All comparison tables show both the proposed model and baselines' raw scores.",
    159           "source": "opus"
    160         },
    161         "sample_size_justified": {
    162           "applies": true,
    163           "answer": false,
    164           "justification": "No justification for benchmark sizes or number of evaluation examples. Standard benchmarks are used (HumanEval has 164 problems) but no discussion of whether these are sufficient for the claims.",
    165           "source": "opus"
    166         },
    167         "variance_reported": {
    168           "applies": true,
    169           "answer": false,
    170           "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run greedy decoding (the paper states 'greedy search strategy'). The one exception is AIME where maj@64 is mentioned but this is a different metric, not variance.",
    171           "source": "opus"
    172         }
    173       },
    174       "evaluation_design": {
    175         "baselines_included": {
    176           "applies": true,
    177           "answer": true,
    178           "justification": "Extensive baseline comparisons including CodeLlama, StarCoder, StarCoder2, DeepSeek-Coder, Codestral, Llama3, GPT-4 variants, Claude 3 Opus, and Gemini 1.5 Pro (Tables 3-10).",
    179           "source": "opus"
    180         },
    181         "baselines_contemporary": {
    182           "applies": true,
    183           "answer": true,
    184           "justification": "Baselines include GPT-4o-0513, Claude 3 Opus, Gemini 1.5 Pro, Llama3-70B, and Codestral — all contemporary at time of publication (June 2024).",
    185           "source": "opus"
    186         },
    187         "ablation_study": {
    188           "applies": true,
    189           "answer": true,
    190           "justification": "Table 1 provides an ablation study comparing the new code corpus vs. old DeepSeek-Coder corpus using a 1B parameter model. Figure 3 compares reward model signal vs. compiler signal for RL training.",
    191           "source": "opus"
    192         },
    193         "multiple_metrics": {
    194           "applies": true,
    195           "answer": true,
    196           "justification": "Multiple benchmarks and metrics used: HumanEval, MBPP+, LiveCodeBench, USACO, SWE-bench, Aider, Defects4J, CruxEval, RepoBench, FIM tasks, GSM8K, MATH, AIME, MMLU, BBH, Arena-Hard, MT-Bench, etc.",
    197           "source": "opus"
    198         },
    199         "human_evaluation": {
    200           "applies": true,
    201           "answer": false,
    202           "justification": "No human evaluation of model outputs. All evaluations are automated (test case pass/fail, exact match, GPT-4 as judge for Arena-Hard/MT-Bench). The paper claims code quality but relies entirely on automated metrics.",
    203           "source": "opus"
    204         },
    205         "held_out_test_set": {
    206           "applies": true,
    207           "answer": true,
    208           "justification": "Standard held-out benchmarks used. For RepoBench, the paper explicitly uses only December 2023 data to avoid overlap with training data (cutoff November 2023). LiveCodeBench uses problems from Dec 2023-June 2024.",
    209           "source": "opus"
    210         },
    211         "per_category_breakdown": {
    212           "applies": true,
    213           "answer": true,
    214           "justification": "Detailed per-language breakdowns across 14 programming languages in Table 3, per-difficulty breakdowns for LiveCodeBench (Easy/Medium/Hard in Table 4), per-context-length breakdowns for RepoBench (Table 5).",
    215           "source": "opus"
    216         },
    217         "failure_cases_discussed": {
    218           "applies": true,
    219           "answer": true,
    220           "justification": "The conclusion discusses failure modes: 'there is still a significant gap in instruction-following capabilities compared to current state-of-the-art models like GPT-4 Turbo. This gap leads to poor performance in complex scenarios and tasks such as those in SWEbench.'",
    221           "source": "opus"
    222         },
    223         "negative_results_reported": {
    224           "applies": true,
    225           "answer": true,
    226           "justification": "Several negative results reported: DeepSeek-Coder-V2-Lite drops on knowledge-intensive benchmarks like TriviaQA (59.5% vs 65.2% for DeepSeek-V2-Lite). Performance gap on CruxEval vs closed-source models acknowledged. Instability during training with exponential normalization noted.",
    227           "source": "opus"
    228         }
    229       },
    230       "setup_transparency": {
    231         "model_versions_specified": {
    232           "applies": true,
    233           "answer": true,
    234           "justification": "Specific model versions given for baselines: GPT-4-1106, GPT-4-Turbo-0409, GPT-4o-0513. Own model sizes (16B/236B with active params 2.4B/21B) clearly specified. Table 2 details training settings.",
    235           "source": "opus"
    236         },
    237         "prompts_provided": {
    238           "applies": true,
    239           "answer": true,
    240           "justification": "The HumanEval prompt template is provided in footnote 4. Math evaluation prompt is given in footnote 9 ('Please reason step by step, and put your final answer within \\boxed{}'). FIM format tokens shown in Section 3.1.",
    241           "source": "opus"
    242         },
    243         "hyperparameters_reported": {
    244           "applies": true,
    245           "answer": true,
    246           "justification": "Training hyperparameters reported in Section 3.3: AdamW with β1=0.9, β2=0.95, weight decay 0.1, cosine decay with 2000 warmup steps. SFT: lr 5e-6, batch 1M tokens, 1B tokens total. YARN hyperparameters: scale 40, α=1, β=32. Evaluation uses greedy search.",
    247           "source": "opus"
    248         },
    249         "scaffolding_described": {
    250           "applies": false,
    251           "answer": false,
    252           "justification": "No agentic scaffolding is used. The model is evaluated directly on benchmarks via standard prompting.",
    253           "source": "opus"
    254         },
    255         "data_preprocessing_documented": {
    256           "applies": true,
    257           "answer": true,
    258           "justification": "Section 2 provides detailed data filtering rules: average line length <100, max line length <1000, >25% alphabetic chars, XML filtering, HTML visible text ratio >20%, JSON/YAML 50-5000 chars. Near-deduplication applied. Three-iteration CommonCrawl pipeline with fastText classifier described.",
    259           "source": "opus"
    260         }
    261       },
    262       "data_integrity": {
    263         "raw_data_available": {
    264           "applies": true,
    265           "answer": false,
    266           "justification": "Raw evaluation outputs are not available. Only aggregate scores are reported. Training data is not released.",
    267           "source": "opus"
    268         },
    269         "data_collection_described": {
    270           "applies": true,
    271           "answer": true,
    272         "justification": "Section 2 describes data collection in detail: GitHub repos created before November 2023, and a three-iteration CommonCrawl pipeline using a fastText classifier seeded with a corpus from StackOverflow/PyTorch docs/StackExchange. Together the GitHub and CommonCrawl sources yield 1,170B code-related tokens and 221B math-related tokens.",
    273           "source": "opus"
    274         },
    275         "recruitment_methods_described": {
    276           "applies": false,
    277           "answer": false,
    278           "justification": "No human participants. Data sources are standard benchmarks and public code repositories.",
    279           "source": "opus"
    280         },
    281         "data_pipeline_documented": {
    282           "applies": true,
    283           "answer": true,
    284           "justification": "The data pipeline is documented: filtering rules → near-deduplication → 821B code + 185B code-related text from GitHub; fastText seed corpus → iterative CommonCrawl collection → 70B code-related + 221B math-related tokens; plus 94B additional GitHub source code from two iterations. Final composition: 60% code, 10% math, 30% NL.",
    285           "source": "opus"
    286         }
    287       },
    288       "contamination": {
    289         "training_cutoff_stated": {
    290           "applies": true,
    291           "answer": true,
    292           "justification": "Training data cutoff stated as 'before November 2023' for GitHub data (Section 2). The paper uses this to justify temporal splits for RepoBench evaluation.",
    293           "source": "opus"
    294         },
    295         "train_test_overlap_discussed": {
    296           "applies": true,
    297           "answer": true,
    298           "justification": "For RepoBench, the paper explicitly addresses overlap: 'we only use data from December 2023' to avoid data present in pre-training. For LiveCodeBench, the paper uses the subset from Dec 2023 to June 2024 since 'the cut-off of the training data is before November 2023.'",
    299           "source": "opus"
    300         },
    301         "benchmark_contamination_addressed": {
    302           "applies": true,
    303           "answer": false,
    304           "justification": "While temporal splits are used for RepoBench and LiveCodeBench, the paper does not address contamination for older benchmarks like HumanEval (2021), MBPP (2021), GSM8K (2021), or MATH (2021), all of which were available online well before the November 2023 training cutoff. These are among the paper's most prominent results.",
    305           "source": "opus"
    306         }
    307       },
    308       "human_studies": {
    309         "pre_registered": {
    310           "applies": false,
    311           "answer": false,
    312           "justification": "No human participants in this study.",
    313           "source": "opus"
    314         },
    315         "irb_or_ethics_approval": {
    316           "applies": false,
    317           "answer": false,
    318           "justification": "No human participants in this study.",
    319           "source": "opus"
    320         },
    321         "demographics_reported": {
    322           "applies": false,
    323           "answer": false,
    324           "justification": "No human participants in this study.",
    325           "source": "opus"
    326         },
    327         "inclusion_exclusion_criteria": {
    328           "applies": false,
    329           "answer": false,
    330           "justification": "No human participants in this study.",
    331           "source": "opus"
    332         },
    333         "randomization_described": {
    334           "applies": false,
    335           "answer": false,
    336           "justification": "No human participants in this study.",
    337           "source": "opus"
    338         },
    339         "blinding_described": {
    340           "applies": false,
    341           "answer": false,
    342           "justification": "No human participants in this study.",
    343           "source": "opus"
    344         },
    345         "attrition_reported": {
    346           "applies": false,
    347           "answer": false,
    348           "justification": "No human participants in this study.",
    349           "source": "opus"
    350         }
    351       },
    352       "cost_and_practicality": {
    353         "inference_cost_reported": {
    354           "applies": true,
    355           "answer": false,
    356           "justification": "No inference cost or latency reported despite the model being 236B parameters. Active parameter counts (2.4B, 21B) are given but no wall-clock time or API cost data.",
    357           "source": "opus"
    358         },
    359         "compute_budget_stated": {
    360           "applies": true,
    361           "answer": false,
    362           "justification": "Total training compute is not quantified in GPU hours or cost. The paper states 10.2T tokens were used for training but does not report the hardware or time required.",
    363           "source": "opus"
    364         }
    365       },
    366       "experimental_rigor": {
    367         "seed_sensitivity_reported": {
    368           "applies": true,
    369           "answer": false,
    370           "justification": "No seed sensitivity analysis. Results appear to be single-run evaluations using greedy decoding.",
    371           "source": "opus"
    372         },
    373         "number_of_runs_stated": {
    374           "applies": true,
    375           "answer": false,
    376           "justification": "Number of runs not stated for any benchmark evaluation. Greedy decoding implies single-run but this is not explicitly confirmed for all benchmarks.",
    377           "source": "opus"
    378         },
    379         "hyperparameter_search_budget": {
    380           "applies": true,
    381           "answer": false,
    382           "justification": "No hyperparameter search budget reported for training or evaluation. Learning rates and other hyperparameters are stated but not how they were selected.",
    383           "source": "opus"
    384         },
    385         "best_config_selection_justified": {
    386           "applies": true,
    387           "answer": false,
    388           "justification": "The paper states hyperparameters are 'consistent with DeepSeek V2 methodology' but does not justify the selection or explain if any search was performed for the new training phase.",
    389           "source": "opus"
    390         },
    391         "multiple_comparison_correction": {
    392           "applies": false,
    393           "answer": false,
    394           "justification": "No statistical tests are performed, so multiple comparison correction is not applicable.",
    395           "source": "opus"
    396         },
    397         "self_comparison_bias_addressed": {
    398           "applies": true,
    399           "answer": false,
    400           "justification": "DeepSeek-AI is evaluating its own model against competitors. The paper claims to use 'identical scripts and environments' for baselines but does not acknowledge the inherent bias of self-evaluation or discuss whether independent evaluation was performed.",
    401           "source": "opus"
    402         },
    403         "compute_budget_vs_performance": {
    404           "applies": true,
    405           "answer": false,
    406           "justification": "No performance vs. compute comparison. The 236B model uses 21B active params vs. competitors' full dense models but inference FLOPs are not compared. The paper notes the smaller active params as an advantage without quantifying.",
    407           "source": "opus"
    408         },
    409         "benchmark_construct_validity": {
    410           "applies": true,
    411           "answer": false,
    412           "justification": "The paper uses many benchmarks but does not discuss whether they actually measure 'code intelligence' as claimed. No discussion of construct validity for any benchmark.",
    413           "source": "opus"
    414         },
    415         "scaffold_confound_addressed": {
    416           "applies": false,
    417           "answer": false,
    418           "justification": "No scaffolding is used in the evaluations. Models are evaluated directly via standard prompting.",
    419           "source": "opus"
    420         }
    421       },
    422       "data_leakage": {
    423         "temporal_leakage_addressed": {
    424           "applies": true,
    425           "answer": true,
    426           "justification": "Temporal leakage addressed for RepoBench (uses only December 2023 data) and LiveCodeBench (uses problems from Dec 2023-June 2024, after training cutoff). However, not addressed for older benchmarks.",
    427           "source": "opus"
    428         },
    429         "feature_leakage_addressed": {
    430           "applies": true,
    431           "answer": false,
    432           "justification": "No discussion of feature leakage in the evaluation setup.",
    433           "source": "opus"
    434         },
    435         "non_independence_addressed": {
    436           "applies": true,
    437           "answer": false,
    438           "justification": "No discussion of whether training data and benchmark problems share structural similarities (e.g., HumanEval problems sourced from common programming patterns that appear in GitHub training data).",
    439           "source": "opus"
    440         },
    441         "leakage_detection_method": {
    442           "applies": true,
    443           "answer": false,
    444           "justification": "No concrete leakage detection method applied. Temporal splits are used for two benchmarks but no membership inference, canary strings, or n-gram overlap analysis performed.",
    445           "source": "opus"
    446         }
    447       }
    448     }
    449   },
    450   "claims": [
    451     {
    452       "claim": "DeepSeek-Coder-V2 achieves superior performance compared to GPT-4-Turbo in coding and math benchmarks",
    453       "evidence": "On HumanEval average across languages, DS-Coder-V2 scores 75.3% vs GPT-4-Turbo's 72.3%; MATH 75.7% vs 73.4%; but GPT-4-Turbo leads on LiveCodeBench (45.7% vs 43.4%) and SWE-Bench (18.3% vs 12.7%)",
    454       "supported": "weak"
    455     },
    456     {
    457       "claim": "The new code corpus is superior to DeepSeek-Coder's corpus for pre-training",
    458       "evidence": "Table 1 ablation at 1B scale: HumanEval improves from 30.5% to 37.2% and MBPP from 44.6% to 54.0% with the new corpus on 2T tokens",
    459       "supported": "moderate"
    460     },
    461     {
    462       "claim": "DeepSeek-Coder-V2 is the first open-source model to exceed 10% on SWE-Bench",
    463       "evidence": "Table 7 shows DS-Coder-V2-Instruct at 12.7% while all other open-source models including DS-Coder-33B score 0.0%",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Reward model signal outperforms raw compiler feedback for RL training",
    468       "evidence": "Figure 3 shows higher LeetCode and LeetCode-zh Pass@1 curves for reward model signal vs compiler signal throughout training",
    469       "supported": "moderate"
    470     },
    471     {
    472       "claim": "DeepSeek-Coder-V2 achieves 75.7% on MATH, comparable to GPT-4o's 76.6%",
    473       "evidence": "Table 9 direct comparison; DS-Coder-V2 236B gets 75.7% vs GPT-4o's 76.6% on the MATH benchmark",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Continued pre-training from DeepSeek-V2 maintains general language performance while enhancing code/math",
    478       "evidence": "Table 10 shows DS-Coder-V2-Instruct scores within a few points of DeepSeek-V2-Chat on MMLU (79.2% vs 78.1%), C-Eval, and CMMLU",
    479       "supported": "strong"
    480     }
    481   ],
    482   "methodology_tags": [
    483     "benchmark-eval"
    484   ],
    485   "key_findings": "DeepSeek-Coder-V2 is an open-source MoE code LLM that reaches competitive performance with GPT-4-Turbo on select coding and math benchmarks by continuing pre-training of DeepSeek-V2 on 6T additional tokens (60% code, 10% math, 30% NL). The 236B model is the first open-source model to exceed 10% on SWE-Bench and achieves state-of-the-art open-source results on HumanEval, MBPP+, MATH, and AIME. However, a significant instruction-following gap remains compared to frontier closed-source models, and the model is clearly inferior on SWE-Bench (12.7% vs 18.3%) and LiveCodeBench vs GPT-4-Turbo. The 'barrier broken' framing overstates results that are competitive but not uniformly superior.",
    486   "red_flags": [
    487     {
    488       "flag": "Self-evaluation bias",
    489       "detail": "DeepSeek-AI employees evaluated their own model with no independent third-party verification of benchmark results."
    490     },
    491     {
    492       "flag": "Overclaiming in title and abstract",
    493       "detail": "'Breaking the Barrier' and 'superior performance' claims are not uniformly supported; the model trails GPT-4-Turbo on SWE-Bench (12.7% vs 18.3%) and LiveCodeBench (43.4% vs 45.7%)."
    494     },
    495     {
    496       "flag": "No variance across runs",
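    497       "detail": "Benchmark results are reported as point estimates with no variance, apparently from single greedy-decoding runs (the number of runs is never stated; AIME's maj@64 is a different metric, not a variance measure). Greedy decoding removes sampling stochasticity, but evaluation noise from benchmark construction is never addressed."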
    498     },
    499     {
    500       "flag": "Ablations at 1B scale only",
    501       "detail": "Corpus quality ablations are conducted at 1B parameters; it is unclear whether the same improvements scale to 16B and 236B models."
    502     },
    503     {
    504       "flag": "No compute budget disclosure",
    505       "detail": "Training costs in GPU-hours are never disclosed, making it impossible to assess resource requirements for replication or comparison."
    506     },
    507     {
    508       "flag": "No statistical significance testing",
    509       "detail": "Performance differences between models on benchmarks with 164–800 examples are presented as conclusive without any significance tests or confidence intervals."
    510     }
    511   ],
    512   "cited_papers": [
    513     {
    514       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    515       "relevance": "Primary code generation benchmark used throughout all experiments; defines the evaluation standard the paper claims to advance"
    516     },
    517     {
    518       "title": "Program Synthesis with Large Language Models (MBPP)",
    519       "relevance": "Second primary code generation benchmark; MBPP+ variant used for main comparisons"
    520     },
    521     {
    522       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    523       "relevance": "Key contamination-free benchmark using post-cutoff problems; used to validate competitive programming capability"
    524     },
    525     {
    526       "title": "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?",
    527       "relevance": "Real-world software engineering benchmark on which DeepSeek-Coder-V2 is the first open-source model to score above 10%"
    528     },
    529     {
    530       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    531       "relevance": "Direct predecessor model; provides baseline and training data/methodology that V2 builds upon"
    532     },
    533     {
    534       "title": "DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model",
    535       "relevance": "Base architecture and intermediate checkpoint that DeepSeek-Coder-V2 continues pre-training from"
    536     },
    537     {
    538       "title": "DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models",
    539       "relevance": "Provides the math data collection pipeline and GRPO algorithm used in this work"
    540     },
    541     {
    542       "title": "StarCoder 2 and the Stack V2: The Next Generation",
    543       "relevance": "Primary competing open-source code model; used as baseline in all code generation evaluations"
    544     }
    545   ],
    546   "engagement_factors": {
    547     "practical_relevance": {
    548       "score": 2,
    549       "justification": "Open-source code model with support for 338 programming languages and a 128K context window that practitioners can deploy themselves, though doing so is not as simple as making an API call."
    550     },
    551     "surprise_contrarian": {
    552       "score": 2,
    553       "justification": "An open-source model matching GPT-4 Turbo on code benchmarks was a notable achievement at the time, challenging the closed-source dominance narrative."
    554     },
    555     "fear_safety": {
    556       "score": 0,
    557       "justification": "No safety, security, or risk themes are discussed in the paper."
    558     },
    559     "drama_conflict": {
    560       "score": 1,
    561       "justification": "The title explicitly frames it as 'breaking the barrier' of closed-source models, creating mild tension with OpenAI/Google/Anthropic, but doesn't directly accuse anyone."
    562     },
    563     "demo_ability": {
    564       "score": 2,
    565       "justification": "Weights are publicly released on GitHub/HuggingFace under a permissive license, though running a 236B MoE model requires substantial hardware."
    566     },
    567     "brand_recognition": {
    568       "score": 2,
    569       "justification": "DeepSeek gained significant recognition in the AI community and the paper directly benchmarks against GPT-4, Claude, and Gemini."
    570     }
    571   },
    572   "hn_data": {
    573     "threads": [
    574       {
    575         "hn_id": "45222339",
    576         "title": "Analog In-Memory Computing Attention Mechanism for Fast LLMs",
    577         "points": 4,
    578         "comments": 0,
    579         "url": "https://news.ycombinator.com/item?id=45222339",
    580         "created_at": "2025-09-12T14:09:56Z"
    581       },
    582       {
    583         "hn_id": "40761106",
    584         "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models",
    585         "points": 3,
    586         "comments": 0,
    587         "url": "https://news.ycombinator.com/item?id=40761106",
    588         "created_at": "2024-06-22T18:34:13Z"
    589       },
    590       {
    591         "hn_id": "40834241",
    592         "title": "A Critical Study of What Code-LLMs (Do Not) Learn",
    593         "points": 2,
    594         "comments": 0,
    595         "url": "https://news.ycombinator.com/item?id=40834241",
    596         "created_at": "2024-06-30T00:15:06Z"
    597       },
    598       {
    599         "hn_id": "39441274",
    600         "title": "Speculative Streaming: Fast LLM Inference Without Auxiliary Models",
    601         "points": 2,
    602         "comments": 1,
    603         "url": "https://news.ycombinator.com/item?id=39441274",
    604         "created_at": "2024-02-20T13:55:45Z"
    605       },
    606       {
    607         "hn_id": "39461525",
    608         "title": "Speculative Streaming: Fast LLM Inference Without Auxiliary Models",
    609         "points": 2,
    610         "comments": 0,
    611         "url": "https://news.ycombinator.com/item?id=39461525",
    612         "created_at": "2024-02-22T00:24:15Z"
    613       },
    614       {
    615         "hn_id": "40442724",
    616         "title": "Analogical Reasoning-Augmented Interactive Data Annotation",
    617         "points": 1,
    618         "comments": 0,
    619         "url": "https://news.ycombinator.com/item?id=40442724",
    620         "created_at": "2024-05-22T16:16:38Z"
    621       },
    622       {
    623         "hn_id": "40111141",
    624         "title": "Lossless Acceleration of Long Sequence Generation",
    625         "points": 1,
    626         "comments": 0,
    627         "url": "https://news.ycombinator.com/item?id=40111141",
    628         "created_at": "2024-04-22T03:10:54Z"
    629       },
    630       {
    631         "hn_id": "37234305",
    632         "title": "Opportunities and Risks of LLMs for Scalable Deliberation with Polis",
    633         "points": 1,
    634         "comments": 0,
    635         "url": "https://news.ycombinator.com/item?id=37234305",
    636         "created_at": "2023-08-23T11:30:32Z"
    637       },
    638       {
    639         "hn_id": "37191375",
    640         "title": "Opportunities and Risks of LLMs for Scalable Deliberation with Polis",
    641         "points": 1,
    642         "comments": 0,
    643         "url": "https://news.ycombinator.com/item?id=37191375",
    644         "created_at": "2023-08-19T18:00:10Z"
    645       }
    646     ],
    647     "top_points": 4,
    648     "total_points": 17,
    649     "total_comments": 1
    650   }
    651 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs