scan-v5.json (24704B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming - The Rise of Code Intelligence", 6 "authors": [ 7 "Guo, D.", 8 "Zhu, Q.", 9 "Yang, D.", 10 "Xie, Z.", 11 "et al." 12 ], 13 "year": 2024, 14 "venue": "arXiv", 15 "arxiv_id": "2401.14196", 16 "doi": null 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "Abstract claims SOTA among open-source code models and superiority over GPT-3.5 are backed by benchmark results in Tables 3–8; 2T token training and 16K context are documented in Sections 2–3.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Causal claims about FIM and repo-level pre-training are supported by ablation experiments: FIM rate ablation (Figure 3) and CrossCodeEval ablation with/without repo pre-training (Table 7).", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper's title claims 'The Rise of Code Intelligence' and the abstract asserts broad superiority, but evaluations are confined to narrow benchmarks (HumanEval, MBPP, LeetCode); no discussion of generalization limits beyond benchmark settings.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "Better performance is attributed to data quality and repo-level training without considering alternative explanations such as sheer data volume advantage, architectural differences, or training compute differences.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": false, 47 "justification": "Pass@1 on HumanEval is equated with 'code intelligence' throughout; no discussion of whether benchmark performance reflects real-world coding utility or the 
limitations of these proxies.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "There is no dedicated limitations or threats-to-validity section; only a brief one-sentence acknowledgment of potential LeetCode contamination in the results section.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "No specific threats to validity are enumerated; the contamination acknowledgment ('the possibility of data contamination cannot be entirely ruled out') is generic boilerplate.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper does not explicitly state what its results do not show (e.g., that HumanEval pass rates do not imply real-world productivity, or that comparisons are snapshot-in-time against specific model versions).", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding source is disclosed anywhere in the paper; the acknowledgments section lists individual contributors but no funding body or grant.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "Author affiliations are clearly disclosed: DeepSeek-AI and Peking University (Key Lab of HCST), with contact emails provided.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": true, 86 "answer": false, 87 "justification": "The majority of authors are DeepSeek-AI employees evaluating their own models; no independent third-party evaluation is performed.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests statement, patent disclosures, or financial 
interest declarations appear anywhere in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "Key technical terms such as Fill-in-the-Middle (PSM/SPM modes), repository-level data construction, and cross-file completion are explained clearly with sufficient specificity for the technical audience.", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Four explicit contributions are listed in the introduction: the DeepSeek-Coder model series, repo-level data construction, FIM training analysis, and comprehensive benchmark evaluations.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper situates itself against StarCoder, CodeLlama, CodeGeeX2, GPT-3.5/4, Codex, and related work including FIM training (Bavarian et al.) and deduplication (Lee et al., Kocetkov et al.).", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Source code and models are released on GitHub at https://github.com/deepseek-ai/DeepSeek-Coder, including the LeetCode evaluation benchmark.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": false, 130 "justification": "Training data (798 GB proprietary crawl from GitHub) is not released; evaluation uses public benchmarks but the custom training corpus required to reproduce the model is unavailable.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "The HAI-LLM framework and GPU cluster (A100/H800) are described but no requirements.txt, Dockerfile, or pinned dependency list is provided for reproducing training or evaluation.", 137 "source": 
"haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "No step-by-step instructions for reproducing training or evaluation are included; the paper describes the pipeline at a high level but not with sufficient detail to follow without guessing.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "All benchmark results are reported as single point estimates with no confidence intervals, error bars, or standard deviations across multiple runs.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are applied to any comparative claim despite multiple model comparisons across benchmarks with small evaluation set sizes (e.g., HumanEval n=164).", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Percentage point differences are reported throughout (e.g., '9% and 11% improvement over CodeLlama-Base 34B') with baseline context provided in all comparison tables.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "Benchmark sizes (HumanEval n=164, MBPP n=500) are not justified or discussed for statistical adequacy; no power analysis is mentioned.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "No variance, standard deviation, or multiple-run statistics are reported; all results appear to be single-run point estimates.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Strong baselines included across all tasks: CodeGeeX2, StarCoder, CodeLlama (7B/13B/34B), GPT-3.5-Turbo, 
GPT-4-Turbo, WizardCoder, Phind-CodeLlama.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "Baselines include CodeLlama (2023), StarCoder (2023), GPT-3.5/4-Turbo — all contemporary at the time of writing (January 2024).", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": true, 194 "justification": "Two ablations are presented: FIM rate comparison (0%, 50%, 100%, MSP) in Figure 3, and repo-level pre-training ablation ('w/o Repo Pre-training') in Table 7.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "Multiple metrics used: Pass@1, exact match (EM), edit similarity (ES), per-difficulty breakdown, and per-library breakdown across diverse benchmark suites.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Automated test-case-based evaluation is standard for code generation benchmarks; human evaluation of code correctness is not applicable here.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "Held-out sets used: CrossCodeEval (repositories from March–June 2023, after training cutoff), LeetCode Contest (July 2023–January 2024), and standard benchmarks with withheld solutions.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Per-language breakdowns in Tables 3 and 6, per-library breakdown in Table 4 (DS-1000), per-difficulty breakdown in Table 5 (LeetCode Easy/Medium/Hard).", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": false, 224 "justification": "Appendix only shows successful interaction examples (snake game, database); no failure cases or error analysis is presented.", 225 "source": "haiku" 226 }, 227 
"negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The paper reports that 100% FIM rate hurts code completion (Figure 3) and that DeepSeek-Coder-v1.5 shows slight coding regression vs. original 6.7B (Table 10: 43.2% vs 44.7% HumanEval).", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": false, 238 "justification": "DeepSeek models are specified by parameter count, but GPT-3.5-Turbo and GPT-4-Turbo are named without snapshot dates — critical given that OpenAI models change over time.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": false, 244 "justification": "Only the LeetCode evaluation template is provided; prompts for HumanEval, MBPP, DS-1000, CrossCodeEval, and math reasoning benchmarks are not given.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": true, 250 "justification": "Table 2 provides hidden size, layers, attention heads, batch size, learning rates; AdamW with β1=0.9, β2=0.95, FIM rate 0.5, warm-up steps, and LR scheduling are described.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "No agentic scaffolding is used; this is direct model evaluation on benchmarks.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Section 2 and Figure 2 document the full pipeline: crawling, rule filtering, dependency parsing (Algorithm 1), repo-level deduplication, quality screening, and n-gram decontamination.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": false, 270 "justification": "The 798 GB training corpus is proprietary and not publicly released; no raw training data is available for independent verification.", 271 
"source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Section 2 describes GitHub crawling scope (pre-February 2023), 87-language selection, filter rules, dependency parsing, deduplication, and quality screening with statistics in Table 1.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; evaluation uses automated benchmark testing against fixed test cases.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "Figure 2 shows the full data pipeline (crawl → filter → dependency parse → dedup → quality screen), and each step is described in dedicated subsections with specific algorithms and thresholds.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": true, 295 "answer": true, 296 "justification": "Explicitly stated: 'We collect public repositories created before February 2023 on GitHub.'", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": true, 301 "answer": true, 302 "justification": "Section 2.4 describes n-gram decontamination filtering HumanEval, MBPP, GSM8K, and MATH examples; CrossCodeEval's post-February 2023 construction is explicitly noted as preventing overlap.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": true, 307 "answer": true, 308 "justification": "N-gram filtering (10-gram or 3-gram exact match) applied for HumanEval/MBPP/GSM8K/MATH; LeetCode contamination is explicitly acknowledged as unresolvable and flagged for the community.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants in this study.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 
"applies": false, 321 "answer": false, 322 "justification": "No human participants in this study.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants in this study.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants in this study.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants in this study.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants in this study.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants in this study.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "No inference latency, memory requirements, or cost estimates are reported despite releasing models for public deployment.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "GPU cluster hardware is described (A100/H800) but total training compute (GPU-hours, FLOPs, or cost) is not reported for any model size.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "DeepSeek-Coder-Base 33B achieves state-of-the-art performance among open-source code models on HumanEval (50.3% avg across 8 languages) and MBPP (66.0%).", 375 "evidence": "Table 3 shows DeepSeek-Coder-Base 33B outperforming CodeLlama-34B (41.0% avg, 55.2% MBPP) and all other listed open-source models.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "DeepSeek-Coder-Instruct 33B outperforms GPT-3.5-Turbo on code generation 
benchmarks.", 380 "evidence": "Table 3 shows Instruct 33B at 69.2% avg vs GPT-3.5-Turbo 64.9%; Table 5 shows Instruct 33B at 27.8% vs GPT-3.5-Turbo 23.3% on LeetCode Contest.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "DeepSeek-Coder-Base 6.7B matches or exceeds CodeLlama-Base 34B despite having 5x fewer parameters.", 385 "evidence": "Table 3: DeepSeek 6.7B at 44.7% avg and 60.6% MBPP vs CodeLlama 34B at 41.0% avg and 55.2% MBPP.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Repository-level pre-training improves cross-file code completion performance.", 390 "evidence": "Table 7 ablation: 'w/o Repo Pre-training' shows performance drops on Java (16.64% vs 17.72% EM), TypeScript (13.23% vs 14.03% EM), and C# (14.48% vs 16.23% EM).", 391 "supported": "weak" 392 }, 393 { 394 "claim": "FIM training at 50% PSM rate optimally balances code completion (FIM) and code generation performance.", 395 "evidence": "Figure 3 ablation: 100% FIM rate maximizes HumanEval-FIM but minimizes HumanEval and MBPP pass@1; 50% PSM outperforms MSP strategy.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Continuing pre-training from a general LLM (DeepSeek-LLM-7B) significantly improves math and natural language capabilities of DeepSeek-Coder-v1.5.", 400 "evidence": "Table 10: GSM8K improves from 43.2% to 62.4%, MATH from 19.2% to 24.7%, MMLU from 36.6% to 49.1% at modest cost of ~1.5pp HumanEval regression.", 401 "supported": "strong" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval" 406 ], 407 "key_findings": "DeepSeek-Coder introduces a family of open-source code LLMs (1.3B–33B) trained from scratch on 2 trillion tokens with repository-level data organization and Fill-in-the-Middle training, achieving state-of-the-art performance among open-source models across HumanEval, MBPP, DS-1000, CrossCodeEval, and LeetCode Contest benchmarks. The 33B instruct variant surpasses GPT-3.5-Turbo on most code tasks. 
Ablations show that 50% PSM-mode FIM rate optimally balances completion and generation ability, and that repo-level pre-training provides modest but consistent improvements on cross-file completion. Continued pre-training from a general LLM substantially improves mathematical and natural language capabilities at minor cost to code performance.", 408 "red_flags": [ 409 { 410 "flag": "Self-evaluation", 411 "detail": "All evaluations are conducted by DeepSeek-AI employees on their own models; no independent third-party replication is reported." 412 }, 413 { 414 "flag": "No statistical significance testing", 415 "detail": "All comparative claims are based on point estimates with no confidence intervals, error bars, or significance tests, despite small benchmark sizes (HumanEval n=164)." 416 }, 417 { 418 "flag": "GPT baselines not version-pinned", 419 "detail": "GPT-3.5-Turbo and GPT-4-Turbo are referenced without snapshot dates; these models changed significantly during 2023–2024, making comparisons unreliable." 420 }, 421 { 422 "flag": "Training data not released", 423 "detail": "The 798 GB proprietary training corpus is not publicly available, making it impossible to fully reproduce the work or verify data quality claims." 424 }, 425 { 426 "flag": "LeetCode contamination unresolved", 427 "detail": "The paper acknowledges 'the possibility of data contamination cannot be entirely ruled out' for LeetCode, and notes higher scores in July/August contests, but does not resolve or quantify the contamination." 428 }, 429 { 430 "flag": "No limitations section", 431 "detail": "Despite making strong comparative claims (SOTA, surpassing GPT-3.5), there is no dedicated limitations or threats-to-validity section." 432 } 433 ], 434 "cited_papers": [ 435 { 436 "title": "Evaluating Large Language Models Trained on Code (HumanEval)", 437 "relevance": "Primary benchmark used throughout; introduces pass@k evaluation for code generation models." 
438 }, 439 { 440 "title": "StarCoder: may the source be with you!", 441 "relevance": "Key open-source baseline model and data source (StarCoder data pipeline) that DeepSeek-Coder directly competes with and builds upon." 442 }, 443 { 444 "title": "Code Llama: Open Foundation Models for Code", 445 "relevance": "Primary open-source baseline across all benchmarks; DeepSeek-Coder's 6.7B model is claimed to match CodeLlama-34B." 446 }, 447 { 448 "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion", 449 "relevance": "Used to evaluate the novel repo-level pre-training contribution; provides contamination-free evaluation (post February 2023)." 450 }, 451 { 452 "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation", 453 "relevance": "More realistic benchmark than HumanEval for practical data science coding tasks across 7 libraries." 454 }, 455 { 456 "title": "Efficient Training of Language Models to Fill in the Middle (FIM)", 457 "relevance": "Foundation for the Fill-in-the-Middle training objective that is a core contribution of DeepSeek-Coder." 458 }, 459 { 460 "title": "The Stack: 3 TB of Permissively Licensed Source Code", 461 "relevance": "Data source and deduplication methodology that DeepSeek-Coder's data pipeline extends with repo-level deduplication." 462 }, 463 { 464 "title": "Program Synthesis with Large Language Models (MBPP)", 465 "relevance": "Secondary code generation benchmark used throughout all comparisons." 466 } 467 ], 468 "engagement_factors": { 469 "practical_relevance": { 470 "score": 3, 471 "justification": "Models released open-source with a permissive license, directly usable by practitioners as a drop-in replacement for closed-source code assistants." 472 }, 473 "surprise_contrarian": { 474 "score": 2, 475 "justification": "An open-source model matching or beating GPT-3.5-Turbo on code was surprising at the time of publication (January 2024)." 
476 }, 477 "fear_safety": { 478 "score": 0, 479 "justification": "No AI safety or risk concerns raised; paper is purely a technical model introduction." 480 }, 481 "drama_conflict": { 482 "score": 1, 483 "justification": "Mild competitive framing against OpenAI's closed-source models, but no controversy or confrontational claims." 484 }, 485 "demo_ability": { 486 "score": 3, 487 "justification": "Models are publicly available on GitHub and HuggingFace; anyone can run them immediately." 488 }, 489 "brand_recognition": { 490 "score": 2, 491 "justification": "DeepSeek-AI has become well-known in the open-source LLM community; Peking University affiliation adds academic credibility." 492 } 493 }, 494 "hn_data": { 495 "threads": [ 496 { 497 "hn_id": "39142278", 498 "title": "Python has 189X the dataset size compared to Rust", 499 "points": 2, 500 "comments": 4, 501 "url": "https://news.ycombinator.com/item?id=39142278", 502 "created_at": "2024-01-26T13:18:01Z" 503 } 504 ], 505 "top_points": 2, 506 "total_points": 2, 507 "total_comments": 4 508 } 509 }