scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (31220B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DeepSeek-Coder: When the Large Language Model Meets Programming — The Rise of Code Intelligence",
      6     "authors": [
      7       "Guo, D.",
      8       "Zhu, Q.",
      9       "Yang, D.",
     10       "Xie, Z.",
     11       "et al."
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv",
     15     "arxiv_id": "2401.14196",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims that DeepSeek-Coder 'achieves state-of-the-art performance among open-source code models' and 'surpasses existing closed-source models like Codex and GPT-3.5' are supported by Tables 3-8. The claim about surpassing GPT-3.5 is hedged with 'majority of the evaluation benchmarks.'",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims are primarily from ablation studies: FIM rate effect (Figure 3, controlled single-variable manipulation) and repository-level pre-training effect (Table 7, 'w/o Repo Pre-training' comparison). These ablations involve controlled manipulation of single variables.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The subtitle 'When the Large Language Model Meets Programming — The Rise of Code Intelligence' is extremely broad: it suggests a general claim about code intelligence, while results are on specific benchmarks. The paper does not explicitly bound its generalization claims to the tested benchmarks and languages.",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No discussion of alternative explanations for performance improvements. The paper attributes gains to data quality and training methodology but does not consider whether larger training data volume, different tokenization, or other factors could explain the results.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper frames benchmark performance (Pass@1 on HumanEval, MBPP) as evidence of 'code intelligence' and 'programming' capability without discussing the gap between benchmark performance and real-world coding ability. The title and introduction frame this broadly as 'code intelligence' but measurements are narrow benchmark metrics.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No limitations or threats-to-validity section. The conclusion mentions future work on long-context but does not discuss study limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed. The paper does briefly acknowledge contamination risk for LeetCode (Section 4.1) but this is not a systematic discussion of threats.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit scope boundaries are stated. The paper does not discuss what its benchmark results do or do not show about real-world coding ability.",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding disclosure. The paper is from DeepSeek-AI but does not mention any funding source or grants.",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are listed: DeepSeek-AI and Peking University. The paper is evaluating DeepSeek-AI's own models.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "DeepSeek-AI is evaluating its own product. The company has a direct commercial interest in demonstrating strong performance for its models. This conflict is not acknowledged.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement. DeepSeek-AI authors evaluating DeepSeek-Coder models have obvious financial interests in positive results, but this is not declared.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Fill-in-the-Middle (FIM), PSM/SPM modes, repository-level pre-training, and evaluation benchmarks are all defined or cited to prior work with clear explanations.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Four explicit contributions are listed in the introduction: the model series, first repository-level pre-training, FIM strategy analysis, and benchmark evaluations.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper situates DeepSeek-Coder against StarCoder, CodeLlama, CodeGeeX2, and Codex, and builds directly on prior FIM work (Bavarian et al.) and deduplication research (Lee et al., Kocetkov et al.).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "GitHub repository URL provided: https://github.com/deepseek-ai/DeepSeek-Coder. The LeetCode benchmark is also released at the same repository.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "The training dataset (798GB of source code) is not released. The paper describes the data collection pipeline but does not provide a download link for the training corpus. Evaluation benchmarks used are public (HumanEval, MBPP, DS-1000, CrossCodeEval).",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The paper mentions using HAI-LLM framework, NVIDIA A100 and H800 GPUs, and FlashAttention v2, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions for reproduction.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided for training or evaluation. The paper describes the methodology at a high level but lacks specific commands or scripts to reproduce results.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables 3-10 are reported as point estimates (e.g., '56.1% accuracy') with no confidence intervals or error bars.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "The paper claims DeepSeek-Coder 'surpasses' and 'outperforms' various baselines based solely on comparing numbers in tables, without any statistical significance tests.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "The paper reports absolute performance numbers with baseline context, e.g., 'our model has demonstrated a notable improvement of 9% and 11% in accuracy' compared to CodeLlama-Base 34B on HumanEval and MBPP respectively (Section 4.1). Per-benchmark breakdowns provide sufficient context.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification for benchmark sample sizes. The LeetCode benchmark uses 180 problems and the paper does not discuss whether this is sufficient for the claims made.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance or standard deviation is reported across experimental runs. All results appear to be single-run greedy decoding numbers. The paper states 'we adopted a greedy search approach' but does not report variance across seeds or runs.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Comprehensive baselines included: CodeGeeX2, StarCoder, CodeLlama (7B/13B/34B), code-cushman-001, GPT-3.5, GPT-4 across all benchmarks (Tables 3-8).",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include contemporaneous models: CodeLlama (2023), StarCoder (2023), GPT-4 (2023), and GPT-3.5-Turbo. These were state-of-the-art at time of writing.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "FIM ablation study in Section 3.1.2 compares 0% FIM, 50% FIM, 100% FIM, and 50% MSP (Masked Span Prediction) rates on the 1.3B model, with results in Figure 3. Section 4.3 Table 7 includes ablation of repository-level pre-training ('w/o Repo Pre-training').",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics used: Pass@1 on HumanEval and MBPP, exact match (EM) and edit similarity (ES) on CrossCodeEval, per-library accuracy on DS-1000, and difficulty-stratified results on LeetCode.",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human evaluation of code quality. All evaluation is automated via test case execution (Pass@1, exact match, etc.).",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Results are reported on established test sets (HumanEval, MBPP, DS-1000, CrossCodeEval). The LeetCode benchmark uses problems from July 2023-January 2024, after the training data cutoff of February 2023.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Extensive breakdowns: per-language results on HumanEval (Table 3), per-library results on DS-1000 (Table 4), per-difficulty on LeetCode (Table 5), per-language on CrossCodeEval (Table 7).",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "No error analysis or failure case discussion. The paper does not examine what types of problems the model fails on or why.",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The FIM ablation (Figure 3) shows that 100% FIM rate results in 'weakest code completion capability,' demonstrating a trade-off. Section 5 notes DeepSeek-Coder-v1.5 has 'a slight decrease in coding performance' compared to the base model (Table 10). The paper also acknowledges a 'substantial performance gap' vs GPT-4.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "For their own models, sizes are specified (1.3B, 6.7B, 33B). For baselines, GPT-3.5-Turbo and GPT-4 are referenced without snapshot dates or API versions. code-cushman-001 is identified by its exact model name, but other baselines (e.g., which CodeLlama checkpoint) are not pinned to a specific version.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "The LeetCode prompt template is provided verbatim: '{problem_description}\\nPlease complete the code below to solve the above problem:\\n```python\\n{code_template}\\n```'. The CoT addition is also quoted. FIM format with sentinel tokens is specified. HumanEval/MBPP use standard zero-shot/few-shot formats.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 2 provides detailed hyperparameters (hidden size, layers, attention heads, batch size, learning rate) for all model sizes. Section 3.4 specifies AdamW optimizer with β1=0.9, β2=0.95, warm-up steps, learning rate schedule. Section 3.7 gives instruction tuning parameters.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used. The models are evaluated directly on benchmark tasks with standard prompting.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 2 provides detailed data preprocessing: filtering rules from StarCoder (line length, alphabetic character ratio, XML filtering, HTML visible text ratio, JSON/YAML size limits), dependency parsing (Algorithm 1), repo-level deduplication (Section 2.3), quality screening with compiler and quality model (Section 2.4). Table 1 shows final data statistics.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The training data is not released. Evaluation outputs (model generations) are not provided for independent verification.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 2 describes data collection in detail: public GitHub repositories created before February 2023, 87 programming languages, filtering rules, deduplication, quality screening. Table 1 provides statistics for all 87 languages.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data sources are public GitHub repositories and standard benchmarks.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Figure 2 and Section 2 document the full pipeline: Data Crawling → Rule Filtering (reduced to 32.8% of original) → Dependency Parsing → Repo-Level Deduplication → Quality Screening. Final statistics in Table 1 (798GB, 603M files).",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "Section 2.1 states 'We collect public repositories created before February 2023 on GitHub.' This establishes a clear training data cutoff.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Section 2.4 describes n-gram decontamination: 'we filter out files containing docstrings, questions, and solutions from sources such as HumanEval, MBPP, GSM8K and MATH' using 10-gram and exact match filtering. Section 4.1 also acknowledges potential contamination in LeetCode results.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "Section 2.4 implements active decontamination for HumanEval, MBPP, GSM8K, and MATH via n-gram filtering. Section 4.1 acknowledges 'the possibility of data contamination cannot be entirely ruled out' for LeetCode and notes higher scores in July-August contests. CrossCodeEval uses repos from March-June 2023, after the training cutoff.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or tokens consumed are reported for any evaluation.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware is mentioned (A100, H800 GPUs) but total GPU hours, training time, or compute budget are not quantified. Training 33B parameters on 2T tokens is massive compute but unquantified.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "No multi-seed results reported. All benchmark results appear to be single-run with greedy decoding.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of evaluation runs is not stated. Greedy decoding implies single-run but this is not explicit.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "The FIM ablation tests 4 configurations but no broader hyperparameter search budget is reported for training or evaluation settings.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": true,
    392           "justification": "The FIM rate selection is justified: 50% PSM chosen 'to achieve a balance between FIM efficiency and code completion proficiency' based on ablation results showing the trade-off (Section 3.1.2, Figure 3).",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": false,
    397           "answer": false,
    398           "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "The authors evaluate their own DeepSeek-Coder models against baselines they re-implemented ('re-implemented the baseline results using the same script'). Self-comparison bias is not acknowledged.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": false,
    410           "justification": "DeepSeek-Coder is trained on 2T tokens while baselines use different amounts (CodeLlama uses 500B). This compute difference is not discussed in relation to performance gains.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "The paper uses HumanEval, MBPP, etc. without questioning whether they measure 'code intelligence.' It briefly notes HumanEval/MBPP 'rely heavily on straightforward programming tasks' but introduces DS-1000 as a fix rather than discussing construct validity broadly.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": false,
    421           "answer": false,
    422           "justification": "No scaffolding is used; models are evaluated directly on benchmarks.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": true,
    430           "justification": "Training data cutoff is February 2023. The LeetCode benchmark uses problems from July 2023-January 2024. CrossCodeEval uses repos from March-June 2023. Both are explicitly noted as post-training-cutoff.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether evaluation setup leaks information. For example, no discussion of whether HumanEval docstrings or function signatures contain answer-leaking information.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of whether training data and test benchmarks share structural similarities (e.g., similar coding patterns, same repositories).",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": true,
    448           "justification": "Section 2.4 describes a concrete n-gram filtering decontamination method: 10-gram matching for long strings, exact match for 3-9 gram strings, applied against HumanEval, MBPP, GSM8K, and MATH test data.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "DeepSeek-Coder-Base 33B achieves state-of-the-art performance among open-source code models across HumanEval, MBPP, DS-1000, and FIM benchmarks.",
    457       "evidence": "Tables 3, 4, and 6 show DeepSeek-Coder-Base 33B outperforming StarCoder-16B, CodeLlama-34B, and CodeGeeX2-6B on all tested benchmarks.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "DeepSeek-Coder-Instruct 33B surpasses GPT-3.5-Turbo on most coding benchmarks.",
    462       "evidence": "Table 3 shows 69.2% vs 64.9% average on multilingual HumanEval; Table 5 shows 27.8% vs 23.3% on LeetCode Contest.",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "DeepSeek-Coder-Base 6.7B achieves competitive performance with CodeLlama-Base 34B despite having 5x fewer parameters.",
    467       "evidence": "Table 3: 44.7% vs 41.0% average HumanEval; Table 4: 30.5% vs 34.3% DS-1000 (slightly below); Table 8: 54.7% vs 62.0% math avg — mixed evidence.",
    468       "supported": "moderate"
    469     },
    470     {
    471       "claim": "Repository-level pre-training significantly boosts cross-file code generation capability.",
    472       "evidence": "Table 7 shows consistent drops when removing repo pre-training (e.g., Java: 17.72% → 16.64% EM with retrieval), though improvements are modest (1-2pp).",
    473       "supported": "moderate"
    474     },
    475     {
    476       "claim": "50% PSM FIM rate is the optimal training policy balancing FIM and code completion performance.",
    477       "evidence": "Figure 3 ablation across 0%, 50% PSM, 100% PSM, and 50% MSP shows 50% PSM as best overall trade-off on HumanEval + HumanEval-FIM + MBPP.",
    478       "supported": "strong"
    479     },
    480     {
    481       "claim": "DeepSeek-Coder-v1.5 maintains coding performance while substantially improving math and natural language understanding.",
    482       "evidence": "Table 10: HumanEval drops 1.5pp (44.7% → 43.2%) while GSM8K improves 19.2pp (43.2% → 62.4%) and MMLU improves 12.5pp (36.6% → 49.1%).",
    483       "supported": "strong"
    484     }
    485   ],
    486   "methodology_tags": [
    487     "benchmark-eval"
    488   ],
    489   "key_findings": "DeepSeek-Coder-Base 33B achieves state-of-the-art performance among open-source code LLMs on HumanEval, MBPP, DS-1000, CrossCodeEval, and program-aided math reasoning benchmarks, trained from scratch on 2T tokens with repository-level organization and 50% PSM Fill-in-the-Middle pre-training. DeepSeek-Coder-Instruct 33B surpasses GPT-3.5-Turbo on multilingual HumanEval (69.2% vs 64.9%) and LeetCode Contest (27.8% vs 23.3%), substantially narrowing the gap to GPT-4. Ablations confirm that repository-level pre-training modestly improves cross-file completion and that FIM rate involves a trade-off between infilling and autocompletion performance. Continuing pre-training from a general-purpose base (DeepSeek-LLM-7B) preserves coding performance while dramatically improving math reasoning (+19pp on GSM8K) and natural language understanding.",
    490   "red_flags": [
    491     {
    492       "flag": "No variance across runs",
    493       "detail": "All benchmark results are single point estimates with no standard deviation, confidence intervals, or multiple evaluation seeds — small differences (1-3pp) cannot be distinguished from noise."
    494     },
    495     {
    496       "flag": "No limitations section",
    497       "detail": "The paper has no dedicated limitations or threats-to-validity section; the only acknowledgment of limitations is one sentence about LeetCode contamination."
    498     },
    499     {
    500       "flag": "Baseline model versions unspecified",
    501       "detail": "GPT-3.5-Turbo and GPT-4 are compared without snapshot dates or API version identifiers, making comparisons unreproducible as these models change over time."
    502     },
    503     {
    504       "flag": "Self-evaluation without independent replication",
    505       "detail": "DeepSeek-AI employees evaluate their own model with no independent verification; conflict of interest is not disclosed and no competing interests statement is provided."
    506     },
    507     {
    508       "flag": "LeetCode contamination unresolved",
    509       "detail": "The paper acknowledges higher scores in July-August contests and that contamination 'cannot be entirely ruled out' but provides no quantitative analysis of contamination impact."
    510     },
    511     {
    512       "flag": "Training data not released",
    513       "detail": "The 798GB training corpus is described but not released, making the most important methodological contribution (data pipeline and quality) unverifiable."
    514     }
    515   ],
    516   "cited_papers": [
    517     {
    518       "title": "Evaluating Large Language Models Trained on Code (Codex / HumanEval)",
    519       "relevance": "Introduces the HumanEval benchmark used as the primary evaluation throughout and establishes the code LLM evaluation paradigm."
    520     },
    521     {
    522       "title": "StarCoder: May the Source Be With You",
    523       "relevance": "Direct competitor and baseline; DeepSeek-Coder adopts and extends StarCoder's data filtering rules and FIM approach."
    524     },
    525     {
    526       "title": "Code Llama: Open Foundation Models for Code",
    527       "relevance": "Primary open-source competitor used as baseline across all evaluations; DeepSeek-Coder claims to outperform it with fewer parameters."
    528     },
    529     {
    530       "title": "Efficient Training of Language Models to Fill in the Middle",
    531       "relevance": "Foundation for the FIM pre-training approach adopted by DeepSeek-Coder; the PSM/SPM modes are built on this work."
    532     },
    533     {
    534       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    535       "relevance": "Key evaluation benchmark for cross-file completion; used to validate the repository-level pre-training contribution."
    536     },
    537     {
    538       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    539       "relevance": "Evaluation benchmark for practical data science code generation across 7 libraries, addressing HumanEval's focus on simple algorithmic tasks."
    540     },
    541     {
    542       "title": "Program Synthesis with Large Language Models (MBPP)",
    543       "relevance": "Second primary evaluation benchmark; used throughout to measure Python code generation quality."
    544     },
    545     {
    546       "title": "The Stack: 3 TB of Permissively Licensed Source Code",
    547       "relevance": "Prior dataset work that DeepSeek-Coder builds upon for data filtering methodology and near-deduplication approach."
    548     }
    549   ],
    550   "engagement_factors": {
    551     "practical_relevance": {
    552       "score": 3,
    553       "justification": "Open-source code models with permissive licensing that developers can immediately use for code completion and generation across 87 languages."
    554     },
    555     "surprise_contrarian": {
    556       "score": 1,
    557       "justification": "The 6.7B matching 34B CodeLlama is mildly surprising but the overall narrative of 'our model beats baselines' is standard."
    558     },
    559     "fear_safety": {
    560       "score": 0,
    561       "justification": "No safety, security, or misuse concerns are discussed or raised by the work."
    562     },
    563     "drama_conflict": {
    564       "score": 1,
    565       "justification": "Implicitly challenges Meta's CodeLlama dominance and claims to beat GPT-3.5, but framed cooperatively rather than confrontationally."
    566     },
    567     "demo_ability": {
    568       "score": 3,
    569       "justification": "Models are publicly available on Hugging Face with an accompanying GitHub repo, loadable with standard pip-installable Hugging Face tooling, and ready to use immediately."
    570     },
    571     "brand_recognition": {
    572       "score": 2,
    573       "justification": "DeepSeek has since become widely recognized in the AI community, though at the time of publication it was still building its reputation compared to OpenAI or Meta."
    574     }
    575   },
    576   "hn_data": {
    577     "threads": [
    578       {
    579         "hn_id": "39142278",
    580         "title": "Python has 189X the dataset size compared to Rust",
    581         "points": 2,
    582         "comments": 4,
    583         "url": "https://news.ycombinator.com/item?id=39142278",
    584         "created_at": "2024-01-26T13:18:01Z"
    585       }
    586     ],
    587     "top_points": 2,
    588     "total_points": 2,
    589     "total_comments": 4
    590   }
    591 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs