scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28871B)
      1 {
      2   "paper": {
      3     "title": "DeepSeek-Coder: When the Large Language Model Meets Programming — The Rise of Code Intelligence",
      4     "authors": [
      5       "Daya Guo",
      6       "Qihao Zhu",
      7       "Dejian Yang",
      8       "Zhenda Xie",
      9       "Kai Dong",
     10       "Wentao Zhang",
     11       "Guanting Chen",
     12       "Xiao Bi",
     13       "Y. Wu",
     14       "Y.K. Li",
     15       "Fuli Luo",
     16       "Yingfei Xiong",
     17       "Wenfeng Liang"
     18     ],
     19     "year": 2024,
     20     "venue": "arXiv",
     21     "arxiv_id": "2401.14196"
     22   },
     23   "scan_version": 3,
     24   "active_modules": [
     25     "experimental_rigor",
     26     "data_leakage"
     27   ],
     28   "methodology_tags": [
     29     "benchmark-eval"
     30   ],
     31   "key_findings": "DeepSeek-Coder is a series of open-source code models (1.3B-33B) trained from scratch on 2T tokens with 87 programming languages and repository-level data organization. DeepSeek-Coder-Base 33B achieves state-of-the-art among open-source code models on HumanEval (56.1%), MBPP (66.0%), DS-1000, and other benchmarks. The 6.7B model matches CodeLlama-34B performance. FIM ablation shows 50% PSM rate optimally balances code completion and fill-in-the-middle capability.",
     32   "checklist": {
     33     "artifacts": {
     34       "code_released": {
     35         "applies": true,
     36         "answer": true,
     37         "justification": "GitHub repository URL provided: https://github.com/deepseek-ai/DeepSeek-Coder. The LeetCode benchmark is also released at the same repository."
     38       },
     39       "data_released": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The training dataset (798GB of source code) is not released. The paper describes the data collection pipeline but does not provide a download link for the training corpus. Evaluation benchmarks used are public (HumanEval, MBPP, DS-1000, CrossCodeEval)."
     43       },
     44       "environment_specified": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper mentions using HAI-LLM framework, NVIDIA A100 and H800 GPUs, and FlashAttention v2, but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions for reproduction."
     48       },
     49       "reproduction_instructions": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No step-by-step reproduction instructions are provided for training or evaluation. The paper describes the methodology at a high level but lacks specific commands or scripts to reproduce results."
     53       }
     54     },
     55     "statistical_methodology": {
     56       "confidence_intervals_or_error_bars": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "All results in Tables 3-10 are reported as point estimates (e.g., '56.1% accuracy') with no confidence intervals or error bars."
     60       },
     61       "significance_tests": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper claims DeepSeek-Coder 'surpasses' and 'outperforms' various baselines based solely on comparing numbers in tables, without any statistical significance tests."
     65       },
     66       "effect_sizes_reported": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The paper reports absolute performance numbers with baseline context, e.g., 'our model has demonstrated a notable improvement of 9% and 11% in accuracy' compared to CodeLlama-Base 34B on HumanEval and MBPP respectively (Section 4.1). Per-benchmark breakdowns provide sufficient context."
     70       },
     71       "sample_size_justified": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No justification for benchmark sample sizes. The LeetCode benchmark uses 180 problems and the paper does not discuss whether this is sufficient for the claims made."
     75       },
     76       "variance_reported": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No variance or standard deviation is reported across experimental runs. All results appear to be single-run greedy decoding numbers. The paper states 'we adopted a greedy search approach' but does not report variance across seeds or runs."
     80       }
     81     },
     82     "evaluation_design": {
     83       "baselines_included": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Comprehensive baselines included: CodeGeeX2, StarCoder, CodeLlama (7B/13B/34B), code-cushman-001, GPT-3.5, GPT-4 across all benchmarks (Tables 3-8)."
     87       },
     88       "baselines_contemporary": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Baselines include contemporaneous models: CodeLlama (2023), StarCoder (2023), GPT-4 (2023), and GPT-3.5-Turbo. These were state-of-the-art at time of writing."
     92       },
     93       "ablation_study": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "FIM ablation study in Section 3.1.2 compares 0% FIM, 50% FIM, 100% FIM, and 50% MSP rates on the 1.3B model, with results in Figure 3. Section 4.3 Table 7 includes ablation of repository-level pre-training ('w/o Repo Pre-training')."
     97       },
     98       "multiple_metrics": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Multiple metrics used: Pass@1 on HumanEval and MBPP, exact match (EM) and edit similarity (ES) on CrossCodeEval, per-library accuracy on DS-1000, and difficulty-stratified results on LeetCode."
    102       },
    103       "human_evaluation": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "No human evaluation of code quality. All evaluation is automated via test case execution (Pass@1, exact match, etc.)."
    107       },
    108       "held_out_test_set": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Results are reported on established test sets (HumanEval, MBPP, DS-1000, CrossCodeEval). The LeetCode benchmark uses problems from July 2023-January 2024, after the training data cutoff of February 2023."
    112       },
    113       "per_category_breakdown": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Extensive breakdowns: per-language results on HumanEval (Table 3), per-library results on DS-1000 (Table 4), per-difficulty on LeetCode (Table 5), per-language on CrossCodeEval (Table 7)."
    117       },
    118       "failure_cases_discussed": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "No error analysis or failure case discussion. The paper does not examine what types of problems the model fails on or why."
    122       },
    123       "negative_results_reported": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The FIM ablation (Figure 3) shows that 100% FIM rate results in 'weakest code completion capability,' demonstrating a trade-off. Section 5 notes DeepSeek-Coder-v1.5 has 'a slight decrease in coding performance' compared to the base model (Table 10). The paper also acknowledges a 'substantial performance gap' vs GPT-4."
    127       }
    128     },
    129     "claims_and_evidence": {
    130       "abstract_claims_supported": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Abstract claims that DeepSeek-Coder 'achieves state-of-the-art performance among open-source code models' and 'surpasses existing closed-source models like Codex and GPT-3.5' are supported by Tables 3-8. The claim about surpassing GPT-3.5 is hedged with 'majority of the evaluation benchmarks.'"
    134       },
    135       "causal_claims_justified": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Causal claims are primarily from ablation studies: FIM rate effect (Figure 3, controlled single-variable manipulation) and repository-level pre-training effect (Table 7, 'w/o Repo Pre-training' comparison). These ablations involve controlled manipulation of single variables."
    139       },
    140       "generalization_bounded": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The title 'When the Large Language Model Meets Programming — The Rise of Code Intelligence' is extremely broad. The subtitle suggests a general claim about code intelligence, while results are on specific benchmarks. The paper does not explicitly bound its generalization claims to the tested benchmarks and languages."
    144       },
    145       "alternative_explanations_discussed": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No discussion of alternative explanations for performance improvements. The paper attributes gains to data quality and training methodology but does not consider whether larger training data volume, different tokenization, or other factors could explain the results."
    149       },
    150       "proxy_outcome_distinction": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The paper frames benchmark performance (Pass@1 on HumanEval, MBPP) as evidence of 'code intelligence' and 'programming' capability without discussing the gap between benchmark performance and real-world coding ability. The title and introduction frame this broadly as 'code intelligence' but measurements are narrow benchmark metrics."
    154       }
    155     },
    156     "setup_transparency": {
    157       "model_versions_specified": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "For their own models, sizes are specified (1.3B, 6.7B, 33B). For baselines, GPT-3.5-Turbo and GPT-4 are referenced without snapshot dates or API versions. code-cushman-001 is named but other baseline versions (e.g., which CodeLlama checkpoint) lack precision."
    161       },
    162       "prompts_provided": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The LeetCode prompt template is provided verbatim: '{problem_description}\\nPlease complete the code below to solve the above problem:\\n```python\\n{code_template}\\n```'. The CoT addition is also quoted. FIM format with sentinel tokens is specified. HumanEval/MBPP use standard zero-shot/few-shot formats."
    166       },
    167       "hyperparameters_reported": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Table 2 provides detailed hyperparameters (hidden size, layers, attention heads, batch size, learning rate) for all model sizes. Section 3.4 specifies AdamW optimizer with β1=0.9, β2=0.95, warm-up steps, learning rate schedule. Section 3.7 gives instruction tuning parameters."
    171       },
    172       "scaffolding_described": {
    173         "applies": false,
    174         "answer": false,
    175         "justification": "No agentic scaffolding is used. The models are evaluated directly on benchmark tasks with standard prompting."
    176       },
    177       "data_preprocessing_documented": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 2 provides detailed data preprocessing: filtering rules from StarCoder (line length, alphabetic character ratio, XML filtering, HTML visible text ratio, JSON/YAML size limits), dependency parsing (Algorithm 1), repo-level deduplication (Section 2.3), quality screening with compiler and quality model (Section 2.4). Table 1 shows final data statistics."
    181       }
    182     },
    183     "limitations_and_scope": {
    184       "limitations_section_present": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "No limitations or threats-to-validity section. The conclusion mentions future work on long-context but does not discuss study limitations."
    188       },
    189       "threats_to_validity_specific": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No systematic threats-to-validity discussion is provided. The paper briefly acknowledges contamination risk for LeetCode (Section 4.1), but this isolated remark does not amount to a specific discussion of threats."
    193       },
    194       "scope_boundaries_stated": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "No explicit scope boundaries are stated. The paper does not discuss what its benchmark results do or do not show about real-world coding ability."
    198       }
    199     },
    200     "data_integrity": {
    201       "raw_data_available": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "The training data is not released. Evaluation outputs (model generations) are not provided for independent verification."
    205       },
    206       "data_collection_described": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Section 2 describes data collection in detail: public GitHub repositories created before February 2023, 87 programming languages, filtering rules, deduplication, quality screening. Table 1 provides statistics for all 87 languages."
    210       },
    211       "recruitment_methods_described": {
    212         "applies": false,
    213         "answer": false,
    214         "justification": "No human participants. Data sources are public GitHub repositories and standard benchmarks."
    215       },
    216       "data_pipeline_documented": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Figure 2 and Section 2 document the full pipeline: Data Crawling → Rule Filtering (reduced to 32.8% of original) → Dependency Parsing → Repo-Level Deduplication → Quality Screening. Final statistics in Table 1 (798GB, 603M files)."
    220       }
    221     },
    222     "conflicts_of_interest": {
    223       "funding_disclosed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No funding disclosure. The paper is from DeepSeek-AI but does not mention any funding source or grants."
    227       },
    228       "affiliations_disclosed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "Author affiliations are listed: DeepSeek-AI and Peking University. The paper is evaluating DeepSeek-AI's own models."
    232       },
    233       "funder_independent_of_outcome": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "DeepSeek-AI is evaluating its own product. The company has a direct commercial interest in demonstrating strong performance for its models. This conflict is not acknowledged."
    237       },
    238       "financial_interests_declared": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No competing interests statement. DeepSeek-AI authors evaluating DeepSeek-Coder models have obvious financial interests in positive results, but this is not declared."
    242       }
    243     },
    244     "contamination": {
    245       "training_cutoff_stated": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "Section 2.1 states 'We collect public repositories created before February 2023 on GitHub.' This establishes a clear training data cutoff."
    249       },
    250       "train_test_overlap_discussed": {
    251         "applies": true,
    252         "answer": true,
    253         "justification": "Section 2.4 describes n-gram decontamination: 'we filter out files containing docstrings, questions, and solutions from sources such as HumanEval, MBPP, GSM8K and MATH' using 10-gram and exact match filtering. Section 4.1 also acknowledges potential contamination in LeetCode results."
    254       },
    255       "benchmark_contamination_addressed": {
    256         "applies": true,
    257         "answer": true,
    258         "justification": "Section 2.4 implements active decontamination for HumanEval, MBPP, GSM8K, and MATH via n-gram filtering. Section 4.1 acknowledges 'the possibility of data contamination cannot be entirely ruled out' for LeetCode and notes higher scores in July-August contests. CrossCodeEval uses repos from March-June 2023, after the training cutoff."
    259       }
    260     },
    261     "human_studies": {
    262       "pre_registered": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "irb_or_ethics_approval": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "demographics_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "inclusion_exclusion_criteria": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "randomization_described": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       },
    287       "blinding_described": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "No human participants in this study."
    291       },
    292       "attrition_reported": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "No human participants in this study."
    296       }
    297     },
    298     "cost_and_practicality": {
    299       "inference_cost_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No inference cost, latency, or tokens consumed are reported for any evaluation."
    303       },
    304       "compute_budget_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "Hardware is mentioned (A100, H800 GPUs), but total GPU hours, training time, and compute budget are not quantified. Training a 33B-parameter model on 2T tokens requires massive compute, yet the paper never states how much."
    308       }
    309     },
    310     "experimental_rigor": {
    311       "seed_sensitivity_reported": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No multi-seed results reported. All benchmark results appear to be single-run with greedy decoding."
    315       },
    316       "number_of_runs_stated": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The number of evaluation runs is not stated. Greedy decoding implies single-run but this is not explicit."
    320       },
    321       "hyperparameter_search_budget": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The FIM ablation tests 4 configurations but no broader hyperparameter search budget is reported for training or evaluation settings."
    325       },
    326       "best_config_selection_justified": {
    327         "applies": true,
    328         "answer": true,
    329         "justification": "The FIM rate selection is justified: 50% PSM chosen 'to achieve a balance between FIM efficiency and code completion proficiency' based on ablation results showing the trade-off (Section 3.1.2, Figure 3)."
    330       },
    331       "multiple_comparison_correction": {
    332         "applies": false,
    333         "answer": false,
    334         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    335       },
    336       "self_comparison_bias_addressed": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "The authors evaluate their own DeepSeek-Coder models against baselines they re-implemented ('re-implemented the baseline results using the same script'). Self-comparison bias is not acknowledged."
    340       },
    341       "compute_budget_vs_performance": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "DeepSeek-Coder is trained on 2T tokens while baselines use different amounts (CodeLlama uses 500B tokens of code). This difference in training data volume, and the compute it implies, is not discussed in relation to performance gains."
    345       },
    346       "benchmark_construct_validity": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "The paper uses HumanEval, MBPP, etc. without questioning whether they measure 'code intelligence.' It briefly notes HumanEval/MBPP 'rely heavily on straightforward programming tasks' but introduces DS-1000 as a fix rather than discussing construct validity broadly."
    350       },
    351       "scaffold_confound_addressed": {
    352         "applies": false,
    353         "answer": false,
    354         "justification": "No scaffolding is used; models are evaluated directly on benchmarks."
    355       }
    356     },
    357     "data_leakage": {
    358       "temporal_leakage_addressed": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "Training data cutoff is February 2023. The LeetCode benchmark uses problems from July 2023-January 2024. CrossCodeEval uses repos from March-June 2023. Both are explicitly noted as post-training-cutoff."
    362       },
    363       "feature_leakage_addressed": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No discussion of whether evaluation setup leaks information. For example, no discussion of whether HumanEval docstrings or function signatures contain answer-leaking information."
    367       },
    368       "non_independence_addressed": {
    369         "applies": true,
    370         "answer": false,
    371         "justification": "No discussion of whether training data and test benchmarks share structural similarities (e.g., similar coding patterns, same repositories)."
    372       },
    373       "leakage_detection_method": {
    374         "applies": true,
    375         "answer": true,
    376         "justification": "Section 2.4 describes a concrete n-gram filtering decontamination method: 10-gram matching for long strings, exact match for 3-9 gram strings, applied against HumanEval, MBPP, GSM8K, and MATH test data."
    377       }
    378     }
    379   },
    380   "claims": [
    381     {
    382       "claim": "DeepSeek-Coder-Base 33B achieves state-of-the-art performance among open-source code models across multiple benchmarks.",
    383       "evidence": "Table 3: 50.3% avg on multilingual HumanEval, 66.0% on MBPP, outperforming CodeLlama-34B by 9% and 11% respectively. Table 4: 40.2% on DS-1000 vs CodeLlama-34B's 34.3%. Table 6: 81.2% mean on FIM tasks.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "DeepSeek-Coder-Base 6.7B surpasses CodeLlama-Base 34B despite having 5x fewer parameters.",
    388       "evidence": "Table 3: 6.7B achieves 44.7% avg on HumanEval vs CodeLlama-34B's 41.0%, and 60.6% on MBPP vs 55.2%.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "DeepSeek-Coder-Instruct 33B surpasses GPT-3.5-Turbo in the majority of evaluation benchmarks.",
    393       "evidence": "Table 3: 69.2% avg HumanEval vs 64.9% for GPT-3.5-Turbo. Table 5: 27.8% on LeetCode vs 23.3%. However, MBPP is 70.0% vs 70.8%, slightly below GPT-3.5.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Repository-level pre-training improves cross-file code completion performance.",
    398       "evidence": "Table 7: DeepSeek-Coder with retrieval outperforms the 'w/o Repo Pre-training' variant on Java (17.72 vs 16.64 EM), TypeScript (14.03 vs 13.23 EM), and C# (16.23 vs 14.48 EM).",
    399       "supported": "moderate"
    400     },
    401     {
    402       "claim": "50% PSM rate provides optimal balance between FIM and code completion capabilities.",
    403       "evidence": "Figure 3: 100% FIM maximizes HumanFIM-Pass@1 but minimizes HumanEval-Pass@1 and MBPP-Pass@1. 50% FIM achieves competitive FIM performance while maintaining strong code completion. 50% PSM outperforms 50% MSP.",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "red_flags": [
    408     {
    409       "flag": "Company evaluating own product",
    410       "detail": "DeepSeek-AI employees evaluate DeepSeek-Coder models. The paper does not acknowledge this conflict of interest. All baselines were re-implemented by the authors using their own scripts, introducing potential self-comparison bias."
    411     },
    412     {
    413       "flag": "No uncertainty quantification",
    414       "detail": "No error bars, confidence intervals, standard deviations, or significance tests across any results. All tables show point estimates only, making it impossible to assess whether differences are meaningful."
    415     },
    416     {
    417       "flag": "No limitations section",
    418       "detail": "The paper has no dedicated limitations or threats-to-validity section, despite being a major system paper with broad claims about 'code intelligence.'"
    419     },
    420     {
    421       "flag": "Training compute advantage not discussed",
    422       "detail": "DeepSeek-Coder is trained on 2T tokens while CodeLlama uses 500B tokens of code. The 4x training data advantage is not discussed when comparing performance, making it unclear whether gains come from architecture/data quality or simply more training data."
    423     },
    424     {
    425       "flag": "Acknowledged contamination risk",
    426       "detail": "The paper itself acknowledges 'the possibility of data contamination cannot be entirely ruled out' for LeetCode results and notes higher scores on July-August 2023 problems closer to the training cutoff."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "Evaluating large language models trained on code",
    432       "authors": [
    433         "M. Chen",
    434         "J. Tworek",
    435         "H. Jun"
    436       ],
    437       "year": 2021,
    438       "arxiv_id": "2107.03374",
    439       "relevance": "Introduces HumanEval benchmark and Codex, foundational for code LLM evaluation."
    440     },
    441     {
    442       "title": "StarCoder: may the source be with you!",
    443       "authors": [
    444         "R. Li",
    445         "L. B. Allal",
    446         "Y. Zi"
    447       ],
    448       "year": 2023,
    449       "arxiv_id": "2305.06161",
    450       "relevance": "Major open-source code model baseline and data filtering methodology used in DeepSeek-Coder."
    451     },
    452     {
    453       "title": "Code Llama: Open Foundation Models for Code",
    454       "authors": [
    455         "B. Roziere",
    456         "J. Gehring",
    457         "F. Gloeckle"
    458       ],
    459       "year": 2023,
    460       "arxiv_id": "2308.12950",
    461       "relevance": "Primary open-source baseline for code generation evaluation across all benchmarks."
    462     },
    463     {
    464       "title": "MultiPL-E: a scalable and polyglot approach to benchmarking neural code generation",
    465       "authors": [
    466         "F. Cassano",
    467         "J. Gouwar",
    468         "D. Nguyen"
    469       ],
    470       "year": 2023,
    471       "relevance": "Provides multilingual extension of HumanEval used for evaluating code generation across 8 languages."
    472     },
    473     {
    474       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    475       "authors": [
    476         "Y. Ding",
    477         "Z. Wang",
    478         "W. U. Ahmad"
    479       ],
    480       "year": 2023,
    481       "relevance": "Benchmark for cross-file code completion, tests repository-level understanding."
    482     },
    483     {
    484       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    485       "authors": [
    486         "Y. Lai",
    487         "C. Li",
    488         "Y. Wang"
    489       ],
    490       "year": 2023,
    491       "relevance": "Practical data science code generation benchmark with 1000 problems across 7 libraries."
    492     },
    493     {
    494       "title": "SantaCoder: don't reach for the stars!",
    495       "authors": [
    496         "L. B. Allal",
    497         "R. Li",
    498         "D. Kocetkov"
    499       ],
    500       "year": 2023,
    501       "arxiv_id": "2301.03988",
    502       "relevance": "Open-source code model baseline and FIM code completion benchmark methodology."
    503     },
    504     {
    505       "title": "Efficient training of language models to fill in the middle",
    506       "authors": [
    507         "M. Bavarian",
    508         "H. Jun",
    509         "N. Tezak"
    510       ],
    511       "year": 2022,
    512       "arxiv_id": "2207.14255",
    513       "relevance": "Introduces FIM pre-training methodology adopted by DeepSeek-Coder."
    514     },
    515     {
    516       "title": "Program synthesis with large language models",
    517       "authors": [
    518         "J. Austin",
    519         "A. Odena",
    520         "M. Nye"
    521       ],
    522       "year": 2021,
    523       "relevance": "Introduces MBPP benchmark widely used for code LLM evaluation."
    524     },
    525     {
    526       "title": "PAL: Program-aided language models",
    527       "authors": [
    528         "L. Gao",
    529         "A. Madaan",
    530         "S. Zhou"
    531       ],
    532       "year": 2023,
    533       "relevance": "Program-aided math reasoning methodology used to evaluate DeepSeek-Coder on mathematical tasks."
    534     },
    535     {
    536       "title": "The Stack: 3 TB of permissively licensed source code",
    537       "authors": [
    538         "D. Kocetkov",
    539         "R. Li",
    540         "L. Jia"
    541       ],
    542       "year": 2022,
    543       "relevance": "Large-scale code dataset with deduplication methodology influencing DeepSeek-Coder's data pipeline."
    544     },
    545     {
    546       "title": "Deduplicating training data makes language models better",
    547       "authors": [
    548         "K. Lee",
    549         "D. Ippolito",
    550         "A. Nystrom"
    551       ],
    552       "year": 2022,
    553       "relevance": "Demonstrates importance of training data deduplication for LLM performance."
    554     }
    555   ],
    556   "engagement_factors": {
    557     "practical_relevance": {
    558       "score": 3,
    559       "justification": "Open-source code models with permissive licensing that developers can immediately use for code completion and generation across 87 languages."
    560     },
    561     "surprise_contrarian": {
    562       "score": 1,
    563       "justification": "The 6.7B model matching CodeLlama-34B is mildly surprising, but the overall narrative of 'our model beats baselines' is standard."
    564     },
    565     "fear_safety": {
    566       "score": 0,
    567       "justification": "No safety, security, or misuse concerns are discussed or raised by the work."
    568     },
    569     "drama_conflict": {
    570       "score": 1,
    571       "justification": "Implicitly challenges Meta's CodeLlama dominance and claims to beat GPT-3.5, but framed cooperatively rather than confrontationally."
    572     },
    573     "demo_ability": {
    574       "score": 3,
    575       "justification": "Models are publicly available on Hugging Face with an accompanying GitHub repo, loadable through standard pip-installable Hugging Face tooling, and ready to use immediately."
    576     },
    577     "brand_recognition": {
    578       "score": 2,
    579       "justification": "DeepSeek became widely recognized in the AI community, though at time of publication it was still building its reputation compared to OpenAI or Meta."
    580     }
    581   }
    582 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs