scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24704B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DeepSeek-Coder: When the Large Language Model Meets Programming - The Rise of Code Intelligence",
      6     "authors": [
      7       "Guo, D.",
      8       "Zhu, Q.",
      9       "Yang, D.",
     10       "Xie, Z.",
     11       "et al."
     12     ],
     13     "year": 2024,
     14     "venue": "arXiv",
     15     "arxiv_id": "2401.14196",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims of SOTA among open-source code models and superiority over GPT-3.5 are backed by benchmark results in Tables 3–8; 2T token training and 16K context are documented in Sections 2–3.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about FIM and repo-level pre-training are supported by ablation experiments: FIM rate ablation (Figure 3) and CrossCodeEval ablation with/without repo pre-training (Table 7).",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper's title claims 'The Rise of Code Intelligence' and the abstract asserts broad superiority, but evaluations are confined to narrow benchmarks (e.g., HumanEval, MBPP, LeetCode); no discussion of generalization limits beyond benchmark settings.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Better performance is attributed to data quality and repo-level training without considering alternative explanations such as sheer data volume advantage, architectural differences, or training compute differences.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Pass@1 on HumanEval is equated with 'code intelligence' throughout; no discussion of whether benchmark performance reflects real-world coding utility or the limitations of these proxies.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; only a brief one-sentence acknowledgment of potential LeetCode contamination in the results section.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity are enumerated; the contamination acknowledgment ('the possibility of data contamination cannot be entirely ruled out') is generic boilerplate.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state what its results do not show (e.g., that HumanEval pass rates do not imply real-world productivity, or that comparisons are snapshot-in-time against specific model versions).",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source is disclosed anywhere in the paper; the acknowledgments section lists individual contributors but no funding body or grant.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly disclosed: DeepSeek-AI and Peking University (Key Lab of HCST), with contact emails provided.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "The majority of authors are DeepSeek-AI employees evaluating their own models; no independent third-party evaluation is performed.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear anywhere in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key technical terms such as Fill-in-the-Middle (PSM/SPM modes), repository-level data construction, and cross-file completion are explained clearly with sufficient specificity for the technical audience.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Four explicit contributions are listed in the introduction: the DeepSeek-Coder model series, repo-level data construction, FIM training analysis, and comprehensive benchmark evaluations.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper situates itself against StarCoder, CodeLlama, CodeGeeX2, GPT-3.5/4, Codex, and related work including FIM training (Bavarian et al.) and deduplication (Lee et al., Kocetkov et al.).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Source code and models are released on GitHub at https://github.com/deepseek-ai/DeepSeek-Coder, including the LeetCode evaluation benchmark.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Training data (798 GB proprietary crawl from GitHub) is not released; evaluation uses public benchmarks but the custom training corpus required to reproduce the model is unavailable.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "The HAI-LLM framework and GPU cluster (A100/H800) are described but no requirements.txt, Dockerfile, or pinned dependency list is provided for reproducing training or evaluation.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step instructions for reproducing training or evaluation are included; the paper describes the pipeline at a high level but not with sufficient detail to follow without guessing.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All benchmark results are reported as single point estimates with no confidence intervals, error bars, or standard deviations across multiple runs.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied to any comparative claim despite multiple model comparisons across benchmarks with small evaluation set sizes (e.g., HumanEval n=164).",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Percentage point differences are reported throughout (e.g., '9% and 11% improvement over CodeLlama-Base 34B') with baseline context provided in all comparison tables.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Benchmark sizes (HumanEval n=164, MBPP n=500) are not justified or discussed for statistical adequacy; no power analysis is mentioned.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or multiple-run statistics are reported; all results appear to be single-run point estimates.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Strong baselines included across all tasks: CodeGeeX2, StarCoder, CodeLlama (7B/13B/34B), GPT-3.5-Turbo, GPT-4-Turbo, WizardCoder, Phind-CodeLlama.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include CodeLlama (2023), StarCoder (2023), GPT-3.5/4-Turbo — all contemporary at the time of writing (January 2024).",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Two ablations are presented: FIM rate comparison (0%, 50%, 100%, MSP) in Figure 3, and repo-level pre-training ablation ('w/o Repo Pre-training') in Table 7.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Multiple metrics used: Pass@1, exact match (EM), edit similarity (ES), per-difficulty breakdown, and per-library breakdown across diverse benchmark suites.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "Automated test-case-based evaluation is standard for code generation benchmarks; human evaluation of code correctness is not applicable here.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Held-out sets used: CrossCodeEval (repositories from March–June 2023, after training cutoff), LeetCode Contest (July 2023–January 2024), and standard benchmarks with withheld solutions.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Per-language breakdowns in Tables 3 and 6, per-library breakdown in Table 4 (DS-1000), per-difficulty breakdown in Table 5 (LeetCode Easy/Medium/Hard).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Appendix only shows successful interaction examples (snake game, database); no failure cases or error analysis is presented.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The paper reports that 100% FIM rate hurts code completion (Figure 3) and that DeepSeek-Coder-v1.5 shows slight coding regression vs. original 6.7B (Table 10: 43.2% vs 44.7% HumanEval).",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "DeepSeek models are specified by parameter count, but GPT-3.5-Turbo and GPT-4-Turbo are named without snapshot dates — critical given that OpenAI models change over time.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Only the LeetCode evaluation template is provided; prompts for HumanEval, MBPP, DS-1000, CrossCodeEval, and math reasoning benchmarks are not given.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Table 2 provides hidden size, layers, attention heads, batch size, learning rates; AdamW with β1=0.9, β2=0.95, FIM rate 0.5, warm-up steps, and LR scheduling are described.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding is used; this is direct model evaluation on benchmarks.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 2 and Figure 2 document the full pipeline: crawling, rule filtering, dependency parsing (Algorithm 1), repo-level deduplication, quality screening, and n-gram decontamination.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The 798 GB training corpus is proprietary and not publicly released; no raw training data is available for independent verification.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 2 describes GitHub crawling scope (pre-February 2023), 87-language selection, filter rules, dependency parsing, deduplication, and quality screening with statistics in Table 1.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; evaluation uses automated benchmark testing against fixed test cases.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Figure 2 shows the full data pipeline (crawl → filter → dependency parse → dedup → quality screen), and each step is described in dedicated subsections with specific algorithms and thresholds.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": true,
    296           "justification": "Explicitly stated: 'We collect public repositories created before February 2023 on GitHub.'",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": true,
    302           "justification": "Section 2.4 describes n-gram decontamination filtering HumanEval, MBPP, GSM8K, and MATH examples; CrossCodeEval's post-February 2023 construction is explicitly noted as preventing overlap.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": true,
    308           "justification": "N-gram filtering (10-gram or 3-gram exact match) applied for HumanEval/MBPP/GSM8K/MATH; LeetCode contamination is explicitly acknowledged as unresolvable and flagged for the community.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference latency, memory requirements, or cost estimates are reported despite releasing models for public deployment.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "GPU cluster hardware is described (A100/H800) but total training compute (GPU-hours, FLOPs, or cost) is not reported for any model size.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "DeepSeek-Coder-Base 33B achieves state-of-the-art performance among open-source code models on HumanEval (50.3% avg across 8 languages) and MBPP (66.0%).",
    375       "evidence": "Table 3 shows DeepSeek-Coder-Base 33B outperforming CodeLlama-34B (41.0% avg, 55.2% MBPP) and all other listed open-source models.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "DeepSeek-Coder-Instruct 33B outperforms GPT-3.5-Turbo on code generation benchmarks.",
    380       "evidence": "Table 3 shows Instruct 33B at 69.2% avg vs GPT-3.5-Turbo 64.9%; Table 5 shows Instruct 33B at 27.8% vs GPT-3.5-Turbo 23.3% on LeetCode Contest.",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "DeepSeek-Coder-Base 6.7B matches or exceeds CodeLlama-Base 34B despite having 5x fewer parameters.",
    385       "evidence": "Table 3: DeepSeek 6.7B at 44.7% avg and 60.6% MBPP vs CodeLlama 34B at 41.0% avg and 55.2% MBPP.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Repository-level pre-training improves cross-file code completion performance.",
    390       "evidence": "Table 7 ablation: 'w/o Repo Pre-training' shows performance drops on Java (16.64% vs 17.72% EM), TypeScript (13.23% vs 14.03% EM), and C# (14.48% vs 16.23% EM).",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "FIM training at 50% PSM rate optimally balances code completion (FIM) and code generation performance.",
    395       "evidence": "Figure 3 ablation: 100% FIM rate maximizes HumanEval-FIM but minimizes HumanEval and MBPP pass@1; 50% PSM outperforms MSP strategy.",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "Continuing pre-training from a general LLM (DeepSeek-LLM-7B) significantly improves math and natural language capabilities of DeepSeek-Coder-v1.5.",
    400       "evidence": "Table 10: GSM8K improves from 43.2% to 62.4%, MATH from 19.2% to 24.7%, MMLU from 36.6% to 49.1% at a modest cost of ~1.5pp HumanEval regression.",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval"
    406   ],
    407   "key_findings": "DeepSeek-Coder introduces a family of open-source code LLMs (1.3B–33B) trained from scratch on 2 trillion tokens with repository-level data organization and Fill-in-the-Middle training, achieving state-of-the-art performance among open-source models across HumanEval, MBPP, DS-1000, CrossCodeEval, and LeetCode Contest benchmarks. The 33B instruct variant surpasses GPT-3.5-Turbo on most code tasks. Ablations show that 50% PSM-mode FIM rate optimally balances completion and generation ability, and that repo-level pre-training provides modest but consistent improvements on cross-file completion. Continued pre-training from a general LLM substantially improves mathematical and natural language capabilities at minor cost to code performance.",
    408   "red_flags": [
    409     {
    410       "flag": "Self-evaluation",
    411       "detail": "All evaluations are conducted by DeepSeek-AI employees on their own models; no independent third-party replication is reported."
    412     },
    413     {
    414       "flag": "No statistical significance testing",
    415       "detail": "All comparative claims are based on point estimates with no confidence intervals, error bars, or significance tests, despite small benchmark sizes (HumanEval n=164)."
    416     },
    417     {
    418       "flag": "GPT baselines not version-pinned",
    419       "detail": "GPT-3.5-Turbo and GPT-4-Turbo are referenced without snapshot dates; these models changed significantly during 2023–2024, making comparisons unreliable."
    420     },
    421     {
    422       "flag": "Training data not released",
    423       "detail": "The 798 GB proprietary training corpus is not publicly available, making it impossible to fully reproduce the work or verify data quality claims."
    424     },
    425     {
    426       "flag": "LeetCode contamination unresolved",
    427       "detail": "The paper acknowledges 'the possibility of data contamination cannot be entirely ruled out' for LeetCode, and notes higher scores in July/August contests, but does not resolve or quantify the contamination."
    428     },
    429     {
    430       "flag": "No limitations section",
    431       "detail": "Despite making strong comparative claims (SOTA, surpassing GPT-3.5), there is no dedicated limitations or threats-to-validity section."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    437       "relevance": "Primary benchmark used throughout; introduces pass@k evaluation for code generation models."
    438     },
    439     {
    440       "title": "StarCoder: may the source be with you!",
    441       "relevance": "Key open-source baseline model and data source (StarCoder data pipeline) that DeepSeek-Coder directly competes with and builds upon."
    442     },
    443     {
    444       "title": "Code Llama: Open Foundation Models for Code",
    445       "relevance": "Primary open-source baseline across all benchmarks; DeepSeek-Coder's 6.7B model is claimed to match CodeLlama-34B."
    446     },
    447     {
    448       "title": "CrossCodeEval: A Diverse and Multilingual Benchmark for Cross-File Code Completion",
    449       "relevance": "Used to evaluate the novel repo-level pre-training contribution; provides contamination-free evaluation (post-February 2023)."
    450     },
    451     {
    452       "title": "DS-1000: A Natural and Reliable Benchmark for Data Science Code Generation",
    453       "relevance": "More realistic benchmark than HumanEval for practical data science coding tasks across 7 libraries."
    454     },
    455     {
    456       "title": "Efficient Training of Language Models to Fill in the Middle (FIM)",
    457       "relevance": "Foundation for the Fill-in-the-Middle training objective that is a core contribution of DeepSeek-Coder."
    458     },
    459     {
    460       "title": "The Stack: 3 TB of Permissively Licensed Source Code",
    461       "relevance": "Data source and deduplication methodology that DeepSeek-Coder's data pipeline extends with repo-level deduplication."
    462     },
    463     {
    464       "title": "Program Synthesis with Large Language Models (MBPP)",
    465       "relevance": "Secondary code generation benchmark used throughout all comparisons."
    466     }
    467   ],
    468   "engagement_factors": {
    469     "practical_relevance": {
    470       "score": 3,
    471       "justification": "Models released open-source with a permissive license, directly usable by practitioners as a drop-in replacement for closed-source code assistants."
    472     },
    473     "surprise_contrarian": {
    474       "score": 2,
    475       "justification": "An open-source model matching or beating GPT-3.5-Turbo on code was surprising at the time of publication (January 2024)."
    476     },
    477     "fear_safety": {
    478       "score": 0,
    479       "justification": "No AI safety or risk concerns raised; paper is purely a technical model introduction."
    480     },
    481     "drama_conflict": {
    482       "score": 1,
    483       "justification": "Mild competitive framing against OpenAI's closed-source models, but no controversy or confrontational claims."
    484     },
    485     "demo_ability": {
    486       "score": 3,
    487       "justification": "Models are publicly available on GitHub and HuggingFace; anyone can run them immediately."
    488     },
    489     "brand_recognition": {
    490       "score": 2,
    491       "justification": "DeepSeek-AI has become well-known in the open-source LLM community; Peking University affiliation adds academic credibility."
    492     }
    493   },
    494   "hn_data": {
    495     "threads": [
    496       {
    497         "hn_id": "39142278",
    498         "title": "Python has 189X the dataset size compared to Rust",
    499         "points": 2,
    500         "comments": 4,
    501         "url": "https://news.ycombinator.com/item?id=39142278",
    502         "created_at": "2024-01-26T13:18:01Z"
    503       }
    504     ],
    505     "top_points": 2,
    506     "total_points": 2,
    507     "total_comments": 4
    508   }
    509 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs