scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29041B)
      1 {
      2   "paper": {
      3     "title": "DeepSeek-Coder-V2: Breaking the Barrier of Closed-Source Models in Code Intelligence",
      4     "authors": [
      5       "Qihao Zhu",
      6       "Daya Guo",
      7       "Zhihong Shao",
      8       "Dejian Yang",
      9       "Peiyi Wang",
     10       "Runxin Xu",
     11       "Y. Wu",
     12       "Yukun Li",
     13       "Huazuo Gao",
     14       "Shirong Ma",
     15       "Wangding Zeng",
     16       "Xiao Bi",
     17       "Zihui Gu",
     18       "Hanwei Xu",
     19       "Damai Dai",
     20       "Kai Dong",
     21       "Liyue Zhang",
     22       "Yishi Piao",
     23       "Zhibin Gou",
     24       "Zhenda Xie",
     25       "Zhewen Hao",
     26       "Bingxuan Wang",
     27       "Junxiao Song",
     28       "Deli Chen",
     29       "Xin Xie",
     30       "Kang Guan",
     31       "Yuxiang You",
     32       "Aixin Liu",
     33       "Qiushi Du",
     34       "Wenjun Gao",
     35       "Xuan Lu",
     36       "Qinyu Chen",
     37       "Yaohui Wang",
     38       "Chengqi Deng",
     39       "Jiashi Li",
     40       "Chenggang Zhao",
     41       "Chong Ruan",
     42       "Fuli Luo",
     43       "Wenfeng Liang"
     44     ],
     45     "year": 2024,
     46     "venue": "arXiv",
     47     "arxiv_id": "2406.11931"
     48   },
     49   "scan_version": 3,
     50   "active_modules": [
     51     "experimental_rigor",
     52     "data_leakage"
     53   ],
     54   "methodology_tags": [
     55     "benchmark-eval"
     56   ],
     57   "key_findings": "DeepSeek-Coder-V2, a 236B MoE model (21B active params), achieves performance comparable to GPT-4 Turbo on code and math benchmarks including 90.2% on HumanEval, 76.2% on MBPP+, 75.7% on MATH, and 43.4% on LiveCodeBench. The 16B variant (2.4B active) outperforms larger dense models like DeepSeek-Coder-33B. The model supports 338 programming languages and 128K context length. A 1B ablation study validates the new code corpus improves over the original DeepSeek-Coder corpus.",
     58   "checklist": {
     59     "artifacts": {
     60       "code_released": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "GitHub repository URL provided in the paper header: https://github.com/deepseek-ai/DeepSeek-Coder-V2. Model weights released under permissive license."
     64       },
     65       "data_released": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "The training data (6T tokens from GitHub, CommonCrawl) is not released. Standard evaluation benchmarks (HumanEval, MBPP, etc.) are public, but the proprietary training corpus is not available."
     69       },
     70       "environment_specified": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No environment specifications, requirements.txt, or dependency details are provided in the paper."
     74       },
     75       "reproduction_instructions": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No step-by-step reproduction instructions for the benchmark evaluations are provided in the paper. The paper mentions using 'identical scripts and environments' for baselines but does not provide these."
     79       }
     80     },
     81     "statistical_methodology": {
     82       "confidence_intervals_or_error_bars": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "All results are reported as single point estimates (e.g., '90.2% on HumanEval') with no confidence intervals or error bars."
     86       },
     87       "significance_tests": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "The paper claims superiority over various models based solely on comparing raw numbers. No statistical significance tests are performed."
     91       },
     92       "effect_sizes_reported": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Results are reported with baseline context throughout — e.g., ablation study shows improvement from 30.5% to 37.2% on HumanEval. All comparison tables show both the proposed model and baselines' raw scores."
     96       },
     97       "sample_size_justified": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No justification for benchmark sizes or number of evaluation examples. Standard benchmarks are used (HumanEval has 164 problems) but no discussion of whether these are sufficient for the claims."
    101       },
    102       "variance_reported": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run greedy decoding (the paper states 'greedy search strategy'). The one exception is AIME where maj@64 is mentioned but this is a different metric, not variance."
    106       }
    107     },
    108     "evaluation_design": {
    109       "baselines_included": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Extensive baseline comparisons including CodeLlama, StarCoder, StarCoder2, DeepSeek-Coder, Codestral, Llama3, GPT-4 variants, Claude 3 Opus, and Gemini 1.5 Pro (Tables 3-10)."
    113       },
    114       "baselines_contemporary": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Baselines include GPT-4o-0513, Claude 3 Opus, Gemini 1.5 Pro, Llama3-70B, and Codestral — all contemporary at time of publication (June 2024)."
    118       },
    119       "ablation_study": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Table 1 provides an ablation study comparing the new code corpus vs. old DeepSeek-Coder corpus using a 1B parameter model. Figure 3 compares reward model signal vs. compiler signal for RL training."
    123       },
    124       "multiple_metrics": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Multiple benchmarks and metrics used: HumanEval, MBPP+, LiveCodeBench, USACO, SWE-bench, Aider, Defects4J, CruxEval, RepoBench, FIM tasks, GSM8K, MATH, AIME, MMLU, BBH, Arena-Hard, MT-Bench, etc."
    128       },
    129       "human_evaluation": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "No human evaluation of model outputs. All evaluations are automated (test case pass/fail, exact match, GPT-4 as judge for Arena-Hard/MT-Bench). The paper claims code quality but relies entirely on automated metrics."
    133       },
    134       "held_out_test_set": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Standard held-out benchmarks used. For RepoBench, the paper explicitly uses only December 2023 data to avoid overlap with training data (cutoff November 2023). LiveCodeBench uses problems from Dec 2023-June 2024."
    138       },
    139       "per_category_breakdown": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Detailed per-language breakdowns across 14 programming languages in Table 3, per-difficulty breakdowns for LiveCodeBench (Easy/Medium/Hard in Table 4), per-context-length breakdowns for RepoBench (Table 5)."
    143       },
    144       "failure_cases_discussed": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The conclusion discusses failure modes: 'there is still a significant gap in instruction-following capabilities compared to current state-of-the-art models like GPT-4 Turbo. This gap leads to poor performance in complex scenarios and tasks such as those in SWEbench.'"
    148       },
    149       "negative_results_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Several negative results reported: DeepSeek-Coder-V2-Lite drops on knowledge-intensive benchmarks like TriviaQA (59.5% vs 65.2% for DeepSeek-V2-Lite). Performance gap on CruxEval vs closed-source models acknowledged. Instability during training with exponential normalization noted."
    153       }
    154     },
    155     "claims_and_evidence": {
    156       "abstract_claims_supported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The abstract claims 'performance comparable to GPT4-Turbo in code-specific tasks' and this is supported by Tables 3-9 showing competitive or superior results on most benchmarks."
    160       },
    161       "causal_claims_justified": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Causal claims are limited and backed by controlled experiments. The corpus ablation (Table 1) uses the same 1B model architecture with different corpora. The reward model vs. compiler signal comparison (Figure 3) is a controlled comparison. Claims like 'continued pre-training enhances coding capabilities' are supported by before/after comparisons on the same architecture."
    165       },
    166       "generalization_bounded": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The title claims 'Breaking the Barrier of Closed-Source Models in Code Intelligence' — a very broad framing. While results show competitive performance on specific benchmarks, the paper acknowledges 'a significant gap in instruction-following capabilities' and poor SWE-bench performance. The title overclaims relative to the evidence."
    170       },
    171       "alternative_explanations_discussed": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No discussion of alternative explanations for the results. For example, improved performance could partly be due to benchmark contamination (training data includes GitHub repos that may contain benchmark solutions), but this is not discussed."
    175       },
    176       "proxy_outcome_distinction": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper frames benchmark performance (HumanEval, MBPP pass@1) as 'code intelligence' without discussing what code intelligence actually entails or whether these benchmarks adequately measure it. The gap between 'pass@1 on function-level benchmarks' and 'code intelligence' is not acknowledged."
    180       }
    181     },
    182     "setup_transparency": {
    183       "model_versions_specified": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Specific model versions given for baselines: GPT-4-1106, GPT-4-Turbo-0409, GPT-4o-0513. Own model sizes (16B/236B with active params 2.4B/21B) clearly specified. Table 2 details training settings."
    187       },
    188       "prompts_provided": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The HumanEval prompt template is provided in footnote 4. Math evaluation prompt is given in footnote 9 ('Please reason step by step, and put your final answer within \\boxed{}'). FIM format tokens shown in Section 3.1."
    192       },
    193       "hyperparameters_reported": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Training hyperparameters reported in Section 3.3: AdamW with β1=0.9, β2=0.95, weight decay 0.1, cosine decay with 2000 warmup steps. SFT: lr 5e-6, batch 1M tokens, 1B tokens total. YARN hyperparameters: scale 40, α=1, β=32. Evaluation uses greedy search."
    197       },
    198       "scaffolding_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No agentic scaffolding is used. The model is evaluated directly on benchmarks via standard prompting."
    202       },
    203       "data_preprocessing_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Section 2 provides detailed data filtering rules: average line length <100, max line length <1000, >25% alphabetic chars, XML filtering, HTML visible text ratio >20%, JSON/YAML 50-5000 chars. Near-deduplication applied. Three-iteration CommonCrawl pipeline with fastText classifier described."
    207       }
    208     },
    209     "limitations_and_scope": {
    210       "limitations_section_present": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No dedicated limitations section. The conclusion briefly mentions the instruction-following gap but this is more of a future work direction than a limitations discussion."
    214       },
    215       "threats_to_validity_specific": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No threats to validity discussed. No mention of evaluation methodology concerns, benchmark contamination risks, or limitations of the comparison methodology."
    219       },
    220       "scope_boundaries_stated": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No explicit scope boundaries stated. The paper does not state what the results do NOT show or what settings are excluded from the claims."
    224       }
    225     },
    226     "data_integrity": {
    227       "raw_data_available": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Raw evaluation outputs are not available. Only aggregate scores are reported. Training data is not released."
    231       },
    232       "data_collection_described": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 2 describes data collection in detail: GitHub repos created before November 2023, plus a CommonCrawl pipeline that uses fastText classification with a seed corpus from StackOverflow/PyTorch docs/StackExchange and is run for three iterations. Combined, the GitHub and CommonCrawl sources yield 1,170B code-related tokens; the CommonCrawl pipeline additionally yields 221B math-related tokens."
    236       },
    237       "recruitment_methods_described": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No human participants. Data sources are standard benchmarks and public code repositories."
    241       },
    242       "data_pipeline_documented": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "The data pipeline is documented: filtering rules → near-deduplication → 821B code + 185B code-related text from GitHub; fastText seed corpus → iterative CommonCrawl collection → 70B code-related + 221B math-related tokens; plus 94B additional GitHub source code from two iterations. Final composition: 60% code, 10% math, 30% NL."
    246       }
    247     },
    248     "conflicts_of_interest": {
    249       "funding_disclosed": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "No funding or acknowledgments section. DeepSeek-AI is the sole affiliation but no funding sources are disclosed."
    253       },
    254       "affiliations_disclosed": {
    255         "applies": true,
    256         "answer": true,
    257         "justification": "All authors are listed under 'DeepSeek-AI'. The affiliation is clear."
    258       },
    259       "funder_independent_of_outcome": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "DeepSeek-AI is both the developer and evaluator of the model. The company has a direct commercial interest in the model performing well on benchmarks."
    263       },
    264       "financial_interests_declared": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No competing interests statement. Authors presumably have financial interest in DeepSeek-AI's products but this is not declared."
    268       }
    269     },
    270     "contamination": {
    271       "training_cutoff_stated": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Training data cutoff stated as 'before November 2023' for GitHub data (Section 2). The paper uses this to justify temporal splits for RepoBench evaluation."
    275       },
    276       "train_test_overlap_discussed": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "For RepoBench, the paper explicitly addresses overlap: 'we only use data from December 2023' to avoid data present in pre-training. For LiveCodeBench, the paper uses the subset from Dec 2023 to June 2024 since 'the cut-off of the training data is before November 2023.'"
    280       },
    281       "benchmark_contamination_addressed": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "While temporal splits are used for RepoBench and LiveCodeBench, the paper does not address contamination for older benchmarks like HumanEval (2021), MBPP (2021), GSM8K (2021), or MATH (2021), all of which were available online well before the November 2023 training cutoff. These are among the paper's most prominent results."
    285       }
    286     },
    287     "human_studies": {
    288       "pre_registered": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "No human participants in this study."
    292       },
    293       "irb_or_ethics_approval": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "No human participants in this study."
    297       },
    298       "demographics_reported": {
    299         "applies": false,
    300         "answer": false,
    301         "justification": "No human participants in this study."
    302       },
    303       "inclusion_exclusion_criteria": {
    304         "applies": false,
    305         "answer": false,
    306         "justification": "No human participants in this study."
    307       },
    308       "randomization_described": {
    309         "applies": false,
    310         "answer": false,
    311         "justification": "No human participants in this study."
    312       },
    313       "blinding_described": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No human participants in this study."
    317       },
    318       "attrition_reported": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No human participants in this study."
    322       }
    323     },
    324     "cost_and_practicality": {
    325       "inference_cost_reported": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "No inference cost or latency reported despite the model being 236B parameters. Active parameter counts (2.4B, 21B) are given but no wall-clock time or API cost data."
    329       },
    330       "compute_budget_stated": {
    331         "applies": true,
    332         "answer": false,
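    333         "justification": "Total training compute is not quantified in GPU hours or cost. The paper states the model was exposed to 10.2T training tokens in total (the 6T-token continued pre-training corpus on top of the tokens already consumed by the DeepSeek-V2 checkpoint it starts from) but does not report the hardware or time required."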
    334       }
    335     },
    336     "experimental_rigor": {
    337       "seed_sensitivity_reported": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "No seed sensitivity analysis. Results appear to be single-run evaluations using greedy decoding."
    341       },
    342       "number_of_runs_stated": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "Number of runs not stated for any benchmark evaluation. Greedy decoding implies single-run but this is not explicitly confirmed for all benchmarks."
    346       },
    347       "hyperparameter_search_budget": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No hyperparameter search budget reported for training or evaluation. Learning rates and other hyperparameters are stated but not how they were selected."
    351       },
    352       "best_config_selection_justified": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The paper states hyperparameters are 'consistent with DeepSeek V2 methodology' but does not justify the selection or explain if any search was performed for the new training phase."
    356       },
    357       "multiple_comparison_correction": {
    358         "applies": false,
    359         "answer": false,
    360         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    361       },
    362       "self_comparison_bias_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "DeepSeek-AI is evaluating its own model against competitors. The paper claims to use 'identical scripts and environments' for baselines but does not acknowledge the inherent bias of self-evaluation or discuss whether independent evaluation was performed."
    366       },
    367       "compute_budget_vs_performance": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No performance vs. compute comparison. The 236B model uses 21B active params vs. competitors' full dense models but inference FLOPs are not compared. The paper notes the smaller active params as an advantage without quantifying."
    371       },
    372       "benchmark_construct_validity": {
    373         "applies": true,
    374         "answer": false,
    375         "justification": "The paper uses many benchmarks but does not discuss whether they actually measure 'code intelligence' as claimed. No discussion of construct validity for any benchmark."
    376       },
    377       "scaffold_confound_addressed": {
    378         "applies": false,
    379         "answer": false,
    380         "justification": "No scaffolding is used in the evaluations. Models are evaluated directly via standard prompting."
    381       }
    382     },
    383     "data_leakage": {
    384       "temporal_leakage_addressed": {
    385         "applies": true,
    386         "answer": true,
    387         "justification": "Temporal leakage addressed for RepoBench (uses only December 2023 data) and LiveCodeBench (uses problems from Dec 2023-June 2024, after training cutoff). However, not addressed for older benchmarks."
    388       },
    389       "feature_leakage_addressed": {
    390         "applies": true,
    391         "answer": false,
    392         "justification": "No discussion of feature leakage in the evaluation setup."
    393       },
    394       "non_independence_addressed": {
    395         "applies": true,
    396         "answer": false,
    397         "justification": "No discussion of whether training data and benchmark problems share structural similarities (e.g., HumanEval problems sourced from common programming patterns that appear in GitHub training data)."
    398       },
    399       "leakage_detection_method": {
    400         "applies": true,
    401         "answer": false,
    402         "justification": "No concrete leakage detection method applied. Temporal splits are used for two benchmarks but no membership inference, canary strings, or n-gram overlap analysis performed."
    403       }
    404     }
    405   },
    406   "claims": [
    407     {
    408       "claim": "DeepSeek-Coder-V2 achieves 90.2% on HumanEval, outperforming GPT-4 Turbo (88.2%) and rivaling GPT-4o (91.0%)",
    409       "evidence": "Table 3 shows Python HumanEval scores. Greedy decoding used for all models with identical scripts and environments.",
    410       "supported": "moderate"
    411     },
    412     {
    413       "claim": "DeepSeek-Coder-V2 achieves 75.7% on MATH, nearly matching GPT-4o's 76.6%",
    414       "evidence": "Table 9, zero-shot chain-of-thought evaluation as described in footnote 9.",
    415       "supported": "moderate"
    416     },
    417     {
    418       "claim": "The new code corpus improves over the DeepSeek-Coder corpus by 6.7 percentage points on HumanEval and 9.4 percentage points on MBPP",
    419       "evidence": "Table 1 ablation study with 1B parameter model: old corpus 30.5%/44.6% vs new corpus 37.2%/54.0% at 2T tokens.",
    420       "supported": "strong"
    421     },
    422     {
    423       "claim": "DeepSeek-Coder-V2 is the first open-source model to surpass 10% on SWE-Bench",
    424       "evidence": "Table 7 shows 12.7% on SWE-Bench. Other open-source models score 0.0% (DS-Coder-Instruct, DS-Coder-V2-Lite) or 2.7% (Codestral).",
    425       "supported": "moderate"
    426     },
    427     {
    428       "claim": "Reward model signal outperforms raw compiler signal for RL training",
    429       "evidence": "Figure 3 shows reward model signal achieving higher Pass@1 on both LeetCode and LeetCode-zh compared to compiler signal and SFT baseline.",
    430       "supported": "moderate"
    431     },
    432     {
    433       "claim": "DeepSeek-Coder-V2 maintains comparable general language performance to DeepSeek-V2",
    434       "evidence": "Table 10 shows comparable or improved scores on BBH, MMLU, ARC, C-Eval, CMMLU. Some drops on TriviaQA (82.3 vs 86.7) and NaturalQuestions (47.5 vs 53.4).",
    435       "supported": "moderate"
    436     }
    437   ],
    438   "red_flags": [
    439     {
    440       "flag": "Company evaluating its own product",
    441       "detail": "DeepSeek-AI evaluates DeepSeek-Coder-V2 against competitors. While baselines are reportedly run with identical scripts, there is inherent bias in a company benchmarking its own model — they have full control over prompt templates, evaluation harness details, and which benchmarks to report."
    442     },
    443     {
    444       "flag": "No uncertainty quantification",
    445       "detail": "All results are single-point estimates with no error bars, confidence intervals, or multi-run variance. Given that benchmark scores can vary significantly across runs, prompts, and evaluation configurations, the reported differences may not be statistically meaningful."
    446     },
    447     {
    448       "flag": "Contamination risk for primary benchmarks",
    449       "detail": "HumanEval (2021), MBPP (2021), GSM8K (2021), and MATH (2021) were all published years before the November 2023 training cutoff. The paper addresses contamination only for RepoBench and LiveCodeBench but not for these older, headline benchmarks where contamination risk is highest."
    450     },
    451     {
    452       "flag": "No limitations section",
    453       "detail": "The paper has no limitations or threats-to-validity section. The conclusion's acknowledgment of instruction-following gaps reads more as future work than as an honest limitations analysis."
    454     },
    455     {
    456       "flag": "Overclaiming title",
    457       "detail": "'Breaking the Barrier of Closed-Source Models in Code Intelligence' is a broad claim. The model is competitive on specific benchmarks but falls significantly short on SWE-bench (12.7% vs 26.7% for GPT-4o) and instruction-following tasks, which are arguably more representative of real-world 'code intelligence.'"
    458     }
    459   ],
    460   "cited_papers": [
    461     {
    462       "title": "Evaluating large language models trained on code",
    463       "authors": [
    464         "M. Chen",
    465         "J. Tworek",
    466         "H. Jun"
    467       ],
    468       "year": 2021,
    469       "arxiv_id": "2107.03374",
    470       "relevance": "Introduces HumanEval benchmark, foundational for code generation evaluation."
    471     },
    472     {
    473       "title": "SWE-bench: Can language models resolve real-world GitHub issues?",
    474       "authors": [
    475         "C. E. Jimenez",
    476         "J. Yang",
    477         "A. Wettig"
    478       ],
    479       "year": 2023,
    480       "arxiv_id": "2310.06770",
    481       "relevance": "Real-world software engineering benchmark used to evaluate model capabilities."
    482     },
    483     {
    484       "title": "LiveCodeBench: Holistic and contamination free evaluation of large language models for code",
    485       "authors": [
    486         "N. Jain",
    487         "K. Han",
    488         "A. Gu"
    489       ],
    490       "year": 2024,
    491       "relevance": "Contamination-free code benchmark using temporally-split competitive programming problems."
    492     },
    493     {
    494       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    495       "authors": [
    496         "J. Liu",
    497         "C. S. Xia",
    498         "Y. Wang",
    499         "L. Zhang"
    500       ],
    501       "year": 2023,
    502       "relevance": "EvalPlus/MBPP+ evaluation framework used in this paper for more rigorous code evaluation."
    503     },
    504     {
    505       "title": "DeepSeek-Coder: When the large language model meets programming – the rise of code intelligence",
    506       "authors": [
    507         "D. Guo",
    508         "Q. Zhu",
    509         "D. Yang"
    510       ],
    511       "year": 2024,
    512       "arxiv_id": "2401.14196",
    513       "relevance": "Predecessor model; provides the base architecture and training methodology built upon in this work."
    514     },
    515     {
    516       "title": "Code Llama: Open foundation models for code",
    517       "authors": [
    518         "B. Roziere",
    519         "J. Gehring",
    520         "F. Gloeckle"
    521       ],
    522       "year": 2023,
    523       "arxiv_id": "2308.12950",
    524       "relevance": "Major open-source code model baseline for comparison."
    525     },
    526     {
    527       "title": "StarCoder 2 and The Stack V2: The next generation",
    528       "authors": [
    529         "A. Lozhkov",
    530         "R. Li",
    531         "L. B. Allal"
    532       ],
    533       "year": 2024,
    534       "arxiv_id": "2402.19173",
    535       "relevance": "Open-source code model and training data corpus baseline."
    536     },
    537     {
    538       "title": "RepoBench: Benchmarking repository-level code auto-completion systems",
    539       "authors": [
    540         "T. Liu",
    541         "C. Xu",
    542         "J. McAuley"
    543       ],
    544       "year": 2023,
    545       "relevance": "Repository-level code completion benchmark used to evaluate long-context code understanding."
    546     },
    547     {
    548       "title": "DeepSeekMath: Pushing the limits of mathematical reasoning in open language models",
    549       "authors": [
    550         "Z. Shao",
    551         "P. Wang",
    552         "Q. Zhu"
    553       ],
    554       "year": 2024,
    555       "arxiv_id": "2402.03300",
    556       "relevance": "Provides the math training data pipeline and GRPO RL algorithm reused in this work."
    557     },
    558     {
    559       "title": "Program synthesis with large language models",
    560       "authors": [
    561         "J. Austin",
    562         "A. Odena",
    563         "M. Nye"
    564       ],
    565       "year": 2021,
    566       "arxiv_id": "2108.07732",
    567       "relevance": "Introduces MBPP benchmark used for code generation evaluation."
    568     }
    569   ],
    570   "engagement_factors": {
    571     "practical_relevance": {
    572       "score": 2,
    573       "justification": "Open-source code model with 338 language support and 128K context that practitioners can deploy, though not as simple as an API call."
    574     },
    575     "surprise_contrarian": {
    576       "score": 2,
    577       "justification": "An open-source model matching GPT-4 Turbo on code benchmarks was a notable achievement at the time, challenging the closed-source dominance narrative."
    578     },
    579     "fear_safety": {
    580       "score": 0,
    581       "justification": "No safety, security, or risk themes are discussed in the paper."
    582     },
    583     "drama_conflict": {
    584       "score": 1,
    585       "justification": "The title explicitly frames it as 'breaking the barrier' of closed-source models, creating mild tension with OpenAI/Google/Anthropic, but doesn't directly accuse anyone."
    586     },
    587     "demo_ability": {
    588       "score": 2,
    589       "justification": "Weights are publicly released on GitHub/HuggingFace under a permissive license, though running a 236B MoE model requires substantial hardware."
    590     },
    591     "brand_recognition": {
    592       "score": 2,
    593       "justification": "DeepSeek gained significant recognition in the AI community and the paper directly benchmarks against GPT-4, Claude, and Gemini."
    594     }
    595   }
    596 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs