ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31881B)


      1 {
      2   "paper": {
      3     "title": "StarCoder 2 and The Stack v2: The Next Generation",
      4     "authors": ["Anton Lozhkov", "Raymond Li", "Loubna Ben Allal", "Federico Cassano", "Joel Lamy-Poirier", "Nouamane Tazi", "Ao Tang", "Dmytro Pykhtar", "Jiawei Liu", "Yuxiang Wei", "Tianyang Liu", "Max Tian", "Denis Kocetkov", "Arthur Zucker", "Younes Belkada", "Zijian Wang", "Qian Liu", "Dmitry Abulkhanov", "Indraneil Paul", "Zhuang Li", "Wen-Ding Li", "Megan Risdal", "Jia Li", "Jian Zhu", "Terry Yue Zhuo", "Evgenii Zheltonozhskii", "Nii Osae Osae Dade", "Wenhao Yu", "Lucas Krauß", "Naman Jain", "Yixuan Su", "Xuanli He", "Manan Dey", "Edoardo Abati", "Yekun Chai", "Niklas Muennighoff", "Xiangru Tang", "Muhtasham Oblokulov", "Christopher Akiki", "Marc Marone", "Chenghao Mou", "Mayank Mishra", "Alex Gu", "Binyuan Hui", "Tri Dao", "Armel Zebaze", "Olivier Dehaene", "Nicolas Patry", "Canwen Xu", "Julian McAuley", "Han Hu", "Torsten Scholak", "Sebastien Paquet", "Jennifer Robinson", "Carolyn Jane Anderson", "Nicolas Chapados", "Mostofa Patwary", "Nima Tajbakhsh", "Yacine Jernite", "Carlos Muñoz Ferrandis", "Lingming Zhang", "Sean Hughes", "Thomas Wolf", "Arjun Guha", "Leandro von Werra", "Harm de Vries"],
      5     "year": 2024,
      6     "venue": "TMLR (under review)",
      7     "arxiv_id": "2402.19173"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "checklist": {
     12     "artifacts": {
     13       "code_released": {
     14         "applies": true,
     15         "answer": true,
     16         "justification": "The paper releases model weights under an OpenRAIL license, the training dataset (The Stack v2) on Hugging Face, search tools, and references multiple GitHub repositories and Hugging Face spaces for evaluation, data inspection, and attribution."
     17       },
     18       "data_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The Stack v2 dataset is released via Hugging Face (bigcode/the-stack-v2). SWHIDs for source code are provided for full traceability. The training data composition is fully documented."
     22       },
     23       "environment_specified": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "While hardware is mentioned (A100 SXM4 80GB, H100) and some training framework details given (FlashAttention-2), there is no requirements.txt, Dockerfile, or detailed environment specification with library versions for reproducing training or evaluation."
     27       },
     28       "reproduction_instructions": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No step-by-step reproduction instructions are provided. The paper describes the methodology in detail but does not provide runnable scripts or a README with commands to replicate training or evaluation."
     32       }
     33     },
     34     "statistical_methodology": {
     35       "confidence_intervals_or_error_bars": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "Results are reported as point estimates (e.g., 'Pass@1' scores) throughout all tables (Tables 9-18) with no confidence intervals or error bars."
     39       },
     40       "significance_tests": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper makes many comparative claims ('outperforms', 'significantly outperforms') but no statistical significance tests are performed. Comparisons are based solely on point estimate differences."
     44       },
     45       "effect_sizes_reported": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper reports percentage improvements with baselines, e.g., 'StarCoder2-3B outperforms StarCoderBase-3B, exhibiting improvements of 60.2% on HumanEval+ and 32.4% on MBPP+' (§7.1.1). Absolute scores and relative gains are consistently provided."
     49       },
     50       "sample_size_justified": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No justification is given for the number of samples used in evaluation (e.g., 50 samples for MultiPL-E, 40 for DS-1000, 20 for HumanEvalFix). These choices follow prior work but are not independently justified."
     54       },
     55       "variance_reported": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "The CRUXEval section mentions noise margins (~0.2% from sampling, ~1.5% from benchmark selection) from the original paper, but StarCoder2's own results report no standard deviation or variance across runs. Single-run or unreported-run results throughout."
     59       }
     60     },
     61     "evaluation_design": {
     62       "baselines_included": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Extensive baselines are included: StarCoderBase (3B/7B/15B), CodeLlama (7B/13B/34B), DeepSeekCoder (1.3B/6.7B/33B), StableCode-3B, and OctoCoder across all evaluations."
     66       },
     67       "baselines_contemporary": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Baselines include DeepSeekCoder (2024), CodeLlama (2023), and StableCode (2024), which were contemporary and competitive at the time of writing."
     71       },
     72       "ablation_study": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No ablation study is presented. The paper does not systematically vary components (e.g., data sources, FIM rate, repository-context training, IR data) to measure their individual contributions. The Issue vs. Instruct prompt comparison (§7.2) is the closest, but it is not a systematic ablation."
     76       },
     77       "multiple_metrics": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Multiple metrics are used: Pass@1, Pass@5, exact match, edit similarity, CodeBLEU, identifier match F1, and accuracy across different benchmarks and tasks."
     81       },
     82       "human_evaluation": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "No human evaluation of generated code quality is included. All evaluation is automated via test suites, exact match, or classifier-based metrics. The security benchmark uses automated CWE detection, not human review."
     86       },
     87       "held_out_test_set": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "The paper uses established held-out benchmarks (HumanEval, MBPP, DS-1000, etc.) and performs decontamination (§3.3) to remove benchmark examples from training data. RepoBench v1.1 explicitly removes duplicates against The Stack v2."
     91       },
     92       "per_category_breakdown": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Extensive per-category breakdowns: MultiPL-E results per programming language (Table 10), DS-1000 per library (Table 11), HumanEvalFix per language (Table 12), BOLD per demographic category (Table 20), CrossCodeEval per language (Table 18)."
     96       },
     97       "failure_cases_discussed": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper discusses StarCoder2-15B's poor FIM performance due to an implementation bug (§7.5), StarCoder2-7B's unexpectedly weak performance relative to its size (§1), and C++ underperformance with the Issue prompt due to incomplete code generation (§7.2.1)."
    101       },
    102       "negative_results_reported": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Several negative results: StarCoder2-7B underperforms DeepSeekCoder-6.7B significantly; StarCoder2-15B FIM is poor due to a bug; StarCoder2-7B and 15B have high insecure code generation rates (39.8% and 39.2%); the authors state 'It is not clear to this report's authors why StarCoder2-7B does not perform as well.'"
    106       }
    107     },
    108     "claims_and_evidence": {
    109       "abstract_claims_supported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The abstract claims StarCoder2-3B outperforms similar models and StarCoderBase-15B (supported by Tables 9-10), StarCoder2-15B matches or outperforms CodeLlama-34B (supported by multiple tables), and DeepSeekCoder-33B is best at code completion for high-resource languages (acknowledged in results). All claims are supported."
    113       },
    114       "causal_claims_justified": {
    115         "applies": true,
    116         "answer": false,
    117         "justification": "The paper implies causal claims about training data and design choices (e.g., 'pre-training on pull requests is a viable alternative to pre-training on commits', 'repository-level training consistently outperforms') but these are confounded — multiple variables change simultaneously (data size, composition, architecture, tokenizer) with no controlled ablation."
    118       },
    119       "generalization_bounded": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Claims are generally bounded to tested settings. The paper specifies model sizes, benchmarks, and languages tested. It acknowledges limitations like StarCoder2-7B's underperformance and notes DeepSeekCoder-33B dominance on high-resource languages. The title ('StarCoder 2 and The Stack v2') is appropriately scoped."
    123       },
    124       "alternative_explanations_discussed": {
    125         "applies": true,
    126         "answer": false,
    127         "justification": "The paper does not discuss alternative explanations for performance differences. When StarCoder2-15B outperforms CodeLlama-34B, there is no discussion of whether this is due to data quality, data quantity, architecture, training recipe, or other factors. The unexplained StarCoder2-7B underperformance is noted but not analyzed."
    128       },
    129       "proxy_outcome_distinction": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper primarily reports benchmark-specific metrics (Pass@1, exact match, etc.) without overclaiming broader capabilities. It does not conflate benchmark performance with general 'coding ability.' Claims are tied to specific benchmarks and tasks."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "All model versions are specified with exact sizes (StarCoder2-3B/7B/15B, CodeLlama-7B/13B/34B, DeepSeekCoder-1.3B/6.7B/33B, StableCode-3B). Architecture details, hyperparameters, and training configurations are fully documented in Tables 6-8."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The Issue prompt template for HumanEvalFix is provided in §7.2.1 with the full format. Data formatting templates are extensively documented in §5 with sentinel tokens. The evaluation setup follows standard benchmark prompts with references to original papers."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Training hyperparameters are fully reported in Tables 7-8 (learning rate, batch size, iterations, epochs, RoPE θ, optimizer settings). Evaluation hyperparameters (temperature, top-p, number of samples) are stated for each benchmark."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. The paper evaluates base language models on standard benchmarks without any agent framework, tool use, or multi-step reasoning pipeline."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Extensive documentation of data preprocessing in §2-3: license detection (Figure 1), language filtering, basic filters (line length, autogenerated, alpha), deduplication parameters (5-grams, Jaccard 0.7), PII redaction, decontamination, malware removal, opt-out handling. Table 3 summarizes which steps apply to each source."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 9 ('Social Impact and Limitations') includes §9.3 ('Challenges and Risks') discussing openness/safety risks, privacy, security, societal bias, representation bias, traceability challenges, and job augmentation vs. automation."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Specific threats discussed: FIM implementation bug affecting StarCoder2-15B results (§7.5), RoPE θ parsing bug for 15B (§6.3), representation bias toward high-resource languages, PII identification challenges, and the risk of malicious code generation with specific CWE examples."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The paper does not explicitly state what the results do NOT show. While it acknowledges some weaknesses (e.g., StarCoder2-7B underperformance), it does not clearly bound the scope of generalization — e.g., these are base model results only, instruction-tuning may change rankings, real-world coding tasks differ from benchmarks."
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The training dataset (The Stack v2) is released on Hugging Face with SWHIDs for traceability. The 'Am I in The Stack' tool and Data Portraits Bloom filter enable membership checking. Evaluation benchmarks are all public."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Data collection is described in extensive detail in §2: Software Heritage archive extraction, GHArchive for issues and PRs, Kaggle API, package manager crawling, textbook scraping, with specific versions and dates (SH graph dataset 2023-09-06)."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": false,
    192         "answer": false,
    193         "justification": "No human participants. The paper uses public source code repositories and standard benchmarks. The 15 BigCode community annotators for data inspection (§2.1) are mentioned but this is part of data curation, not an evaluation study."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The full pipeline is documented: extraction → license detection → language detection → visual inspection → basic filters → language-specific filters → deduplication → PII redaction → decontamination → malware removal → opt-out handling. Table 3 summarizes which steps apply to each data source. Specific removal counts are given (e.g., 59,442 malware files, 22,066 opt-out files)."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "§11 (Acknowledgements) discloses funding: NSF awards SES-2326173, SES-2326174, CCF-2131943, Roblox sponsorship, Adams Fellowships, ServiceNow and Hugging Face stewardship, NVIDIA partnership for training infrastructure."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly listed: Hugging Face, ServiceNow Research, NVIDIA, and numerous academic institutions. The corresponding authors are from Hugging Face and ServiceNow."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "Hugging Face and ServiceNow, which steward the project and employ many authors, have direct financial interest in StarCoder2's success. ServiceNow's Q4 earnings report and 52% productivity claim are cited in the paper (Yahoo Finance, 2024). NVIDIA provided compute and employs co-authors."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests statement is present. Multiple authors work at companies (Hugging Face, ServiceNow, NVIDIA, Salesforce, IBM Research, Roblox, etc.) with commercial interests in code generation tools, but the paper makes no formal declaration of financial interests or conflicts."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "The SH graph dataset version is 2023-09-06 (§2.1). GHArchive data, StackOverflow dump (2023-09-14), and other sources have specified collection dates. The opt-out cutoff was November 20, 2023."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "§3.3 describes decontamination: removing files containing docstrings/solutions from HumanEval, MBPP, APPS, GSM8K questions, and DS-1000 prompts, with whitespace-insensitive matching. RepoBench v1.1 removes duplicates against The Stack v2."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "The paper explicitly decontaminates against test benchmarks (§3.3), and RepoBench v1.1 was specifically constructed to avoid data leakage by sourcing from repositories created after October 6, 2023. However, not all training data sources are decontaminated (docs, LHQ, arXiv, Wikipedia excluded)."
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "No human participants in this study. It is a model training and benchmark evaluation paper."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "No human participants."
    250       },
    251       "demographics_reported": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "blinding_described": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "attrition_reported": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "No inference cost, latency, or tokens consumed are reported for any benchmark evaluation. The number of samples generated per benchmark is stated but not the computational cost."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": true,
    286         "justification": "§6.4 reports CO2 emissions and GPU hours: 3B model used 97,120 GPU hours on A100s (16,107 kgCO2eq total), 7B model used 145,152 GPU hours on H100s (29,623 kgCO2eq). 15B estimates were pending. FLOPs are reported in Table 6."
    287       }
    288     },
    289     "experimental_rigor": {
    290       "seed_sensitivity_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No seed sensitivity analysis. Results appear to be from single training runs. No variance across seeds is reported for any benchmark evaluation."
    294       },
    295       "number_of_runs_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "The number of samples per benchmark evaluation is stated: 50 for MultiPL-E, 40 for DS-1000, 20 for HumanEvalFix, 100 for CanItEdit, 10 for CRUXEval, 25 for security benchmark, 1 for greedy decoding on HumanEval/MBPP."
    299       },
    300       "hyperparameter_search_budget": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No hyperparameter search budget is reported. The paper mentions 'preliminary experiments' and 'preliminary ablation study' for architecture choices (RoPE, FIM variants) but does not quantify the search process."
    304       },
    305       "best_config_selection_justified": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "Architecture and training decisions (RoPE, GQA, FIM rate, data composition) are stated as chosen but the selection process is not described. The paper mentions 'we explored several FIM variants in preliminary experiments' without detailing results or justifying the final selection."
    309       },
    310       "multiple_comparison_correction": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite comparing many models across many benchmarks."
    314       },
    315       "self_comparison_bias_addressed": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The authors evaluate their own models against baselines but do not acknowledge self-comparison bias. They use standard benchmarks and evaluation frameworks, which partially mitigates this, but the bias is not discussed."
    319       },
    320       "compute_budget_vs_performance": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "FLOPs are reported in Table 6 but performance is not analyzed as a function of compute budget. StarCoder2-15B is trained on more tokens (4.1T) than DeepSeekCoder-33B, but this difference in training budget is not discussed when comparing performance."
    324       },
    325       "benchmark_construct_validity": {
    326         "applies": true,
    327         "answer": true,
    328         "justification": "The paper discusses EvalPlus as addressing construct validity issues in HumanEval and MBPP (§7.1.1, Listings 1-2): insufficient tests, wrong test cases, and ambiguous descriptions. This demonstrates awareness that benchmarks may not measure what they claim."
    329       },
    330       "scaffold_confound_addressed": {
    331         "applies": false,
    332         "answer": false,
    333         "justification": "No scaffolding is used. Models are evaluated directly on benchmarks without agent frameworks. The Issue prompt comparison is a prompt format variation, not a scaffold confound."
    334       }
    335     },
    336     "data_leakage": {
    337       "temporal_leakage_addressed": {
    338         "applies": true,
    339         "answer": true,
    340         "justification": "RepoBench v1.1 specifically uses repositories created after October 2023 to avoid temporal leakage. The decontamination step (§3.3) removes known benchmark content. Training data dates are specified."
    341       },
    342       "feature_leakage_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "No discussion of whether evaluation setups provide hints not available in real usage (e.g., whether docstrings in HumanEval give excessive information compared to real-world coding tasks)."
    346       },
    347       "non_independence_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "The deduplication removes near-duplicates within training data but there is no discussion of structural similarities between training data and benchmark problems (e.g., whether competitive programming solutions in training overlap with HumanEval-style problems)."
    351       },
    352       "leakage_detection_method": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "Concrete decontamination method applied (§3.3): string matching (with whitespace removal for improved recall) against HumanEval docstrings/solutions, MBPP, APPS, GSM8K, and DS-1000. RepoBench removes duplicates against The Stack v2."
    356       }
    357     }
    358   },
    359   "claims": [
    360     {
    361       "claim": "StarCoder2-3B outperforms other Code LLMs of similar size on most benchmarks and matches or surpasses StarCoderBase-15B.",
    362       "evidence": "Tables 9-10 show StarCoder2-3B leading on HumanEval (31.7), HumanEval+ (27.4), MBPP (57.4), MBPP+ (47.4) among small models, and outperforming StarCoderBase-15B on 11/18 MultiPL-E languages.",
    363       "supported": "strong"
    364     },
    365     {
    366       "claim": "StarCoder2-15B significantly outperforms other models of comparable size and matches or outperforms CodeLlama-34B.",
    367       "evidence": "Table 9: StarCoder2-15B scores 46.3 on HumanEval vs CodeLlama-13B's 37.8. Tables 10, 14: outperforms CodeLlama-34B on 10/18 MultiPL-E languages and GSM8K (65.1 vs 54.2).",
    368       "supported": "strong"
    369     },
    370     {
    371       "claim": "StarCoder2-15B outperforms DeepSeekCoder-33B on math and code reasoning benchmarks.",
    372       "evidence": "Table 14: GSM8K 65.1 vs 58.7. Table 15: CRUXEval-I Pass@1 48.1 vs 46.5. However, DeepSeekCoder-33B leads on CRUXEval-O (48.6 vs 47.1).",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "The Stack v2 is 4× larger than The Stack v1 training dataset.",
    377       "evidence": "Table 4 shows 913B+ unique tokens for the 15B model vs ~232B for StarCoderBase (referenced in text). Table 1 compares v1 and v2 sizes across languages.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "Pre-training on pull requests (StarCoder2 Issue format) is a viable alternative to pre-training on commits (StarCoderBase Commit format).",
    382       "evidence": "Table 12: StarCoder2-15B with Issue prompt (38.7 avg) vs StarCoderBase-15B with Commit prompt (26.7 avg) on HumanEvalFix. Table 13: Similar advantage on CanItEdit.",
    383       "supported": "weak"
    384     },
    385     {
    386       "claim": "StarCoder2-7B underperforms relative to expectations — it does not perform as well as StarCoder2-3B and StarCoder2-15B for their size.",
    387       "evidence": "Tables 9-10: DeepSeekCoder-6.7B significantly outperforms StarCoder2-7B (47.6 vs 35.4 on HumanEval). The authors explicitly acknowledge this and state they do not know why.",
    388       "supported": "strong"
    389     }
    390   ],
    391   "methodology_tags": ["benchmark-eval"],
    392   "key_findings": "StarCoder2 introduces a family of open-weight code LLMs (3B/7B/15B) trained on The Stack v2, a 4× larger dataset built on Software Heritage's archive spanning 619 languages. StarCoder2-3B is state-of-the-art for its size class, and StarCoder2-15B matches or outperforms CodeLlama-34B (a model 2× its size) on many benchmarks while excelling at math reasoning (GSM8K: 65.1 vs 54.2) and low-resource languages. The project demonstrates a fully transparent training pipeline with released data, SWHIDs, opt-out mechanisms, and decontamination, setting a standard for open Code LLM development. Notable weaknesses include StarCoder2-7B's underperformance and a FIM implementation bug degrading StarCoder2-15B's infilling capability.",
    393   "red_flags": [
    394     {
    395       "flag": "Non-independent funders",
    396       "detail": "Hugging Face and ServiceNow steward the project, employ many authors, and have commercial interests in StarCoder2's success. ServiceNow's 52% productivity increase claim (Yahoo Finance, 2024) from StarCoder-based products is cited in the paper's introduction. No competing interests statement is present."
    397     },
    398     {
    399       "flag": "No statistical tests for comparative claims",
    400       "detail": "The paper uses 'significantly outperforms' and 'outperforms' throughout but never performs statistical significance tests. All comparisons are point estimate differences. With sampling-based evaluation, differences could be within noise margins."
    401     },
    402     {
    403       "flag": "Confounded causal claims about training choices",
    404       "detail": "Claims about the benefits of repository-context training, pull request data, or data composition changes cannot be attributed to individual factors because many variables changed simultaneously (architecture, data size, data composition, tokenizer, training length)."
    405     },
    406     {
    407       "flag": "Missing 15B CO2 estimate",
    408       "detail": "The paper states 'The paper will soon be updated with estimates for the 15B model' for CO2 emissions — this was not completed, leaving the largest model's environmental cost undocumented."
    409     },
    410     {
    411       "flag": "High insecure code generation rate",
    412       "detail": "StarCoder2-7B (39.8%) and StarCoder2-15B (39.2%) have the highest insecure code rates among similar-sized models on the security benchmark, higher than all baselines. The paper downplays this as a 'side-effect' of generating more correct code."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Evaluating large language models trained on code",
    418       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    419       "year": 2021,
    420       "arxiv_id": "2107.03374",
    421       "relevance": "Introduced HumanEval and Codex, foundational benchmarks and models for code generation evaluation."
    422     },
    423     {
    424       "title": "StarCoder: may the source be with you!",
    425       "authors": ["Raymond Li", "Loubna Ben Allal", "Yangtian Zi"],
    426       "year": 2023,
    427       "arxiv_id": "2305.06161",
    428       "relevance": "Predecessor model and dataset (The Stack v1, StarCoder) that this work directly builds upon."
    429     },
    430     {
    431       "title": "Code llama: Open foundation models for code",
    432       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    433       "year": 2023,
    434       "arxiv_id": "2308.12950",
    435       "relevance": "Major baseline model family for code generation comparison."
    436     },
    437     {
    438       "title": "DeepSeek-Coder: when the large language model meets programming",
    439       "authors": ["Daya Guo", "Qihao Zhu", "Dejian Yang"],
    440       "year": 2024,
    441       "arxiv_id": "2401.14196",
    442       "relevance": "Primary competitor showing strongest performance on code completion benchmarks."
    443     },
    444     {
    445       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    446       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    447       "year": 2023,
    448       "relevance": "Introduced EvalPlus framework addressing inadequate tests in HumanEval/MBPP, adopted for evaluation."
    449     },
    450     {
    451       "title": "MultiPL-E: a scalable and polyglot approach to benchmarking neural code generation",
    452       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    453       "year": 2023,
    454       "doi": "10.1109/TSE.2023.3267446",
    455       "relevance": "Multi-language code generation benchmark used for cross-language evaluation."
    456     },
    457     {
    458       "title": "OctoPack: instruction tuning code large language models",
    459       "authors": ["Niklas Muennighoff", "Qian Liu", "Armel Randy Zebaze"],
    460       "year": 2024,
    461       "relevance": "HumanEvalFix benchmark for code fixing and instruction-tuned model evaluation."
    462     },
    463     {
    464       "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions",
    465       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan"],
    466       "year": 2022,
    467       "relevance": "Security vulnerability benchmark for evaluating code generation safety."
    468     },
    469     {
    470       "title": "The impact of AI on developer productivity: Evidence from GitHub Copilot",
    471       "authors": ["Sida Peng", "Eirini Kalliamvakou", "Peter Cihon", "Mert Demirer"],
    472       "year": 2023,
    473       "arxiv_id": "2302.06590",
    474       "relevance": "Key study on AI coding tool productivity impact, cited for Copilot's 56% productivity increase claim."
    475     },
    476     {
    477       "title": "Scaling data-constrained language models",
    478       "authors": ["Niklas Muennighoff", "Alexander M Rush", "Boaz Barak"],
    479       "year": 2023,
    480       "relevance": "Foundational work on data repetition and multi-epoch training for LLMs, directly informing StarCoder2's training strategy."
    481     },
    482     {
    483       "title": "CRUXEval: a benchmark for code reasoning, understanding and execution",
    484       "authors": ["Alex Gu", "Baptiste Rozière", "Hugh Leather"],
    485       "year": 2024,
    486       "arxiv_id": "2401.03065",
    487       "relevance": "Code reasoning benchmark where StarCoder2-15B outperforms much larger models."
    488     },
    489     {
    490       "title": "The gradient of generative AI release: Methods and considerations",
    491       "authors": ["Irene Solaiman"],
    492       "year": 2023,
    493       "arxiv_id": "2302.04844",
    494       "relevance": "Framework for understanding openness in LLM development, directly informing BigCode's release strategy."
    495     }
    496   ]
    497 }

Impressum · Datenschutz