scan.json (19756B)
1 { 2 "paper": { 3 "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", 4 "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang", "Nan Duan", "Xiaocheng Feng", "Ming Gong", "Linjun Shou", "Bing Qin", "Ting Liu", "Daxin Jiang", "Ming Zhou"], 5 "year": 2020, 6 "venue": "EMNLP 2020 Findings", 7 "arxiv_id": "2002.08155" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "The paper states 'All the codes and data are available at https://github.com/microsoft/CodeBERT' (footnote 1)." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "They use the publicly available CodeSearchNet Corpus and release their NL-PL probing dataset. The GitHub repo contains data." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions hardware (NVIDIA DGX-2 with V100 GPUs) and some hyperparameters but does not provide a requirements.txt, Dockerfile, or detailed environment specification with library versions." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper provides hyperparameters in Appendix B but no step-by-step reproduction instructions or README with commands in the paper itself." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results are reported as point estimates (e.g., MRR values in Table 2, BLEU scores in Tables 3-5) with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims CodeBERT 'achieves state-of-the-art performance' based on comparing numbers without any statistical significance tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "Raw metric values are reported but no standardized effect sizes (Cohen's d, etc.). Differences are stated numerically (e.g., '1.3 BLEU score over RoBERTa') but without formal effect size measures." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for why the CodeSearchNet dataset sizes or probing dataset sizes are appropriate. No power analysis." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or multiple-run results are reported. All results appear to be single-run numbers." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple baselines are included: joint embeddings (NBoW, CNN, BiRNN, SelfAtt), RoBERTa, RoBERTa pre-trained with code only, and task-specific baselines for code summarization (Table 2, 3, 4, 5)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include RoBERTa (2019), ELECTRA-style RTD (2020), code2seq (2019), and the CodeSearchNet baselines which were contemporary at submission time." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 2 ablates initialization (from scratch vs. RoBERTa) and training objectives (MLM only, RTD only, MLM+RTD). Section 4.1 discusses these systematically." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Code search uses MRR across 6 languages. Code summarization uses smoothed BLEU-4. Probing uses accuracy. Multiple tasks with appropriate metrics for each." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation is included. Code search is evaluated by MRR and code summarization by BLEU only. Qualitative examples are shown in Appendix E but not systematically evaluated by humans." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper uses separate train/dev/test splits from CodeSearchNet (Table 6) and reports results on test sets. Models are selected based on dev set performance (Appendix B.2)." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down per programming language (6 languages for code search in Table 2, 6 languages for summarization in Table 3/4)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section 4.4 discusses that CodeBERT achieves 'slightly lower results than code2seq' on C# and explains the likely reason (lack of AST information). Section 4.2 discusses probing limitations." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that training CodeBERT by traversing AST structure 'does not bring improvements on generation tasks' (Section 4.4), and that CodeBERT underperforms code2seq on C#." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims state-of-the-art on code search and code documentation generation, which are supported by Tables 2-4. The probing claims are supported by Table 1." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims about which components help are supported by ablation studies in Table 2 (different objectives and initialization strategies). The ablation design is controlled single-variable manipulation." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title says 'Programming and Natural Languages' broadly, but results are limited to 6 languages for code search and 1 additional language (C#) for generalization. The broad title overstates the tested scope." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for the results. The paper does not consider confounds such as whether improvements come from more data, different tokenization, or other factors beyond the proposed objectives." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper describes CodeBERT as having 'the same model architecture as RoBERTa-base' (125M parameters) but does not specify exact RoBERTa checkpoint version used for initialization." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "This paper does not use prompting. It is a pre-training + fine-tuning approach." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Detailed hyperparameters are provided in Appendix B: batch size, learning rate, optimizer, warmup steps, max length, training steps for pre-training (B.1), code search fine-tuning (B.2), and summarization (B.3, B.4)." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. This is a standard pre-train + fine-tune approach." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 3 describes the data sources (CodeSearchNet for bimodal, additional GitHub repos for unimodal), filtering criteria, and data statistics are in Table 6 and Appendix A." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion mentions future directions but does not discuss limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed anywhere in the paper." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show or what settings are excluded from the claims." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The CodeSearchNet Corpus is publicly available, and the probing dataset is released via GitHub. Raw data can be independently verified." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 3 describes data collection: bimodal data from CodeSearchNet (NL-PL pairs from GitHub), unimodal data from GitHub repos with at least one star. Probing dataset construction is described in Section 4.2." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. Data comes from public GitHub repositories and CodeSearchNet benchmark." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3 describes the pipeline: CodeSearchNet provides bimodal data across 6 languages, unimodal data from GitHub with filtering (repos with ≥1 star), and data statistics are provided." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgments section lists National Key R&D Program of China (2018YFB1005103) and NSFC grants (61632011, 61772156)." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: Harbin Institute of Technology, Sun Yat-sen University, Microsoft Research Asia, and Microsoft Search Technology Center Asia." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Government grants (Chinese national programs and NSFC) are independent of the outcome. However, Microsoft authors evaluating a Microsoft-released model is a potential conflict not explicitly addressed, though this question specifically asks about funders." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present. Several authors are Microsoft employees evaluating a Microsoft-released model, but no financial interest disclosure is provided." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper does not state when the CodeSearchNet or unimodal training data was collected, nor does it discuss the temporal relationship between training and test data." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of potential overlap between pre-training data (GitHub repos) and downstream evaluation data (also from GitHub). The probing dataset is constructed from the same CodeSearchNet data." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "The pre-training uses GitHub code which could overlap with CodeSearchNet test sets. This contamination risk is not discussed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost or latency is reported for any of the downstream tasks." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": true, 279 "justification": "Appendix B.1 states pre-training was done on one NVIDIA DGX-2 with 16 V100 GPUs using FP16, and provides timing: '1,000 batches costs 600 minutes with MLM objective, 120 minutes with RTD objective.'" 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CodeBERT achieves state-of-the-art performance on natural language code search across six programming languages.", 286 "evidence": "Table 2 shows CodeBERT (MLM+RTD, INIT=R) achieves the highest MRR on all six languages compared to baselines including RoBERTa and joint embedding methods.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "CodeBERT achieves state-of-the-art performance on code-to-documentation generation.", 291 "evidence": "Tables 3 and 4 show CodeBERT outperforms RoBERTa and code-only pre-trained models on smoothed BLEU-4 across six languages.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "The replaced token detection (RTD) objective combined with MLM improves over MLM alone.", 296 "evidence": "Table 2 ablation shows MLM+RTD consistently outperforms MLM-only across languages. The improvement is modest but consistent.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "CodeBERT learns better NL-PL representations than RoBERTa in zero-shot probing.", 301 "evidence": "Table 1 shows CodeBERT achieves higher accuracy than RoBERTa on NL and PL probing tasks in a zero-shot setting with frozen parameters.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "CodeBERT generalizes to programming languages not seen during pre-training (C#).", 306 "evidence": "Table 5 shows CodeBERT achieves 22.36 BLEU on C# summarization, improving over RoBERTa (19.81), though below code2seq (23.04).", 307 "supported": "moderate" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval"], 311 "key_findings": "CodeBERT is a bimodal pre-trained model for programming and natural languages, trained with masked language modeling and replaced token detection objectives on CodeSearchNet data. It achieves state-of-the-art results on natural language code search (MRR) and code documentation generation (BLEU-4) across six programming languages. The model also demonstrates transfer to unseen languages (C#) and shows superior zero-shot probing performance compared to RoBERTa, suggesting it learns meaningful NL-PL alignment.", 312 "red_flags": [ 313 { 314 "flag": "No variance or significance testing", 315 "detail": "All results are single-run point estimates with no error bars, standard deviations, or significance tests. Differences between models may not be statistically significant." 316 }, 317 { 318 "flag": "Potential contamination between pre-training and evaluation data", 319 "detail": "Both pre-training data and evaluation benchmarks come from GitHub. The paper does not discuss whether test set code appeared in pre-training data." 320 }, 321 { 322 "flag": "Microsoft employees evaluating Microsoft model", 323 "detail": "Multiple authors are from Microsoft Research Asia and Microsoft Search Technology Center. CodeBERT is a Microsoft product released on Microsoft's GitHub. No conflict of interest statement is provided." 324 }, 325 { 326 "flag": "No limitations section", 327 "detail": "The paper has no dedicated limitations or threats-to-validity discussion despite several methodological gaps (single runs, contamination risk, limited language coverage)." 328 } 329 ], 330 "cited_papers": [ 331 { 332 "title": "CodeSearchNet Challenge: Evaluating the State of Semantic Code Search", 333 "authors": ["Hamel Husain", "Ho-Hsiang Wu", "Tiferet Gazit", "Miltiadis Allamanis", "Marc Brockschmidt"], 334 "year": 2019, 335 "arxiv_id": "1909.09436", 336 "relevance": "Major benchmark dataset for code search that CodeBERT is evaluated on; foundational for NL-PL evaluation methodology." 337 }, 338 { 339 "title": "ELECTRA: Pre-training Text Encoders as Discriminators Rather than Generators", 340 "authors": ["Kevin Clark", "Minh-Thang Luong", "Quoc V. Le", "Christopher D. Manning"], 341 "year": 2020, 342 "relevance": "Introduces the replaced token detection objective that CodeBERT adapts for bimodal NL-PL pre-training." 343 }, 344 { 345 "title": "RoBERTa: A Robustly Optimized BERT Pretraining Approach", 346 "authors": ["Yinhan Liu", "Myle Ott", "Naman Goyal"], 347 "year": 2019, 348 "arxiv_id": "1907.11692", 349 "relevance": "Primary baseline model and initialization source for CodeBERT; key comparison point for measuring the value of bimodal pre-training." 350 }, 351 { 352 "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding", 353 "authors": ["Jacob Devlin", "Ming-Wei Chang", "Kenton Lee", "Kristina Toutanova"], 354 "year": 2018, 355 "arxiv_id": "1810.04805", 356 "relevance": "Foundational pre-training approach that CodeBERT extends to the bimodal NL-PL setting." 357 }, 358 { 359 "title": "code2seq: Generating Sequences from Structured Representations of Code", 360 "authors": ["Uri Alon", "Shaked Brody", "Omer Levy", "Eran Yahav"], 361 "year": 2019, 362 "relevance": "Key baseline for code summarization that outperforms CodeBERT on C#, demonstrating value of AST-based representations." 363 }, 364 { 365 "title": "Summarizing Source Code Using a Neural Attention Model", 366 "authors": ["Srinivasan Iyer", "Ioannis Konstas", "Alvin Cheung", "Luke Zettlemoyer"], 367 "year": 2016, 368 "relevance": "Introduces CodeNN and the StackOverflow C# dataset used for evaluating CodeBERT's generalization to unseen languages." 369 }, 370 { 371 "title": "Deep Code Search", 372 "authors": ["Xiaodong Gu", "Hongyu Zhang", "Sunghun Kim"], 373 "year": 2018, 374 "relevance": "Early neural code search approach and baseline for evaluating CodeBERT on code retrieval tasks." 375 }, 376 { 377 "title": "Pre-trained Contextual Embedding of Source Code", 378 "authors": ["Aditya Kanade", "Petros Maniatis", "Gogul Balakrishnan", "Kensen Shi"], 379 "year": 2019, 380 "arxiv_id": "2001.00059", 381 "relevance": "Concurrent work on pre-training for code (CuBERT), relevant to understanding the landscape of code pre-training models." 382 } 383 ] 384 }