scan.json (24914B)
1 { 2 "paper": { 3 "title": "CodeFill: Multi-token Code Completion by Jointly Learning from Structure and Naming Sequences", 4 "authors": ["Maliheh Izadi", "Roberta Gismondi", "Georgios Gousios"], 5 "year": 2022, 6 "venue": "ICSE '22", 7 "arxiv_id": "2202.06689", 8 "doi": "10.1145/3510003.3510172" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "The paper provides a GitHub link: https://github.com/saltudelft/codefill (footnote 1, Section 1). The abstract also states 'We publicly release our source code and datasets.'" 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper states 'We publicly release our source code and datasets' in the abstract. One dataset (ETH 150K/PY117K) is a public benchmark from Raychev et al. The other (PY1690K) was collected by the authors and shared via the same GitHub repository." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "Section 4.3 mentions the hardware used (two GeForce GTX 1080 Ti GPUs, Intel Xeon CPU, 128G RAM) and the HuggingFace library, but there is no requirements.txt, Dockerfile, or detailed listing of library versions sufficient to recreate the environment." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper describes the approach and configuration (Section 4.3) but does not include step-by-step reproduction instructions, a README with commands, or scripts to replicate experiments. While code is released, the paper itself does not provide reproduction instructions." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "All results in Tables 3-8 and Figure 6 are reported as single point estimates (e.g., 'MRR: 70.9%') with no confidence intervals, error bars, or uncertainty measures." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper makes claims that 'CodeFill surpasses all baselines' and 'outperforms the state of the art' but provides no statistical significance tests (no p-values, t-tests, or other hypothesis tests). Comparisons are based solely on comparing raw numbers." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper consistently reports baseline context alongside improvements, e.g., 'MRR: 70.9% vs. 66.2% and 67.8%' and 'ROUGE-L: 63.7% vs. 52.4% and 59.2%' in the abstract and throughout the results tables, providing enough context to understand the magnitude of differences." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper uses two datasets (29M and 425M LOC) but does not justify why these sizes are adequate, discuss power analysis, or acknowledge whether smaller datasets might yield different conclusions. No justification for the 90/10 train/test split is provided." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "All reported results appear to be from single runs. There is no mention of multiple runs, standard deviation, variance across seeds, or any spread measure. Tables 3-8 report only single numbers per metric." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper compares CodeFill against six baselines: n-gram+LSTM, Pointer Mixture, OpenVocab, T-XL+Bi-LSTM, GPT-C (IntelliCode Compose), and TravTrans+ (Section 2.4, Tables 3-8)." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "The baselines span 2017-2021: TravTrans+ (ICSE 2021), GPT-C (FSE 2020), OpenVocab (ICSE 2020), T-XL+Bi-LSTM (ICPC 2020). For a 2022 paper, these are contemporary and include the state of the art at the time." 
70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "Table 7 presents an ablation study examining vanilla GPT-2 vs. MTL with hard-parameter sharing vs. soft-parameter sharing vs. adding the statement completion task. Each component's contribution is measured (Section 5.3)." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper uses multiple metrics: Accuracy and MRR for token-level prediction (Tables 3-6), and METEOR and ROUGE-L for statement-level prediction (Figure 6). Runtime latency is also reported (Table 8)." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": false, 84 "justification": "The paper evaluates code completion quality, which involves subjective developer experience, but uses only automated metrics (Accuracy, MRR, METEOR, ROUGE-L). No human evaluation or user study is included to assess whether the completions are actually useful to developers in practice." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": true, 89 "justification": "Section 4.2 states: 'We then use 90% of PY117K for fine-tuning the model on the tasks... and finally the last 10% of PY117K for evaluation.' PY1690K is used exclusively for pre-training, ensuring no train-test overlap." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Tables 4 and 5 provide per-category breakdowns by token type (Identifier, Keyword, Punctuation, Literals, Operators) and by leaf node type (Attribute Access, Names, Function names, Numeric constant). Table 6 provides cardinal point results." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": false, 99 "justification": "The paper does not discuss specific failure cases, error examples, or qualitative analysis of where CodeFill produces incorrect completions. Results are presented only in aggregate metrics." 
100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The ablation study (Table 7) shows that soft-parameter sharing MTL performs slightly worse than hard-parameter sharing on MRR (79.5 vs. 79.6), and the paper notes that TTP was excluded from fine-tuning because the model quickly learns type sequences. The paper also notes CodeFill is 'on par' (not superior) for MRR on keywords and punctuation (Section 5.1.2)." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims 'MRR: 70.9% vs. 66.2% and 67.8%' for single token and 'ROUGE-L: 63.7% vs. 52.4% and 59.2%' for multi-token prediction. These are supported in Table 6 (cardinal point MRR) and Figure 6 (SLP ROUGE-L at n=4). The claim of public code/data release is supported by the GitHub link." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The ablation study (Table 7) provides controlled single-variable manipulation to support claims like 'employing the MTL technique to train the models jointly on multiple tasks indeed helps the model learn better' and 'training on two different granularity also helps them learn better.' Each component is added incrementally with measured impact." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper explicitly states Python is the target language (Section 3.2) and the threats to validity section acknowledges 'further studies are needed to validate and generalize our findings to other programming languages' (Section 7). The title mentions 'code completion' broadly but the scope is bounded in the body." 
122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "The threats to validity section (Section 7) discusses generic threats (dataset quality, baseline implementation errors, evaluation metrics) but does not consider specific alternative explanations for why CodeFill outperforms baselines. For example, whether the gains come from the larger model size (258M vs. 119-177M params) rather than the architectural innovations is not addressed." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper states 'GPT-2 Transformer' and 'HuggingFace library' but does not specify which version of GPT-2 (small, medium, large) or the HuggingFace library version. Section 4.3 mentions implementation details but not specific model version identifiers." 134 }, 135 "prompts_provided": { 136 "applies": false, 137 "answer": false, 138 "justification": "This paper trains models from scratch using standard language modeling objectives. No prompting is involved." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 4.3 reports: learning rate 0.00001, maximum sequence length 2048, 100 epochs, beam width 5 (Section 3.2.4). Section 3.2 reports task probability distribution (20%/40%/40% for TTP/TVP/SC). Table 1 provides post-processing weights. 'We set the remaining parameters to default values' is stated." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "No agentic scaffolding is used. CodeFill is a standard neural model architecture with no agent loop, tool use, or retry logic." 
149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 3.1 describes the full preprocessing pipeline: comment removal, AST extraction, module/library alias extraction, tokenization, BPE encoding, string/numeric literal normalization, and indentation handling. Section 4.2 describes dataset collection and deduplication using the method from Allamanis [2]." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 7 'Threats to the Validity' provides a dedicated subsection covering internal, external, and construct validity threats." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 7 discusses specific threats: using replication packages from baseline studies to avoid implementation errors, using only Python datasets (external validity), the need to validate on other languages, data quality from open-source GitHub repos, and suitability of ROUGE-L/METEOR metrics adapted from NLP." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "While the threats to validity section mentions Python-only evaluation and the need for other languages, it does not explicitly state what the results do NOT show. It does not bound claims to specific repository types, developer experience levels, or completion scenarios beyond what was tested." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "The paper states datasets are publicly released. PY117K is based on the public ETH 150K dataset. PY1690K was collected from GHTorrent and is released at the GitHub repository." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 4.2 describes data collection: ETH 150K from Raychev et al. 
(repositories with permissive licenses, parsed ASTs, removed obfuscated files), PY1690K from GHTorrent (all non-forked Python repositories with more than 20 stars, 58k repositories). Deduplication method from Allamanis [2] is cited." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants are involved. Data comes from standard public code repositories." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "The full pipeline is documented: raw GitHub repos → filter by stars/license → deduplication (Allamanis method) → PY117K and PY1690K → preprocessing (Section 3.1: comment removal, AST extraction, BPE encoding) → 90/10 train/eval split. Table 2 provides dataset statistics at each stage." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "The Acknowledgments section discloses: 'This work has received funding from the European Union's Horizon 2020 research and innovation programme under grant number 825328 (FASTEN project), and also the NWO MIPL project, grant number 628.008.003.'" 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "All three authors are listed as affiliated with Delft University of Technology, Delft, Netherlands. Their email addresses are provided. No conflicts with evaluated products exist since CodeFill is their own novel system." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "Funding comes from the EU Horizon 2020 program (FASTEN project) and NWO (Dutch Research Council). These are public research funding bodies with no financial stake in whether CodeFill outperforms baselines." 
210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial interests statement is present in the paper. While the risk of conflict appears low (academic researchers), the absence of a declaration means this criterion is not satisfied." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper uses models trained from scratch on collected datasets, but does not state when the GHTorrent data snapshot was taken for the PY1690K dataset. The ETH 150K dataset collection date is also not specified. Without knowing when the data was collected, one cannot assess whether test files may have appeared in similar training distributions." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "The paper addresses potential overlap: PY1690K is used exclusively for pre-training and PY117K for fine-tuning/evaluation (Section 4.2). It explicitly states 'PY1690K and PY117K do not have any common files.' Deduplication using Allamanis [2] is applied to both datasets to prevent file-level overlap." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": true, 231 "justification": "The paper uses the established deduplication method from Allamanis [2] and explicitly ensures no overlap between training and evaluation datasets. Section 4.2 states datasets were deduplicated and have no common files. The models are trained from scratch (not pre-trained on external data), limiting contamination risk." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants are involved in this study. It is a benchmark evaluation of code completion models." 
239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "Table 8 reports inference latency in milliseconds for all models (CodeFill: 73ms). Section 5.4 discusses the practical constraint that latency must be under 100ms for IDE use." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": true, 280 "justification": "Table 8 reports training time in hours for all models (CodeFill: 24h). Section 4.3 specifies the hardware: two GeForce GTX 1080 Ti GPUs, Intel Xeon CPU E5-2690 v4, 128G RAM. The ablation study (Table 7) also reports training times for each configuration." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "CodeFill outperforms all baselines in single token prediction (MRR: 81.7% vs. 
TravTrans+ 79.4% and GPT-C 80.0%)", 287 "evidence": "Table 3 (TLP-A) shows CodeFill achieves 80.6% accuracy and 81.7% MRR, surpassing TravTrans+ (78.9%/79.4%) and GPT-C (79.8%/80.0%).", 288 "supported": "moderate" 289 }, 290 { 291 "claim": "CodeFill significantly outperforms the state of the art for multi-token prediction (ROUGE-L: 63.7% vs. 52.4% for TravTrans+ at n=4)", 292 "evidence": "Figure 6 shows ROUGE-L scores for statement-level prediction across n=2 to n=8. At n=4, CodeFill achieves approximately 63.7% vs. TravTrans+ 52.4%. The margin grows wider as n increases.", 293 "supported": "moderate" 294 }, 295 { 296 "claim": "Each component of the CodeFill model (MTL, soft-parameter sharing, statement completion task) contributes to performance", 297 "evidence": "Table 7 ablation study shows an overall MRR improvement across configurations: GPT-2 (78.2 MRR) → MTL HP (79.6) → MTL SP (79.5) → MTL SP with statement completion (81.7). The progression is not strictly monotonic: MTL SP alone is 0.1 point below MTL HP on MRR.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "CodeFill's cardinal point evaluation is more representative of real-world autocompletion use", 302 "evidence": "Section 4.1.1 describes the method for identifying cardinal points and cites Hellendoorn et al. [18] arguing standard evaluation is unrealistic. Table 6 shows CodeFill outperforms baselines on cardinal points (MRR: 70.9% vs. 66.2%).", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "All models feature average latency under 100ms, making them practical for IDE use", 307 "evidence": "Table 8 reports latencies: CodeFill 73ms, all baselines between 53-79ms. Section 5.4 states this is within the 'golden standard in the industry.'", 308 "supported": "strong" 309 } 310 ], 311 "methodology_tags": ["benchmark-eval"], 312 "key_findings": "CodeFill, a multi-task learning approach using parallel GPT-2 Transformers that jointly learns from token values, token types, and statement completion, outperforms six baselines on Python code completion. 
On single-token prediction, CodeFill achieves 81.7% MRR (vs. 80.0% for GPT-C, the strongest baseline on this metric, and 79.4% for TravTrans+), and on statement-level prediction with 4 tokens, it achieves 63.7% ROUGE-L (vs. 52.4% for TravTrans+). The ablation study confirms each component (MTL, soft-parameter sharing, statement completion task) contributes to performance. The paper also introduces a cardinal-point evaluation methodology that better reflects real-world autocompletion trigger points.", 313 "red_flags": [ 314 { 315 "flag": "No statistical significance testing", 316 "detail": "All comparisons between CodeFill and baselines are based on raw point estimates without any significance tests. The claimed improvements (e.g., 81.7% vs. 79.4% MRR) could potentially be within noise range, but no tests are performed to rule this out." 317 }, 318 { 319 "flag": "No variance or multi-run reporting", 320 "detail": "Results appear to be from single training runs with no reported standard deviation or variance across seeds. Without knowing result variability, the reader cannot assess whether the improvements are stable or an artifact of a particular random initialization." 321 }, 322 { 323 "flag": "Model size confound in ablation", 324 "detail": "CodeFill has 258M parameters compared to 119-177M for baselines (Table 8). The ablation study does not control for model size — it is unclear whether performance gains come from the architectural innovations or simply from having more parameters. The paper does not discuss this confound." 325 }, 326 { 327 "flag": "No failure analysis", 328 "detail": "Despite claiming practical benefits for developers, the paper presents no qualitative analysis of failure cases — what kinds of completions CodeFill gets wrong, or where the baselines outperform it on specific examples." 
329 } 330 ], 331 "cited_papers": [ 332 { 333 "title": "IntelliCode Compose: Code Generation Using Transformer", 334 "authors": ["Alexey Svyatkovskiy", "Shao Kun Deng", "Shengyu Fu", "Neel Sundaresan"], 335 "year": 2020, 336 "relevance": "GPT-2 based multilingual code completion deployed in VS Code IDE, a direct baseline for code completion research." 337 }, 338 { 339 "title": "Code Prediction by Feeding Trees to Transformers", 340 "authors": ["Seohyun Kim", "Jinman Zhao", "Yuchi Tian", "Satish Chandra"], 341 "year": 2021, 342 "doi": "10.1109/ICSE43902.2021.00026", 343 "relevance": "State-of-the-art tree-based Transformer for code completion (TravTrans+), the primary comparison baseline." 344 }, 345 { 346 "title": "When Code Completion Fails: A Case Study on Real-World Completions", 347 "authors": ["Vincent J Hellendoorn", "Sebastian Proksch", "Harald C Gall", "Alberto Bacchelli"], 348 "year": 2019, 349 "relevance": "Identified fundamental issues with code completion evaluation methodology, motivating CodeFill's cardinal point evaluation approach." 350 }, 351 { 352 "title": "Big Code != Big Vocabulary: Open-Vocabulary Models for Source Code", 353 "authors": ["Rafael-Michael Karampatsis", "Hlib Babii", "Romain Robbes", "Charles Sutton", "Andrea Janes"], 354 "year": 2020, 355 "relevance": "BPE-based open-vocabulary code model addressing the OOV problem in code completion, a baseline in this study." 356 }, 357 { 358 "title": "A Self-Attentional Neural Architecture for Code Completion with Multi-Task Learning", 359 "authors": ["Fang Liu", "Ge Li", "Bolin Wei", "Xin Xia", "Zhiyi Fu", "Zhi Jin"], 360 "year": 2020, 361 "relevance": "MTL-based code completion using Transformer-XL and Bi-LSTM, a direct predecessor and baseline." 
362 }, 363 { 364 "title": "Pythia: AI-Assisted Code Completion System", 365 "authors": ["Alexey Svyatkovskiy", "Ying Zhao", "Shengyu Fu", "Neel Sundaresan"], 366 "year": 2019, 367 "relevance": "Early LSTM-based code completion integrated into VS Code IDE, demonstrating practical deployment of ML-based completion." 368 }, 369 { 370 "title": "The Adverse Effects of Code Duplication in Machine Learning Models of Code", 371 "authors": ["Miltiadis Allamanis"], 372 "year": 2019, 373 "relevance": "Proposed deduplication method used in CodeFill's dataset preparation, addressing data quality in code ML research." 374 }, 375 { 376 "title": "Are Deep Neural Networks the Best Choice for Modeling Source Code?", 377 "authors": ["Vincent J Hellendoorn", "Premkumar Devanbu"], 378 "year": 2017, 379 "relevance": "Foundational study comparing n-gram and neural approaches for source code modeling, a baseline in this evaluation." 380 }, 381 { 382 "title": "On the Naturalness of Software", 383 "authors": ["Abram Hindle", "Earl T Barr", "Zhendong Su", "Mark Gabel", "Premkumar Devanbu"], 384 "year": 2012, 385 "relevance": "Established the naturalness hypothesis for software, foundational to treating code completion as a language modeling task." 386 }, 387 { 388 "title": "Sequence Model Design for Code Completion in the Modern IDE", 389 "authors": ["Gareth Ari Aye", "Gail E Kaiser"], 390 "year": 2020, 391 "relevance": "Study on designing sequence models specifically for IDE code completion, relevant to practical code completion systems." 392 } 393 ] 394 }