scan.json (17298B)
1 { 2 "paper": { 3 "title": "Compilable Neural Code Generation with Compiler Feedback", 4 "authors": ["Xin Wang", "Yasheng Wang", "Yao Wan", "Fei Mi", "Yitong Li", "Pingyi Zhou", "Jin Liu", "Hao Wu", "Xin Jiang", "Qun Liu"], 5 "year": 2022, 6 "venue": "ACL 2022 (Findings)", 7 "arxiv_id": "2203.05132" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL or code archive is provided in the paper." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available datasets: CodeSearchNet-Python and AdvTest from CodeXGLUE. The filtering criteria to extract subsets are described." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper mentions Python 3 and NVIDIA Tesla V100 GPUs but does not provide a requirements.txt, Dockerfile, or detailed dependency list." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or scripts are provided." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results are reported as single point estimates without confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims COMPCODER outperforms baselines but provides no statistical significance tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Improvements are reported with baseline context, e.g., 'improving the success rate of compilation from 44.18 to 89.18' and absolute point differences are stated." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification is given for the choice of 50k/45k/5k split for code completion or 41k/40k/1k for text-to-code." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or multiple-run results are reported. All numbers appear to be single-run." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Multiple baselines are compared: BiLSTM, Transformer, GPT-2, CodeGPT, PLBART, CodeT5 (Tables 1 and 2)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "CodeGPT, PLBART, and CodeT5 were all recent state-of-the-art models at the time of writing (2021-2022)." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Table 3 presents an ablation study examining RL, Dtrain, and Dtest components individually and in combination." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Two metrics are used: Edit Similarity (ES) and Compilation Rate (CR)." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation is conducted. All evaluation is automated via compiler checks and edit similarity." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Separate test sets are used: 5k for code completion, 1k for text-to-code generation (Section 4.1)." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Code completion results are broken down by token count (25, 30, 35, 40, 45 tokens) in Table 1 and Figure 4." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Figure 3 shows failure cases where candidates fail to compile, and Figure 5 shows case studies comparing CodeGPT failures with COMPCODER successes." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": false, 103 "justification": "No negative results or failed approaches are reported. Every experiment shows improvement." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of improving compilation rate from 44.18 to 89.18 (code completion) and 70.3 to 96.2 (text-to-code) are supported by Tables 1 and 2." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims about component contributions are supported by the ablation study (Table 3), which uses controlled single-variable manipulation." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title says 'Neural Code Generation' broadly but experiments are only on Python. No explicit statement bounding generalization to Python." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for the results. The weak constraint issue (whitespace strings compiling) is noted but no broader alternatives are considered." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The specific CodeGPT checkpoint is identified via HuggingFace URL: 'microsoft/CodeGPT-small-py-adaptedGPT2' (Section 4.4)." 133 }, 134 "prompts_provided": { 135 "applies": false, 136 "answer": false, 137 "justification": "This paper fine-tunes models rather than using prompting. No prompts are used." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 4.4 reports learning rate (1.5e-5), batch size (32/16), max epochs (20), sequence lengths, beam size (5), and RL data sampling (5%)." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. This is a training pipeline, not an agent system." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 4.1 describes filtering: extracting compilable Python 3 methods with token lengths 64-96 from CodeSearchNet, and filtering AdvTest for code lengths 128-170 and text lengths >5." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "No dedicated limitations or threats-to-validity section. The conclusion briefly mentions compilability doesn't guarantee correctness." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "No explicit statement of what the results do NOT show. The paper does not bound its claims to Python or the specific datasets tested." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The underlying datasets (CodeSearchNet, AdvTest/CodeXGLUE) are publicly available, though the specific filtered subsets are not released." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4.1 describes how data was collected: extracting from CodeSearchNet with length filters, and from AdvTest with length and semantic filters." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; data comes from standard benchmarks." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.1 documents the pipeline: source dataset → language/version filter → length filter → compilability filter → train/test split, with counts at each stage." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgements section lists NSFC grants No. 61972290, 62102157, and 61962061." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed, including Huawei Noah's Ark Lab and Huawei Technologies Co., Ltd." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funding is from NSFC (Chinese national science foundation), which has no stake in the specific outcomes of this code generation research." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present. Several authors are from Huawei, which has commercial interest in code generation, but this is not explicitly acknowledged as a conflict." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The CodeGPT model's training data cutoff is not stated. The paper uses pre-trained CodeGPT on benchmarks without discussing when its training data was collected." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether CodeSearchNet or AdvTest data appeared in CodeGPT's pre-training corpus." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "CodeSearchNet was published in 2019 and CodeGPT was pre-trained on code corpora that likely included it. This contamination risk is not addressed." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost or latency is reported despite the multi-stage pipeline involving beam search and discriminator evaluation." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "Hardware is mentioned (2 NVIDIA Tesla V100 32GB) but total training time and compute budget are not stated." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "COMPCODER improves average compilation rate from 44.18 to 89.18 in code completion compared to CodeGPT", 286 "evidence": "Table 1 and Figure 4 show compilation rates across different token completion lengths (25-45 tokens). Average improvement is ~45 points.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "COMPCODER improves compilation rate from 70.3 to 96.2 in text-to-code generation compared to CodeGPT", 291 "evidence": "Table 2 shows CR of 96.2 for COMPCODER vs 70.3 for CodeGPT on the AdvTest dataset.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "COMPCODER does not sacrifice code fluency (Edit Similarity) while improving compilation rate", 296 "evidence": "Tables 1 and 2 show ES scores are comparable or slightly better (64.53 vs 64.47 in completion, 62.74 vs 61.82 in text-to-code).", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "All three components (RL, Dtrain, Dtest) contribute to compilation rate improvement", 301 "evidence": "Table 3 ablation study shows incremental improvements: CodeGPT 46.84 → +Dtrain 64.88 → +RL 76.48 → +RL+Dtrain 83.14 → full model 94.48.", 302 "supported": "strong" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval"], 306 "key_findings": "COMPCODER, a three-stage pipeline using compiler feedback (fine-tuning, RL-based compilability reinforcement, and compilability discrimination), dramatically improves compilation rates of generated code. On Python code completion, it raises compilation rate from 44.18% to 89.18% vs CodeGPT, and on text-to-code generation from 70.3% to 96.2%, without degrading edit similarity. An ablation study confirms all three components contribute, with the discriminator at inference time providing the largest single boost.", 307 "red_flags": [ 308 { 309 "flag": "No variance or multiple runs", 310 "detail": "All results appear to be single-run with no error bars, standard deviations, or confidence intervals, making it impossible to assess result stability." 311 }, 312 { 313 "flag": "No limitations section", 314 "detail": "The paper has no dedicated limitations or threats-to-validity section. Only a brief mention in the conclusion that compilability does not equal correctness." 315 }, 316 { 317 "flag": "Potential contamination risk", 318 "detail": "CodeGPT was pre-trained on code corpora that likely include CodeSearchNet data. The paper does not discuss whether training/test overlap could inflate results." 319 }, 320 { 321 "flag": "Python-only evaluation with broad claims", 322 "detail": "The title and framing suggest general 'neural code generation' but all experiments are on Python only. No discussion of generalizability to other languages." 323 } 324 ], 325 "cited_papers": [ 326 { 327 "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation", 328 "authors": ["Shuai Lu", "Daya Guo", "Shuo Ren"], 329 "year": 2021, 330 "relevance": "Major code generation benchmark used in this paper's evaluation, widely used in LLM code generation research." 331 }, 332 { 333 "title": "Unified Pre-training for Program Understanding and Generation", 334 "authors": ["Wasi Uddin Ahmad", "Saikat Chakraborty", "Baishakhi Ray", "Kai-Wei Chang"], 335 "year": 2021, 336 "relevance": "PLBART, a pre-trained code model used as a baseline, relevant to understanding code generation model evolution." 337 }, 338 { 339 "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation", 340 "authors": ["Yue Wang"], 341 "year": 2021, 342 "relevance": "CodeT5 baseline model, influential in pre-trained code generation models." 343 }, 344 { 345 "title": "Sequencer: Sequence-to-Sequence Learning for End-to-End Program Repair", 346 "authors": ["Zimin Chen", "Steve Kommrusch", "Michele Tufano"], 347 "year": 2021, 348 "relevance": "Program repair using deep learning, relevant to automated code generation and repair quality." 349 }, 350 { 351 "title": "Energy-based Models for Code Generation under Compilability Constraints", 352 "authors": ["Tomasz Korbak", "Hady ElSahar", "Marc Dymetman", "German Kruszewski"], 353 "year": 2021, 354 "arxiv_id": "2106.04985", 355 "relevance": "Directly related work on compilability-constrained code generation using energy-based models." 356 }, 357 { 358 "title": "SPoC: Search-based Pseudocode to Code", 359 "authors": ["Sumith Kulal", "Panupong Pasupat", "Kartik Chandra"], 360 "year": 2019, 361 "relevance": "Introduced compilation rate as an evaluation metric for code generation." 362 }, 363 { 364 "title": "Retrieval Augmented Code Generation and Summarization", 365 "authors": ["Md. Rizwan Parvez", "Wasi Uddin Ahmad"], 366 "year": 2021, 367 "relevance": "Retrieval-augmented approach to code generation, relevant to AI-assisted programming." 368 }, 369 { 370 "title": "CodeSearchNet Challenge: Evaluating the State of Semantic Code Search", 371 "authors": ["Hamel Husain", "Ho-Hsiang Wu"], 372 "year": 2019, 373 "relevance": "Source dataset used in this paper's code completion experiments, widely used code benchmark." 374 } 375 ] 376 }