scan.json (25920B)
1 { 2 "paper": { 3 "title": "CCTEST: Testing and Repairing Code Completion Systems", 4 "authors": [ 5 "Zongjie Li", 6 "Chaozheng Wang", 7 "Zhibo Liu", 8 "Haoxuan Wang", 9 "Dong Chen", 10 "Shuai Wang", 11 "Cuiyun Gao" 12 ], 13 "year": 2022, 14 "venue": "ICSE 2023", 15 "arxiv_id": "2208.08289", 16 "doi": "10.1109/ICSE48619.2023.00110" 17 }, 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper states 'We have released CCTEST to facilitate further research' and provides a URL: https://sites.google.com/view/cctest-info (reference [12])." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The evaluation uses two publicly available datasets: LeetCode solutions from a public GitHub repository (reference [6]) and the CodeSearchNet test split (reference [39]). The artifact page also references released data." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions 'Intel Xeon Platinum 8276 CPU, 256 GB memory, and 4 NVIDIA A100 GPUs' and that CCTEST is 'implemented in Python, with about 5k LOC' using tree-sitter for parsing, but no requirements.txt, Dockerfile, or detailed library versions are provided." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are included in the paper. The artifact site is referenced but the paper itself does not contain a 'Reproducing Results' section or specific commands to run." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "All results in Tables V, VI, VIII, and IX are reported as point estimates without confidence intervals or error bars." 
46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims CCTEST improves accuracy by 40% and 67% and compares systems, but no statistical significance tests (p-values, t-tests, etc.) are reported for any comparison." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "The paper reports percentage improvements with baseline context, e.g., 'the accuracy of code completion systems is notably increased by 40.25% and 67.43% with respect to BLEU score and Levenshtein edit similarity' (Table VIII provides per-system improvement ratios). The human study reports the average score change from 2.3188 to 3.1565 (36.12% gain)." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification is given for the choice of 2,910 seed programs, 4,000 manually inspected samples, 60 samples for human evaluation, or 12 human evaluators. No power analysis is discussed." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Results are presented as aggregate counts and percentages. No standard deviations, variance, or spread measures across seeds, models, or runs are reported in any table." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The paper compares the original (unenhanced) code completion outputs as the baseline against CCTEST-enhanced outputs. Table VIII reports improvement ratios relative to the baseline performance for all 8 systems." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "The 8 code completion systems tested (Copilot, CodeParrot, GPT-Neo, GPT-J, CodeGen) were contemporary and actively used at the time of publication (2022-2023). 
The paper notes these 'represent the best systems available to the public.'" 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Table VII and Table IX provide per-PSC-transformation breakdowns showing the contribution of each of the 9 mutation schemes to both outlier detection (RQ2) and enhancement (RQ3). This functions as an ablation showing which components contribute to the results." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "The paper uses multiple metrics: precision, recall, F1 for outlier detection (Table VI); BLEU score and Levenshtein edit similarity for enhancement (Table VIII); and a 1-5 human rating scale for the human study." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": true, 92 "justification": "Section V-C includes a human study with 12 experts (4 industrial developers and 8 academic researchers) who rated 60 samples on a 1-5 scale, with sanity checks and Fleiss' Kappa reported (0.94)." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "The CodeSearchNet test split is explicitly used for evaluation (Sec. IV). LeetCode programs are separate from any training data. No model tuning is done on the evaluation data since CCTEST is a black-box testing framework with no training phase." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down per-system (8 systems) and per-dataset (LeetCode vs CodeSearchNet) in Tables V and VIII. Per-PSC-transformation breakdowns are in Tables VII and IX." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section V-B discusses false positives in detail: 59.46% are from highly robust completion outputs, 26.13% from vague prompts with no mainstream results. False negatives are also measured and reported." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports that IRR and RTF mutations have limited application scope and produce fewer outliers (Table VII). It also reports that Copilot has the most 'no response' failures (41 of 58). The discussion of false positives and false negatives constitutes negative findings." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims '33,540 inputs that can trigger erroneous cases (with a true positive rate of 86%)' which matches Table V (T=9: 5912+27628=33540) and Table VI (precision 0.861). The '40% and 67%' improvement claims match Table VIII averages (40.20% BLEU, ~67% edit sim)." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper makes causal claims that PSC mutations 'trigger' erroneous outputs. The experimental design supports this: each mutation is a controlled single-variable manipulation of the prompt, and the consistency testing oracle cross-compares outputs from structure-consistent variants. This constitutes adequate controlled manipulation for the causal claims made." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper explicitly states CCTEST 'currently focuses on mutating Python code' (Sec. IV), acknowledges the scope is limited to Python, and notes 'we leave supporting other programming languages as one future work' (Sec. III). The discussion section addresses migration to other languages." 
130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": true, 134 "justification": "Section VI discusses multiple alternative explanations: that mutations may not be 'natural-looking', that the consistency oracle cannot catch aligned-yet-erroneous patterns, and that cross-system differential testing would be an alternative. The paper discusses why syntactic comparison and functionality checking are less appropriate oracles." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Table III lists model names and parameter counts (e.g., 'CodeParrot-small 110M', 'GPT-Neo-125M', 'GPT-J 6B', 'Codegen-2B') but no specific version numbers, snapshot dates, or commit hashes. Copilot is marked with '?' for all details. Models are stated to be from Hugging Face but exact model card versions are not given." 142 }, 143 "prompts_provided": { 144 "applies": false, 145 "answer": false, 146 "justification": "CCTEST does not use prompting in the LLM prompting sense (system prompts, instructions). It feeds code snippets directly to code completion systems as inputs. The actual code inputs are generated programmatically from LeetCode/CodeSearchNet and the mutation algorithms are fully described." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "The paper states 'we follow Copilot's interface to only choose the completion result with the highest confidence score' and 'For other tested models, we disable their sampling strategy' (Sec. IV). The threshold T is discussed with empirical evaluation across T=1,3,5,7,9. Token length bounds (32-2048) are specified." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "CCTEST is not an agentic system with scaffolding. It is a testing framework that applies mutations and collects outputs from code completion APIs." 
157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section IV documents the preprocessing pipeline: programs are selected with token length between 32 and 2048, for CodeSearchNet only test split is used and duplicates by 'path' attribute are removed, programs are parsed with tree-sitter, and sanity checks determine which PSC transformations are feasible. Table II provides full statistics." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section VI begins with 'Limitations and Threats to Validity' and provides substantive discussion covering construct validity, external validity, and multiple specific limitations." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The paper discusses specific threats: (1) the consistency oracle cannot guarantee correctness, only detect inconsistencies; (2) defects can be missed when all outputs share aligned-yet-erroneous patterns; (3) mutations may not be natural-looking (e.g., LocalVar1); (4) the framework is tested only on Python. These are specific to this study." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section III explicitly states 'We primarily target the erroneous code completion outputs, denoting stealthy logic bugs' and that 'CCTEST cannot guarantee that the repaired outputs become semantically correct.' The study scope is limited to Python, and migration to other languages is left as future work." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The artifact is released at the referenced URL (reference [12]), and the seed datasets (LeetCode solutions, CodeSearchNet) are publicly available. This allows independent verification of the generated test inputs and outputs." 
186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section IV describes data collection in detail: 613 LeetCode programs from a specific GitHub repository (reference [6]), 2,297 CodeSearchNet programs from the test split, with token length filtering (32-2048) and deduplication by path attribute." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": false, 195 "justification": "For the human evaluation, the paper states 'We invite twelve experts, including four industrial developers and eight academy researchers with expertise in LLMs' but does not describe how these specific participants were recruited, whether they were convenience-sampled, or whether the recruitment could introduce bias." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is documented: seed programs collected and filtered by token length, parsed with tree-sitter, sanity checks for PSC applicability, mutation to generate 19,898 variants from 2,910 seeds, code completion output collection, then outlier detection and enhancement. Table II provides statistics at each stage." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "The Acknowledgement section states 'HKUST authors are supported in part by a RGC ECS grant under the contract 26206520.'" 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: The Hong Kong University of Science and Technology, Harbin Institute of Technology (Shenzhen), and Swiss Federal Institute of Technology Lausanne. None of the authors are affiliated with the companies whose products are tested (GitHub/Microsoft, Salesforce, EleutherAI)." 
213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": true, 217 "justification": "The funder is the Hong Kong Research Grants Council (RGC), a government research funding agency with no commercial interest in the performance of code completion systems." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper does not state training data cutoff dates for any of the evaluated models. It notes that the systems 'are advertised as being trained with millions or even billions of lines of code' but provides no temporal cutoff information." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "The paper uses LeetCode solutions and CodeSearchNet as test inputs, both of which are publicly available and could have been in the training data of the evaluated models. No discussion of potential train/test overlap is provided." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "LeetCode solutions and CodeSearchNet were published before the models were trained. The paper does not address whether these test programs appeared in the training data, which could affect the interpretation of 'consistency' since the models may have memorized these specific programs." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": true, 245 "answer": false, 246 "justification": "The human evaluation study with 12 experts is not pre-registered. No link to any pre-registration is provided." 
247 }, 248 "irb_or_ethics_approval": { 249 "applies": true, 250 "answer": false, 251 "justification": "The paper involves 12 human participants in an evaluation study but makes no mention of IRB or ethics board approval." 252 }, 253 "demographics_reported": { 254 "applies": true, 255 "answer": false, 256 "justification": "Participants are described only as 'four industrial developers and eight academy researchers with expertise in LLMs.' No further demographics (experience level, years of experience, geographic distribution, etc.) are reported." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": true, 260 "answer": false, 261 "justification": "No inclusion or exclusion criteria for participants are stated beyond 'expertise in LLMs.' It is unclear what qualified someone as having this expertise." 262 }, 263 "randomization_described": { 264 "applies": true, 265 "answer": true, 266 "justification": "The paper states '60 samples' were 'randomly select[ed]', 'five sanity-check test items randomly [inserted] into the questionnaire', and 'We evenly assigned 30 real samples with five SC to each participant, and ensured that each selected sample was examined by six participants.'" 267 }, 268 "blinding_described": { 269 "applies": true, 270 "answer": true, 271 "justification": "The paper states 'We provide two completion outputs for each seed program without specifying whether they are the original or the CCTEST's enhanced outputs,' indicating participants were blinded to the condition." 272 }, 273 "attrition_reported": { 274 "applies": true, 275 "answer": true, 276 "justification": "The paper states 'All participants passed the sanity check,' indicating no attrition from the initial 12 participants." 
277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "The paper reports: 'it takes 35 GPU seconds to finish completing one prompt on average,' '2.7 CPU seconds to generate nine mutated prompts,' 'about 82 GPU seconds for code completion systems to infer and obtain the results in parallel,' and 'about 1.1 CPU seconds' for outlier detection and enhancement." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "The hardware is specified: 'Intel Xeon Platinum 8276 CPU, 256 GB memory, and 4 NVIDIA A100 GPUs.' Per-prompt timing is provided, allowing total compute to be estimated from the 182,464 test inputs." 289 } 290 } 291 }, 292 "claims": [ 293 { 294 "claim": "CCTEST detected 33,540 inputs that trigger erroneous cases from eight LLM-based code completion systems with a true positive rate of 86%.", 295 "evidence": "Table V reports 5,912 + 27,628 = 33,540 outliers at T=9. Table VI shows precision of 0.861 at T=9 based on manual inspection of 800 randomly sampled positive findings.", 296 "supported": "strong" 297 }, 298 { 299 "claim": "CCTEST's enhancement improves code completion accuracy by 40.25% (BLEU) and 67.43% (edit similarity) on average.", 300 "evidence": "Table VIII reports per-system improvement ratios. The average enhancement column shows 40.20% BLEU and approximately 62-72% edit similarity improvements across 8 systems and 2 datasets. 
The abstract rounds these numbers.", 301 "supported": "moderate" 302 }, 303 { 304 "claim": "Human evaluators rated CCTEST-enhanced outputs 36.12% higher on average than original outputs.", 305 "evidence": "Section V-C reports average human scores of 2.3188 (original) vs 3.1565 (enhanced) from 12 experts on 60 samples, with Fleiss' Kappa of 0.94.", 306 "supported": "moderate" 307 }, 308 { 309 "claim": "All nine PSC transformation schemes contribute meaningfully to both outlier detection and enhancement.", 310 "evidence": "Tables VII and IX show per-transformation distributions. All schemes have non-zero contributions, though IRR and RTF have smaller scope. This is presented across both datasets.", 311 "supported": "strong" 312 }, 313 { 314 "claim": "97.8% of enhanced outputs were rated equal to or better than original outputs by human evaluators.", 315 "evidence": "Section V-C states 'Respondents believe that for 2.2% of the cases, the enhanced completion outputs look worse than the original outputs, while the remaining 97.8% treat the enhanced outputs equal to or better.'", 316 "supported": "moderate" 317 } 318 ], 319 "methodology_tags": [ 320 "benchmark-eval" 321 ], 322 "key_findings": "CCTEST introduces a metamorphic testing framework for code completion systems using program structure-consistent (PSC) mutations. Testing 8 code completion systems with 2,910 seed programs and 19,898 mutated variants (182,464 test inputs in total), it found 33,540 defect-triggering inputs with 86% precision at T=9. The enhancement strategy, which selects the output closest to the average appearance, improved BLEU scores by ~40% and edit similarity by ~67% on average.
A human evaluation with 12 experts confirmed that 97.8% of enhanced outputs were rated equal to or better than originals, with high inter-rater agreement (Fleiss' Kappa = 0.94).", 323 "red_flags": [ 324 { 325 "flag": "No contamination analysis", 326 "detail": "The paper uses LeetCode solutions and CodeSearchNet as test inputs without discussing whether these appeared in the training data of the evaluated models. Since these are widely-used public datasets and the models were trained on open-source code, memorization could affect the consistency metrics. A model might produce inconsistent outputs on memorized programs specifically because mutations break memorization patterns." 327 }, 328 { 329 "flag": "No statistical significance tests", 330 "detail": "Enhancement improvements of 40% (BLEU) and 67% (edit similarity) are reported without any significance testing. The comparison between original and enhanced outputs lacks statistical validation despite being the core claim." 331 }, 332 { 333 "flag": "Small human evaluation sample", 334 "detail": "The human evaluation uses only 60 samples (30 per participant) and 12 evaluators with no power analysis. While Fleiss' Kappa is high, the sample size is small relative to the 33,540 detected defects and 182,464 total test cases." 335 }, 336 { 337 "flag": "No exact model versions", 338 "detail": "Models are identified by name and parameter count only (e.g., 'GPT-J 6B', 'Codegen-2B-mono'). No Hugging Face model card version hashes or download dates are provided, making exact replication difficult as model weights on Hugging Face can be updated." 339 } 340 ], 341 "cited_papers": [ 342 { 343 "title": "Evaluating large language models trained on code", 344 "authors": ["Mark Chen", "Jerry Tworek"], 345 "year": 2021, 346 "arxiv_id": "2107.03374", 347 "relevance": "Introduces HumanEval benchmark for code completion evaluation and Codex model, directly relevant to evaluating LLM code generation capabilities." 
348 }, 349 { 350 "title": "A conversational paradigm for program synthesis", 351 "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi"], 352 "year": 2022, 353 "arxiv_id": "2203.13474", 354 "relevance": "Introduces CodeGen models evaluated in this paper, relevant to understanding LLM-based code completion systems." 355 }, 356 { 357 "title": "An empirical evaluation of GitHub Copilot's code suggestions", 358 "authors": ["Nhan Nguyen", "Sarah Nadi"], 359 "year": 2022, 360 "relevance": "Empirically evaluates Copilot using LeetCode programs, directly relevant to assessing code completion system quality." 361 }, 362 { 363 "title": "Competition-level code generation with AlphaCode", 364 "authors": ["Yujia Li", "David H. Choi"], 365 "year": 2022, 366 "arxiv_id": "2203.07814", 367 "relevance": "Demonstrates LLM code generation at competition level, relevant to understanding the capabilities and evaluation of code generation models." 368 }, 369 { 370 "title": "Is GitHub Copilot a substitute for human pair-programming? An empirical study", 371 "authors": ["Saki Imai"], 372 "year": 2022, 373 "relevance": "Empirical study on Copilot's effectiveness as a pair programming substitute, relevant to AI-assisted programming productivity." 374 }, 375 { 376 "title": "CodeBERT: A pre-trained model for programming and natural languages", 377 "authors": ["Zhangyin Feng", "Daya Guo"], 378 "year": 2020, 379 "relevance": "Foundational pre-trained model for code understanding, relevant to the broader landscape of neural code models." 380 }, 381 { 382 "title": "Can OpenAI Codex and other large language models help us fix security bugs?", 383 "authors": ["Hammond Pearce", "Benjamin Tan"], 384 "year": 2021, 385 "arxiv_id": "2112.02125", 386 "relevance": "Evaluates LLMs for security bug fixing, relevant to AI-assisted programming and LLM code quality assessment." 
387 }, 388 { 389 "title": "Adversarial robustness for code", 390 "authors": ["Pavol Bielik", "Martin Vechev"], 391 "year": 2020, 392 "relevance": "Addresses adversarial robustness of code models, directly relevant to testing and evaluating neural code systems." 393 }, 394 { 395 "title": "Semantic robustness of models of source code", 396 "authors": ["Jordan Henke", "Goutham Ramakrishnan"], 397 "year": 2022, 398 "relevance": "Studies semantic robustness of source code models under transformations, directly relevant to testing code intelligence systems." 399 }, 400 { 401 "title": "CodeSearchNet challenge: Evaluating the state of semantic code search", 402 "authors": ["Hamel Husain", "Ho-Hsiang Wu"], 403 "year": 2019, 404 "arxiv_id": "1909.09436", 405 "relevance": "Introduces the CodeSearchNet dataset used as a test corpus in this paper, relevant as a standard benchmark for code understanding." 406 }, 407 { 408 "title": "Automatic testing and improvement of machine translation", 409 "authors": ["Zeyu Sun", "Jie M Zhang", "Mark Harman"], 410 "year": 2020, 411 "relevance": "Applies metamorphic testing to machine translation, directly inspiring CCTEST's approach to testing code completion." 412 } 413 ] 414 }