scan.json (19844B)
1 { 2 "paper": { 3 "title": "Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair", 4 "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"], 5 "year": 2023, 6 "venue": "ESEC/FSE 2023", 7 "arxiv_id": "2309.00608", 8 "doi": "10.1145/3611643.3616271" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "GitHub link provided: https://github.com/ise-uiuc/Repilot. Also an immutable Zenodo artifact is mentioned in the Data Availability section." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "Uses publicly available Defects4J 1.2 and 2.0 benchmarks. Correct patches are released for public evaluation (reference [62], Zenodo)." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": true, 25 "justification": "Section 5.1 specifies hardware (32-Core Ryzen Threadripper PRO 3975WX, 256 GB RAM, NVIDIA RTX A6000), OS (Ubuntu 20.04.4 LTS), Java version (OpenJDK 1.8.0_181), and models from Hugging Face. Python implementation with 5K lines of code and modified Eclipse JDT LS in Java with 1.5K lines." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "While code is released, the paper itself does not include step-by-step reproduction instructions. The Zenodo artifact and GitHub repo may contain them, but the paper does not describe reproduction steps." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "Results are reported as point estimates (number of correct fixes, compilation rates). No confidence intervals or error bars are provided." 
38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper claims Repilot 'outperforms' baselines based on comparing raw numbers (e.g., 66 vs 52 fixes) without any statistical significance tests." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports improvements with baseline context: '27% and 47% more bugs' fixed, percentage point improvements in compilation rate (e.g., 43.2% to 63.4%), and absolute counts for both Repilot and baselines." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "No justification for why 138 and 135 single-hunk bugs are sufficient, or why 5000/500 samples per bug were chosen beyond 'fair comparison' and 'high cost'." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "Section 8 (Threats to Validity) acknowledges 'we only run each of our experiments once, which could introduce extra statistical biases.' No variance or standard deviation across runs is reported." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "19 APR baselines are compared: 12 traditional, 6 NMT-based, and 1 LLM-based (AlphaRepair). Listed in Section 5.3." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "Baselines include recent tools like AlphaRepair (2022), RewardRepair (2022), Recoder (2022), representing state-of-the-art at the time." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "RQ3 (Section 6.3) presents a detailed ablation study with four variants: Repilot∅ (base LLM only), Repilotp (with pruning), Repilotmp (with memorization), and full Repilot (with active completion)." 
75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "Multiple metrics used: number of correct fixes, number of plausible fixes, compilation rate (%), plausible rate (%), and generation time." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": true, 84 "justification": "Correct patches are determined by manual examination: 'we determine semantic equivalency by manually examining each plausible patch' (Section 5.4)." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "This is not a learning/training evaluation — Repilot uses pre-trained models on a fixed benchmark. No train/test split concept applies to the evaluation itself." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Results are broken down by dataset (Defects4J 1.2 vs 2.0), by model (CodeT5 vs InCoder), and by bug type (single-hunk vs single-line). Venn diagrams show unique fixes per tool." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 7 (Limitations) discusses when Repilot fails: dynamically typed languages, higher compilation rate not proportionally increasing correct fixes, and lack of evidence for rare token claims." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section 7 acknowledges that 'a significantly higher compilation rate does not necessarily translate to a proportionally large increase in plausible and correct fixes.' The ablation also shows overhead costs." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "Abstract claims '27% and 47% more bugs' are supported by Table 1 (66 vs 52 for D4J 1.2 = 27%, 50 vs 34 for D4J 2.0 = 47%). Compilation rate and validity claims are supported by Tables 2-3." 
112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "Causal claims about component contributions are supported by the ablation study (RQ3) which systematically adds components. The ablation design (controlled single-variable manipulation) is adequate." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": true, 121 "justification": "Section 7 explicitly bounds scope: 'the scope of our evaluation considering two LLMs (CodeT5 and InCoder) and one programming language (Java) is still narrow.' Also acknowledges difficulty with dynamically typed languages." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 8 discusses training data overlap as an alternative explanation (7/66 and 6/50 bugs overlap with CodeT5 training data) and provides adjusted numbers excluding overlapping bugs." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": true, 133 "justification": "Specific models named: CodeT5-large (770M params) and InCoder-6.7B, obtained from Hugging Face. These are specific model identifiers with sizes." 134 }, 135 "prompts_provided": { 136 "applies": false, 137 "answer": false, 138 "justification": "Repilot does not use prompting — it uses cloze-style infilling where buggy code is replaced with a mask token and the LLM fills it in. This is a model inference approach, not a prompting approach." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 5.1: top-p nucleus sampling with p=1.0, temperature=1.0, max_tokens=50, 5000 samples per bug (RQ1/RQ2) or 500 samples (RQ3/RQ4), 5-hour timeout per bug." 
144 }, 145 "scaffolding_described": { 146 "applies": true, 147 "answer": true, 148 "justification": "The entire paper describes the scaffolding in detail: the interaction loop between LLM and Completion Engine (Algorithms 1-3), pruning, memorization (Trie), and active completion are all formally specified." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 5.2 documents data selection: starting from Defects4J, filtering to single-hunk bugs, removing 4 deprecated bugs, removing bugs incompatible with Completion Engine, resulting in 138 and 135 bugs." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Dedicated Section 7 (Limitations) and Section 8 (Threats to Validity) are present with substantive discussion." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "Specific threats discussed: training data overlap (7/66 bugs), single-run experiments, manually examining patches for correctness, Completion Engine soundness depending on implementation, baseline results taken from prior work without reproduction." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 7 states specific boundaries: only Java evaluated, only two LLMs tested, only APR task (not other code generation tasks), difficulty with dynamically typed languages, limited evidence for rare token claim." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "Zenodo artifact [62] and GitHub repo contain the full set of correct patches. Defects4J benchmark is publicly available." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 5.2 describes the data: Defects4J is a manually curated Java dataset. 
Bug selection criteria (single-hunk, compatible with Completion Engine) are stated with counts." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants. Data comes from a standard benchmark (Defects4J)." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "Pipeline documented: start with Defects4J 1.2 (391 bugs after excluding 4 deprecated ones), filter to single-hunk bugs, and remove those incompatible with the Completion Engine, leaving 138 bugs. For D4J 2.0: 438 bugs filtered to 135 single-hunk bugs. Repair templates applied for RQ1/RQ2." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "Acknowledgments section: 'This work was partially supported by NSF grants CCF-2131943 and CCF-2141474, as well as Kwai Inc.'" 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "All authors affiliated with University of Illinois Urbana-Champaign. No evaluated product is from their employer." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "NSF is an independent funder. Kwai Inc. does not have a direct stake in APR tool outcomes. Neither funder's product is being evaluated." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests statement is present in the paper." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "No explicit training data cutoff date is stated for CodeT5 or InCoder. The paper acknowledges overlap risk but does not state when training data was collected." 
222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": true, 226 "justification": "Section 8 explicitly addresses this: '7 out of 66 and 6 out of 50 overlap with training data on Defects4J 1.2 and 2.0 respectively' and provides adjusted results excluding these." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": true, 231 "justification": "Section 8 addresses contamination: computes overlap between CodeT5 training data and Defects4J, reports adjusted results. For InCoder, acknowledges the threat cannot be fully addressed since training data is not revealed." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants in the study." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants in the study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants in the study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in the study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in the study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in the study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in the study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "Generation time per patch reported in Table 3 (e.g., 0.232s for base, 0.248s for full Repilot) and Table 4. Overhead percentages discussed (7% for CodeT5, negligible for InCoder)." 
276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": true, 280 "justification": "Hardware specified (Section 5.1): NVIDIA RTX A6000, 5-hour timeout per bug, 5000 or 500 samples per bug. Generation time per patch reported." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "Repilot fixes 27% more bugs on Defects4J 1.2 and 47% more on Defects4J 2.0 than state-of-the-art baselines.", 287 "evidence": "Table 1: Repilot fixes 66 bugs vs AlphaRepair's 52 on D4J 1.2 (27% more), and 50 vs 34 on D4J 2.0 (47% more).", 288 "supported": "strong" 289 }, 290 { 291 "claim": "Repilot achieves ~60% compilation rate at 5000 patches, far exceeding prior tools.", 292 "evidence": "Table 2: Repilot achieves 59% at Top-5000 vs AlphaRepair's 13% and CURE's 9%.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "All components (pruning, memorization, active completion) contribute positively to Repilot's effectiveness.", 297 "evidence": "Table 3 ablation: compilation rate increases from 43.2% (base) to 60.7% (pruning) to 63.4% (full), with generation time overhead reduced from 25% to 7%.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "Repilot generalizes across different LLMs (CodeT5 and InCoder) and bug datasets.", 302 "evidence": "Table 4: Repilot improves over baseline for both CodeT5 and InCoder on both D4J 1.2 and 2.0.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "Repilot incurs minimal overhead (7% for CodeT5, negligible for InCoder).", 307 "evidence": "Table 3: 0.232s vs 0.248s for CodeT5. Table 4: 1.70s vs 1.70s for InCoder.", 308 "supported": "strong" 309 } 310 ], 311 "methodology_tags": ["benchmark-eval"], 312 "key_findings": "Repilot fuses LLMs with semantics-based Completion Engines to guide autoregressive code generation during automated program repair. On Defects4J 1.2 and 2.0, Repilot fixes 66 and 50 single-hunk/single-line bugs respectively, outperforming 19 baseline APR tools. 
Completion Engine guidance raises compilation rates from ~43% (base LLM) to ~63% (full Repilot; pruning alone reaches ~61%) with only 7% time overhead for CodeT5, and negligible overhead for the larger InCoder model. The approach generalizes across two LLM architectures (encoder-decoder and decoder-only) and two benchmark versions.", 313 "red_flags": [ 314 { 315 "flag": "Single-run experiments", 316 "detail": "All experiments were run only once, as acknowledged in Section 8. With sampling-based generation (5000 or 500 samples), results could vary across runs, but no variance is reported." 317 }, 318 { 319 "flag": "Baseline results not reproduced", 320 "detail": "Section 8 states: 'we follow the convention used in prior work to directly report the bug fix results without reproducing them.' This means baseline comparisons rely on numbers from other papers under potentially different conditions." 321 }, 322 { 323 "flag": "Limited language coverage", 324 "detail": "Only Java is evaluated despite claims of generalizability. The paper acknowledges this but the title and framing suggest broader applicability." 325 } 326 ], 327 "cited_papers": [ 328 { 329 "title": "Evaluating Large Language Models Trained on Code", 330 "authors": ["Mark Chen"], 331 "year": 2021, 332 "arxiv_id": "2107.03374", 333 "relevance": "Foundational Codex paper establishing LLM code generation evaluation methodology." 334 }, 335 { 336 "title": "AlphaRepair: Zero-shot cloze-style APR using LLMs", 337 "authors": ["Chunqiu Steven Xia", "Lingming Zhang"], 338 "year": 2022, 339 "relevance": "Direct predecessor and primary LLM baseline for automated program repair." 340 }, 341 { 342 "title": "Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation", 343 "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"], 344 "year": 2023, 345 "arxiv_id": "2305.01210", 346 "relevance": "Evaluation methodology for LLM-generated code correctness." 
347 }, 348 { 349 "title": "InCoder: A Generative Model for Code Infilling and Synthesis", 350 "authors": ["Daniel Fried"], 351 "year": 2023, 352 "relevance": "One of two LLMs used in Repilot's evaluation; decoder-only code infilling model." 353 }, 354 { 355 "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair", 356 "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"], 357 "year": 2021, 358 "relevance": "Prior APR work that also uses static analysis to constrain code generation, key baseline." 359 }, 360 { 361 "title": "Patch Generation with Language Models: Feasibility and Scaling Behavior", 362 "authors": ["Sophia D Kolak", "Ruben Martins", "Claire Le Goues", "Vincent Josua Hellendoorn"], 363 "year": 2022, 364 "relevance": "Studies feasibility and scaling of LLMs for patch generation in APR." 365 }, 366 { 367 "title": "Competition-level code generation with AlphaCode", 368 "authors": ["Yujia Li"], 369 "year": 2022, 370 "doi": "10.1126/science.abq1158", 371 "relevance": "Large-scale LLM code generation with sampling-based evaluation methodology." 372 }, 373 { 374 "title": "ChatRepair: Conversational APR approach", 375 "authors": ["Chunqiu Steven Xia"], 376 "year": 2023, 377 "relevance": "Dialogue-based LLM approach for APR using iterative feedback from test failures." 378 }, 379 { 380 "title": "Recoder: Edit-based NMT model for APR", 381 "authors": ["Shangwen Wang"], 382 "year": 2022, 383 "relevance": "NMT-based APR that enforces syntax correctness via edit-based model and placeholder tokens." 384 }, 385 { 386 "title": "RewardRepair: Penalizing uncompilable patches during training for APR", 387 "authors": ["Hao Ye"], 388 "year": 2022, 389 "relevance": "APR approach that boosts compilation rate through training-time penalties, key baseline for compilation rate comparison." 390 } 391 ] 392 }