scan.json (30143B)
1 { 2 "paper": { 3 "title": "GAMMA: Revisiting Template-based Automated Program Repair via Mask Prediction", 4 "authors": [ 5 "Quanjun Zhang", 6 "Chunrong Fang", 7 "Tongke Zhang", 8 "Bowen Yu", 9 "Weisong Sun", 10 "Zhenyu Chen" 11 ], 12 "year": 2023, 13 "venue": "International Conference on Automated Software Engineering (ASE)", 14 "arxiv_id": "2309.09308", 15 "doi": "10.1109/ASE56229.2023.00063" 16 }, 17 "scan_version": 3, 18 "active_modules": ["experimental_rigor", "data_leakage"], 19 "methodology_tags": ["benchmark-eval"], 20 "key_findings": "GAMMA combines template-based APR fix patterns with pre-trained language model mask prediction to generate donor code, fixing 82 bugs on Defects4J-v1.2 (81.19% precision), outperforming all compared APR techniques. The approach generalizes to Defects4J-v2.0 (45 bugs) and QuixBugs (22 bugs), and is scalable across different pre-trained models (UniXcoder: 82, CodeBERT: 80, ChatGPT: 67). Data leakage investigation found only 3 overlapping bugs in the pre-training data, with GAMMA still outperforming baselines when these are excluded.", 21 "checklist": { 22 "artifacts": { 23 "code_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The authors state 'we release the relevant materials (including source code, experimental results, and correct patches) in our experiment for replication and future research' with a GitHub link (reference [29]: https://github.com/iSEngLab/GAMMA)." 27 }, 28 "data_released": { 29 "applies": true, 30 "answer": true, 31 "justification": "All benchmarks used (Defects4J-v1.2, Defects4J-v2.0, QuixBugs) are publicly available standard benchmarks. No proprietary data was collected." 32 }, 33 "environment_specified": { 34 "applies": true, 35 "answer": false, 36 "justification": "The paper mentions 'Ubuntu 18.04.3 server with two Tesla V100-SXM2 GPUs' and 'unixcoder-base' model, but provides no software dependency versions (Python, PyTorch, etc.), requirements.txt, or Dockerfile." 37 }, 38 "reproduction_instructions": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper describes the approach at a conceptual level but does not include step-by-step reproduction instructions, specific commands to run, or a detailed README. A GitHub repository is referenced but the paper itself lacks reproduction guidance." 42 } 43 }, 44 "statistical_methodology": { 45 "confidence_intervals_or_error_bars": { 46 "applies": true, 47 "answer": false, 48 "justification": "All results are reported as point estimates (e.g., '82 bugs', '45 bugs') with no confidence intervals, error bars, or uncertainty measures." 49 }, 50 "significance_tests": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper claims GAMMA 'substantially outperforms' baselines based solely on comparing raw bug counts (e.g., 82 vs 68 for TBar). No statistical significance tests are applied to any comparison." 54 }, 55 "effect_sizes_reported": { 56 "applies": true, 57 "answer": true, 58 "justification": "The paper reports percentage improvements with baseline context: '20.59% (14 bugs) and 26.15% (17 bugs) improvement over TBar and Recoder' and precision rates '81.19% (82/101) for plausible patches, 9.61% higher than TBar (68/95)'." 59 }, 60 "sample_size_justified": { 61 "applies": true, 62 "answer": false, 63 "justification": "No justification is given for the benchmark sizes (395 bugs in Defects4J-v1.2, 257 in v2.0, 40 in QuixBugs). No power analysis or discussion of whether these sample sizes are sufficient for the claims made." 64 }, 65 "variance_reported": { 66 "applies": true, 67 "answer": false, 68 "justification": "Results are single-run numbers with no standard deviation, variance, or spread measures reported across experimental runs." 69 } 70 }, 71 "evaluation_design": { 72 "baselines_included": { 73 "applies": true, 74 "answer": true, 75 "justification": "GAMMA is compared against 9 APR tools: SequenceR, CoCoNuT, CURE, DLFix, Recoder, AlphaRepair, CIRCLE, PraPR, and TBar (Table I)." 76 }, 77 "baselines_contemporary": { 78 "applies": true, 79 "answer": true, 80 "justification": "Baselines include recent state-of-the-art tools: AlphaRepair (ESEC/FSE 2022), CIRCLE (ISSTA 2022), Recoder (ESEC/FSE 2021), all published within 1-2 years of this work." 81 }, 82 "ablation_study": { 83 "applies": true, 84 "answer": false, 85 "justification": "GAMMA has multiple components (fix template transformation, context construction with comment line, mask prediction model) but no systematic ablation study isolates their individual contributions. RQ3 varies the pre-trained model but does not ablate other components." 86 }, 87 "multiple_metrics": { 88 "applies": true, 89 "answer": true, 90 "justification": "Three metrics are reported: number of plausible patches, number of correct patches, and precision (correct/plausible ratio). Table I shows both plausible and correct counts (e.g., '82/101')." 91 }, 92 "human_evaluation": { 93 "applies": true, 94 "answer": true, 95 "justification": "Three authors manually verify all plausible patches: 'A plausible patch is considered to be correct if all three authors identify it as equivalent to a ground truth patch semantically' (Section VI)." 96 }, 97 "held_out_test_set": { 98 "applies": true, 99 "answer": true, 100 "justification": "GAMMA uses UniXcoder in a zero-shot setting with no fine-tuning on any bug-fixing data. Defects4J and QuixBugs serve as independent test sets that were never used for any model selection or tuning decisions." 101 }, 102 "per_category_breakdown": { 103 "applies": true, 104 "answer": true, 105 "justification": "Table I provides per-project breakdowns across all 6 Defects4J-v1.2 projects (Chart, Closure, Lang, Math, Mockito, Time) for all compared techniques." 106 }, 107 "failure_cases_discussed": { 108 "applies": true, 109 "answer": false, 110 "justification": "The paper shows case studies where GAMMA succeeds but TBar fails (Listings 3, 4). However, no specific examples or analysis of where GAMMA itself fails are provided. Template coverage limitations are mentioned only in passing." 111 }, 112 "negative_results_reported": { 113 "applies": true, 114 "answer": true, 115 "justification": "ChatGPT-based GAMMA fixes only 67 bugs compared to UniXcoder's 82 (Section V-C). GAMMA also underperforms AlphaRepair on QuixBugs (22 vs 28, Table II). Template limitations for new benchmarks are discussed." 116 } 117 }, 118 "claims_and_evidence": { 119 "abstract_claims_supported": { 120 "applies": true, 121 "answer": true, 122 "justification": "All abstract claims are supported: 82 bugs on Defects4J-v1.2 (Table I), 20.59% and 26.15% improvements over TBar and Recoder (Table I), 45 bugs on Defects4J-v2.0 and 22 on QuixBugs (Table II), CodeBERT-based (80) and ChatGPT-based (67) variants (Section V-C)." 123 }, 124 "causal_claims_justified": { 125 "applies": true, 126 "answer": true, 127 "justification": "The main causal claim is that mask prediction improves donor code generation over local file search. The comparison with TBar (same templates, different donor code strategy) provides a controlled single-variable manipulation. RQ3 further validates by varying only the pre-trained model component." 128 }, 129 "generalization_bounded": { 130 "applies": true, 131 "answer": false, 132 "justification": "The title 'Automated Program Repair via Mask Prediction' makes no language restriction, but all evaluation is on Java programs only. The abstract and contributions section make broad claims about APR without bounding to Java or to the specific template types used." 133 }, 134 "alternative_explanations_discussed": { 135 "applies": true, 136 "answer": true, 137 "justification": "Section VI (Threats to Validity) discusses three specific alternative explanations: manual inspection bias, fault localization settings biasing results, and potential data leakage in pre-trained models affecting observed performance." 138 }, 139 "proxy_outcome_distinction": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper measures number of correctly-fixed bugs verified by manual inspection against developer patches, which is a direct measure of repair capability. No proxy gap exists between what is measured and what is claimed." 143 } 144 }, 145 "setup_transparency": { 146 "model_versions_specified": { 147 "applies": true, 148 "answer": true, 149 "justification": "UniXcoder is specified as 'unixcoder-base' (Section IV-E), and ChatGPT is specified as 'gpt-3.5-turbo-0301, which is the latest version available' (Section V-C). CodeBERT is referred to simply as 'CodeBERT' without a specific checkpoint identifier." 150 }, 151 "prompts_provided": { 152 "applies": true, 153 "answer": true, 154 "justification": "The ChatGPT prompt is provided verbatim: 'Next token prediction task, the first line is a comment to help prediction, just return 250 possible predictions for <mask> with highest probability:' (Section V-C). UniXcoder input construction (comment line + method with masked tokens) is described in detail (Section III-C)." 155 }, 156 "hyperparameters_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "Beam size is set to 250 (Section IV-E), mask number range 1-20 for CodeBERT (Section V-C), and 5-hour running-time limit per bug. The prediction mode (beam search for UniXcoder, sequential for CodeBERT) is described." 160 }, 161 "scaffolding_described": { 162 "applies": false, 163 "answer": false, 164 "justification": "GAMMA does not use agentic scaffolding. It is a pipeline of template selection → mask prediction → patch validation with no tool use, retry logic, or feedback mechanisms." 165 }, 166 "data_preprocessing_documented": { 167 "applies": true, 168 "answer": true, 169 "justification": "The pipeline is documented: (1) AST parsing with Eclipse JDT for template selection (Section III-B), (2) input construction with comment line + method context (Section III-C), (3) mask prediction and candidate patch generation, (4) compilation filtering and test suite validation (Section III-D)." 170 } 171 }, 172 "limitations_and_scope": { 173 "limitations_section_present": { 174 "applies": true, 175 "answer": true, 176 "justification": "Section VI 'Threats to Validity' provides substantive discussion of three specific threats: manual inspection bias, fault localization settings, and data leakage of pre-trained models." 177 }, 178 "threats_to_validity_specific": { 179 "applies": true, 180 "answer": true, 181 "justification": "Threats are specific to this study: (1) mitigating manual inspection bias with three independent reviewers, (2) acknowledging perfect fault localization may not reflect practice, (3) querying pre-training datasets to find 3 leaked bugs (Closure-73, Closure-126, Time-19) and verifying results hold without them." 182 }, 183 "scope_boundaries_stated": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper explicitly states: fix templates are 'designed for Java' (Section V-B), 'most of the templates are summarized from Defects4J-v1.2, which may mean that some templates cannot be applied to any bugs except those from Defects4J-v1.2' (Section V-B), and perfect fault localization 'may bring bias in repair performance' (Section VI)." 187 } 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": true, 193 "justification": "The authors release 'source code, experimental results, and correct patches' via GitHub (reference [29]), enabling independent verification of the main results." 194 }, 195 "data_collection_described": { 196 "applies": true, 197 "answer": true, 198 "justification": "Data sources are well-described: Defects4J-v1.2 (395 bugs from 6 open-source Java projects), Defects4J-v2.0 (257 single-location bugs from 17 projects), QuixBugs (40 Java programs). Each benchmark's composition and origin are cited." 199 }, 200 "recruitment_methods_described": { 201 "applies": false, 202 "answer": false, 203 "justification": "No human participants. The data sources are standard public benchmarks (Defects4J, QuixBugs)." 204 }, 205 "data_pipeline_documented": { 206 "applies": true, 207 "answer": true, 208 "justification": "The full pipeline is documented: buggy program → fault localization → AST-based template selection → mask token insertion → UniXcoder beam search prediction → compilation filtering → test suite validation → manual inspection. Each step is described in Sections III-A through III-D." 209 } 210 }, 211 "conflicts_of_interest": { 212 "funding_disclosed": { 213 "applies": true, 214 "answer": true, 215 "justification": "Funding is disclosed in the Acknowledgment section: 'This work is supported partially by the National Natural Science Foundation of China (61932012, 62141215).'" 216 }, 217 "affiliations_disclosed": { 218 "applies": true, 219 "answer": true, 220 "justification": "All authors are from Nanjing University's State Key Laboratory for Novel Software Technology. They are not evaluating a product from their own institution or company." 221 }, 222 "funder_independent_of_outcome": { 223 "applies": true, 224 "answer": true, 225 "justification": "The National Natural Science Foundation of China is a government funding body with no commercial interest in GAMMA's repair performance." 226 }, 227 "financial_interests_declared": { 228 "applies": true, 229 "answer": false, 230 "justification": "No competing interests statement or financial interest declaration is present in the paper." 231 } 232 }, 233 "contamination": { 234 "training_cutoff_stated": { 235 "applies": true, 236 "answer": false, 237 "justification": "No explicit training data cutoff date is stated for UniXcoder, CodeBERT, or ChatGPT. The paper mentions UniXcoder was pre-trained on CodeSearchNet data (2.3M NL-PL pairs and 4.1M unimodal code) but does not state when this data was collected." 238 }, 239 "train_test_overlap_discussed": { 240 "applies": true, 241 "answer": true, 242 "justification": "Section VI explicitly addresses overlap: 'we query the pre-training datasets including 2.3M functions paired with comments and 4.1M unimodal code from CodeSearchNet' and found 3 bugs leaked (Closure-73, Closure-126, Time-19). Two authors independently inspected, confirmed by a third." 243 }, 244 "benchmark_contamination_addressed": { 245 "applies": true, 246 "answer": true, 247 "justification": "The paper investigates contamination, finds 3 leaked bugs, shows GAMMA still works on perturbed versions of those bugs, and demonstrates that excluding them still yields better results than baselines (79 vs 68 for TBar, 79 vs 74 for AlphaRepair)." 248 } 249 }, 250 "human_studies": { 251 "pre_registered": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study. It is a benchmark evaluation of an APR tool." 255 }, 256 "irb_or_ethics_approval": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants. The study evaluates software bugs from public benchmarks." 260 }, 261 "demographics_reported": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants." 265 }, 266 "inclusion_exclusion_criteria": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants." 270 }, 271 "randomization_described": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants." 275 }, 276 "blinding_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants." 280 }, 281 "attrition_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants." 285 } 286 }, 287 "cost_and_practicality": { 288 "inference_cost_reported": { 289 "applies": true, 290 "answer": false, 291 "justification": "A 5-hour time limit per bug is mentioned as a fairness constraint, but no actual inference times, API costs, or per-bug costs are reported. The paper notes CodeBERT 'takes much more time' than UniXcoder without quantifying." 292 }, 293 "compute_budget_stated": { 294 "applies": true, 295 "answer": false, 296 "justification": "Hardware is mentioned ('one Ubuntu 18.04.3 server with two Tesla V100-SXM2 GPUs') but total GPU-hours, wall-clock time for the full experiment, or total computational cost are not quantified." 297 } 298 }, 299 "experimental_rigor": { 300 "seed_sensitivity_reported": { 301 "applies": true, 302 "answer": false, 303 "justification": "No discussion of random seed sensitivity. UniXcoder beam search is deterministic, but ChatGPT results would vary across runs and this is not addressed." 304 }, 305 "number_of_runs_stated": { 306 "applies": true, 307 "answer": false, 308 "justification": "The number of experimental runs is not stated. It appears results are from a single run for each configuration." 309 }, 310 "hyperparameter_search_budget": { 311 "applies": true, 312 "answer": false, 313 "justification": "Beam size is set to 250 'due to the limitation of our device' without exploring other values. No hyperparameter search was performed." 314 }, 315 "best_config_selection_justified": { 316 "applies": true, 317 "answer": true, 318 "justification": "The paper uses a single configuration with beam size 250 justified by device limitations. No configuration selection from multiple candidates occurs, so cherry-picking risk is minimal." 319 }, 320 "multiple_comparison_correction": { 321 "applies": true, 322 "answer": false, 323 "justification": "GAMMA is compared against 9 baselines across multiple benchmarks without any statistical tests, let alone multiple comparison corrections." 324 }, 325 "self_comparison_bias_addressed": { 326 "applies": true, 327 "answer": false, 328 "justification": "The authors reuse baseline results from prior work [18] rather than re-implementing baselines (which partially mitigates Lucic et al.'s concern), but they do not explicitly acknowledge or discuss self-comparison bias." 329 }, 330 "compute_budget_vs_performance": { 331 "applies": true, 332 "answer": false, 333 "justification": "No performance-vs-compute analysis. The paper notes beam size 250 is smaller than CURE's and CoCoNuT's 1000, but does not systematically analyze the effect of compute budget on performance." 334 }, 335 "benchmark_construct_validity": { 336 "applies": true, 337 "answer": false, 338 "justification": "The paper uses Defects4J without questioning whether bug count is a valid measure of repair capability or whether the benchmark is representative of real-world bug distributions." 339 }, 340 "scaffold_confound_addressed": { 341 "applies": false, 342 "answer": false, 343 "justification": "No agentic scaffolding is involved. GAMMA is a deterministic pipeline (template selection → mask prediction → validation) without scaffolding or tool use." 344 } 345 }, 346 "data_leakage": { 347 "temporal_leakage_addressed": { 348 "applies": true, 349 "answer": true, 350 "justification": "The authors directly queried UniXcoder's pre-training datasets (CodeSearchNet) for overlap with Defects4J bugs, which is a stronger check than temporal analysis alone. They identified 3 overlapping bugs and verified results hold without them." 351 }, 352 "feature_leakage_addressed": { 353 "applies": true, 354 "answer": true, 355 "justification": "Section VI acknowledges that perfect fault localization provides information not available in practice ('the perfect fault localization results are usually unavailable in real practice'), which is a form of feature leakage in the evaluation setup." 356 }, 357 "non_independence_addressed": { 358 "applies": true, 359 "answer": false, 360 "justification": "No discussion of whether Defects4J bugs share structural similarities, come from the same code patterns, or are otherwise non-independent. Multiple bugs from the same project (e.g., 24 Closure bugs) could share code structure." 361 }, 362 "leakage_detection_method": { 363 "applies": true, 364 "answer": true, 365 "justification": "The authors queried UniXcoder's pre-training datasets (2.3M NL-PL pairs, 4.1M unimodal code from CodeSearchNet) for overlap. Two authors independently performed manual inspection, confirmed by a third. They also tested GAMMA on perturbed versions of the 3 leaked bugs." 366 } 367 } 368 }, 369 "claims": [ 370 { 371 "claim": "GAMMA correctly repairs 82 bugs on Defects4J-v1.2, outperforming all compared state-of-the-art APR techniques.", 372 "evidence": "Table I shows GAMMA fixes 82/101 (correct/plausible), compared to TBar (68/95), Recoder (65/112), AlphaRepair (74/109), and others. Per-project breakdowns are provided.", 373 "supported": "strong" 374 }, 375 { 376 "claim": "GAMMA achieves a correct rate of 81.19% for plausible patches, higher than all baselines.", 377 "evidence": "Computed from Table I: 82/101 = 81.19%. TBar is 68/95 = 71.58%, Recoder is 65/112 = 58.04%, AlphaRepair is 74/109 = 67.89%.", 378 "supported": "strong" 379 }, 380 { 381 "claim": "GAMMA fixes 14 unique bugs that no other compared APR approach can fix.", 382 "evidence": "Figure 2 shows the overlap Venn diagram among GAMMA, AlphaRepair, TBar, Recoder, and CURE, with 14 bugs unique to GAMMA.", 383 "supported": "moderate" 384 }, 385 { 386 "claim": "GAMMA generalizes to additional benchmarks, fixing 45 bugs on Defects4J-v2.0 and 22 on QuixBugs.", 387 "evidence": "Table II shows GAMMA at 45 on Defects4J-v2.0 (vs AlphaRepair 36, Recoder 11, TBar 8) and 22 on QuixBugs (vs AlphaRepair 28, CURE 26). GAMMA underperforms AlphaRepair on QuixBugs.", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "GAMMA is scalable to different pre-trained models: CodeBERT fixes 80 bugs, ChatGPT fixes 67 bugs on Defects4J-v1.2.", 392 "evidence": "Section V-C and Figure 3 report results. The combination of all three models fixes 93 bugs total. ChatGPT underperforms UniXcoder and CodeBERT.", 393 "supported": "strong" 394 }, 395 { 396 "claim": "Data leakage is minimal and does not affect the conclusion: only 3 bugs overlap with UniXcoder's pre-training data.", 397 "evidence": "Section VI describes querying CodeSearchNet pre-training data, finding 3 overlapping bugs (Closure-73, Closure-126, Time-19). Excluding these, GAMMA still fixes 79 bugs vs 68 for TBar and 74 for AlphaRepair. Perturbed versions of leaked bugs are still fixable.", 398 "supported": "moderate" 399 } 400 ], 401 "red_flags": [ 402 { 403 "flag": "No statistical tests for comparative claims", 404 "detail": "All claims of outperformance are based on comparing raw bug counts without any statistical significance tests, confidence intervals, or multiple-run variance. With small differences between some tools (e.g., 82 vs 80 for CodeBERT variant), the significance of improvements is unclear." 405 }, 406 { 407 "flag": "Perfect fault localization only", 408 "detail": "All experiments use perfect (ground-truth) fault localization, which is not available in practice. The authors acknowledge this but do not evaluate with realistic fault localization tools, limiting practical applicability claims." 409 }, 410 { 411 "flag": "No ablation study", 412 "detail": "GAMMA has multiple novel components (template transformation to mask patterns, comment-line context addition, pre-trained model choice) but no systematic ablation isolates their individual contributions. RQ3 varies only the model, not other design decisions." 413 }, 414 { 415 "flag": "Baseline results reused from prior work", 416 "detail": "Table I footnote states results are reused from AlphaRepair [18] rather than independently reproduced. The paper acknowledges 'the results of some APR tools may be different from the reported results in their published papers' due to community re-validation, introducing potential inconsistency." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "TBar: Revisiting Template-based Automated Program Repair", 422 "authors": ["K. Liu", "A. Koyuncu", "D. Kim", "T. F. Bissyandé"], 423 "year": 2019, 424 "relevance": "State-of-the-art template-based APR tool whose fix patterns GAMMA transforms into mask patterns; primary baseline." 425 }, 426 { 427 "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-shot Learning", 428 "authors": ["C. S. Xia", "L. Zhang"], 429 "year": 2022, 430 "relevance": "AlphaRepair: cloze-style APR using CodeBERT without fine-tuning, closely related zero-shot approach and key baseline." 431 }, 432 { 433 "title": "A Syntax-guided Edit Decoder for Neural Program Repair", 434 "authors": ["Q. Zhu", "Z. Sun", "Y.-a. Xiao", "W. Zhang", "K. Yuan", "Y. Xiong", "L. Zhang"], 435 "year": 2021, 436 "relevance": "Recoder: syntax-guided learning-based APR tool, key baseline comparison." 437 }, 438 { 439 "title": "CURE: Code-aware Neural Machine Translation for Automatic Program Repair", 440 "authors": ["N. Jiang", "T. Lutellier", "L. Tan"], 441 "year": 2021, 442 "relevance": "NMT-based APR with code-aware search and pre-trained language model, key baseline." 443 }, 444 { 445 "title": "UniXcoder: Unified Cross-Modal Pre-Training for Code Representation", 446 "authors": ["D. Guo", "S. Lu", "N. Duan", "Y. Wang", "M. Zhou", "J. Yin"], 447 "year": 2022, 448 "relevance": "The primary pre-trained model used by GAMMA for mask prediction, with MLM objective for code understanding and generation." 449 }, 450 { 451 "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", 452 "authors": ["Z. Feng", "D. Guo", "D. Tang", "N. Duan", "X. Feng"], 453 "year": 2020, 454 "relevance": "Pre-trained code model used as alternative backbone in GAMMA's scalability experiments." 455 }, 456 { 457 "title": "CIRCLE: Continual Repair Across Programming Languages", 458 "authors": ["W. Yuan", "Q. Zhang", "T. He", "C. Fang", "N. Q. V. Hung", "X. Hao", "H. Yin"], 459 "year": 2022, 460 "relevance": "T5-based continual learning APR framework evaluated as a baseline; co-authored by GAMMA authors." 461 }, 462 { 463 "title": "Impact of Code Language Models on Automated Program Repair", 464 "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"], 465 "year": 2023, 466 "relevance": "Explores pre-trained models for APR with and without fine-tuning, directly related to GAMMA's investigation of LLMs for program repair." 467 }, 468 { 469 "title": "Automated Program Repair in the Era of Large Pre-trained Language Models", 470 "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"], 471 "year": 2023, 472 "relevance": "Extensive evaluation of pre-trained models for APR including CodeX fixing 99 bugs on Defects4J-v1.2, contextualizes GAMMA's approach." 473 }, 474 { 475 "title": "CoCoNuT: Combining Context-aware Neural Translation Models Using Ensemble for Program Repair", 476 "authors": ["T. Lutellier", "H. V. Pham", "L. Pang", "Y. Li", "M. Wei", "L. Tan"], 477 "year": 2020, 478 "relevance": "Context-aware NMT approach for APR, baseline comparison using beam size 1000." 479 }, 480 { 481 "title": "An Analysis of the Automatic Bug Fixing Performance of ChatGPT", 482 "authors": ["D. Sobania", "M. Briesch", "C. Hanna", "J. Petke"], 483 "year": 2023, 484 "relevance": "Investigates ChatGPT's performance on QuixBugs benchmark, directly relevant to LLM-based program repair evaluation." 485 }, 486 { 487 "title": "Pre-trained Model-based Automated Software Vulnerability Repair: How Far Are We?", 488 "authors": ["Q. Zhang", "C. Fang", "B. Yu", "W. Sun", "T. Zhang", "Z. Chen"], 489 "year": 2023, 490 "relevance": "Studies pre-trained models for security vulnerability repair, extending the GAMMA team's line of work to security domain." 491 } 492 ], 493 "engagement_factors": { 494 "practical_relevance": { 495 "score": 2, 496 "justification": "GAMMA is a released practical APR tool with source code on GitHub, usable for Java bug repair, but requires Defects4J setup and Java-specific infrastructure." 497 }, 498 "surprise_contrarian": { 499 "score": 1, 500 "justification": "The combination of templates with pre-trained models is a natural and expected direction, not contrarian, though demonstrating zero-shot mask prediction outperforming trained NMT models is somewhat surprising." 501 }, 502 "fear_safety": { 503 "score": 0, 504 "justification": "No AI safety, security, or risk concerns raised; purely a software engineering tool paper." 505 }, 506 "drama_conflict": { 507 "score": 0, 508 "justification": "No controversy; straightforward empirical comparison with existing tools." 509 }, 510 "demo_ability": { 511 "score": 2, 512 "justification": "Source code released on GitHub; a researcher could set up and run GAMMA, though it requires Java development environment and benchmark infrastructure." 513 }, 514 "brand_recognition": { 515 "score": 1, 516 "justification": "Uses ChatGPT (recognizable) and CodeBERT (known in NLP/SE community) but from an academic lab without major brand recognition." 517 } 518 } 519 }