scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33232B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Gamma: Revisiting Template-Based Automated Program Repair Via Mask Prediction",
      6     "authors": [
      7       "Quanjun Zhang",
      8       "Bowen Yu",
      9       "Chunrong Fang",
     10       "Weisong Sun",
     11       "Tongke Zhang",
     12       "Zhenyu Chen"
     13     ],
     14     "year": 2023,
     15     "venue": "International Conference on Automated Software Engineering",
     16     "arxiv_id": "2309.09308",
     17     "doi": "10.1109/ASE56229.2023.00063"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All abstract claims are supported: 82 bugs on Defects4J-v1.2 (Table I), 20.59% and 26.15% improvements over TBar and Recoder (Table I), 45 bugs on Defects4J-v2.0 and 22 on QuixBugs (Table II), CodeBERT-based (80) and ChatGPT-based (67) variants (Section V-C).",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The main causal claim is that mask prediction improves donor code generation over local file search. The comparison with TBar (same templates, different donor code strategy) provides a controlled single-variable manipulation. RQ3 further validates by varying only the pre-trained model component.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The title 'Gamma: Revisiting Template-Based Automated Program Repair Via Mask Prediction' makes no language restriction, but all evaluation is on Java programs only. The abstract and contributions section make broad claims about APR without bounding to Java or to the specific template types used.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section VI (Threats to Validity) discusses three specific alternative explanations: manual inspection bias, fault localization settings biasing results, and potential data leakage in pre-trained models affecting observed performance.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper measures number of correctly-fixed bugs verified by manual inspection against developer patches, which is a direct measure of repair capability. No proxy gap exists between what is measured and what is claimed.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section VI 'Threats to Validity' provides substantive discussion of three specific threats: manual inspection bias, fault localization settings, and data leakage of pre-trained models.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Threats are specific to this study: (1) mitigating manual inspection bias with three independent reviewers, (2) acknowledging perfect fault localization may not reflect practice, (3) querying pre-training datasets to find 3 leaked bugs (Closure-73, Closure-126, Time-19) and verifying results hold without them.",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly states: fix templates are 'designed for Java' (Section V-B), 'most of the templates are summarized from Defects4J-v1.2, which may mean that some templates cannot be applied to any bugs except those from Defects4J-v1.2' (Section V-B), and perfect fault localization 'may bring bias in repair performance' (Section VI).",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Funding is disclosed in the Acknowledgment section: 'This work is supported partially by the National Natural Science Foundation of China (61932012, 62141215).'",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All authors are from Nanjing University's State Key Laboratory for Novel Software Technology. They are not evaluating a product from their own institution or company.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The National Natural Science Foundation of China is a government funding body with no commercial interest in GAMMA's repair performance.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or financial interest declaration is present in the paper.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Key terms are defined in Section II: 'fix pattern,' 'donor code,' 'plausible patch,' 'correct patch,' 'mask prediction task,' and 'pre-trained language model' are all explicitly defined with examples.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Four contributions are explicitly enumerated at the end of Section I: new dimension (bridging PLMs and template-based APR), novel APR tool (GAMMA), extensive empirical study, and available artifacts.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section VII provides detailed engagement with related APR approaches, explicitly differentiating GAMMA from AlphaRepair (cloze-style but no templates), TBar (templates but local search only), and learning-based NMT approaches; shows how GAMMA is complementary rather than merely adjacent.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "The authors state 'we release the relevant materials (including source code, experimental results, and correct patches) in our experiment for replication and future research' with a GitHub link (reference [29]: https://github.com/iSEngLab/GAMMA).",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "All benchmarks used (Defects4J-v1.2, Defects4J-v2.0, QuixBugs) are publicly available standard benchmarks. No proprietary data was collected.",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "The paper mentions 'Ubuntu 18.04.3 server with two Tesla V100-SXM2 GPUs' and 'unixcoder-base' model, but provides no software dependency versions (Python, PyTorch, etc.), requirements.txt, or Dockerfile.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The paper describes the approach at a conceptual level but does not include step-by-step reproduction instructions, specific commands to run, or a detailed README. A GitHub repository is referenced but the paper itself lacks reproduction guidance.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results are reported as point estimates (e.g., '82 bugs', '45 bugs') with no confidence intervals, error bars, or uncertainty measures.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The paper claims GAMMA 'substantially outperforms' baselines based solely on comparing raw bug counts (e.g., 82 vs 68 for TBar). No statistical significance tests are applied to any comparison.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "The paper reports percentage improvements with baseline context: '20.59% (14 bugs) and 26.15% (17 bugs) improvement over TBar and Recoder' and precision rates '81.19% (82/101) for plausible patches, 9.61% higher than TBar (68/95)'.",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No justification is given for the benchmark sizes (395 bugs in Defects4J-v1.2, 257 in v2.0, 40 in QuixBugs). No power analysis or discussion of whether these sample sizes are sufficient for the claims made.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Results are single-run numbers with no standard deviation, variance, or spread measures reported across experimental runs.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "GAMMA is compared against 9 APR tools: SequenceR, CoCoNuT, CURE, DLFix, Recoder, AlphaRepair, CIRCLE, PraPR, and TBar (Table I).",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Baselines include recent state-of-the-art tools: AlphaRepair (ESEC/FSE 2022), CIRCLE (ISSTA 2022), Recoder (ESEC/FSE 2021), all published within 1-2 years of this work.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "GAMMA has multiple components (fix template transformation, context construction with comment line, mask prediction model) but no systematic ablation study isolates their individual contributions. RQ3 varies the pre-trained model but does not ablate other components.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Three metrics are reported: number of plausible patches, number of correct patches, and precision (correct/plausible ratio). Table I shows both plausible and correct counts (e.g., '82/101').",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Three authors manually verify all plausible patches: 'A plausible patch is considered to be correct if all three authors identify it as equivalent to a ground truth patch semantically' (Section VI).",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "GAMMA uses UniXcoder in a zero-shot setting with no fine-tuning on any bug-fixing data. Defects4J and QuixBugs serve as independent test sets that were never used for any model selection or tuning decisions.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Table I provides per-project breakdowns across all 6 Defects4J-v1.2 projects (Chart, Closure, Lang, Math, Mockito, Time) for all compared techniques.",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": false,
    225           "justification": "The paper shows case studies where GAMMA succeeds but TBar fails (Listings 3, 4). However, no specific examples or analysis of where GAMMA itself fails are provided. Template coverage limitations are mentioned only in passing.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "ChatGPT-based GAMMA fixes only 67 bugs compared to UniXcoder's 82 (Section V-C). GAMMA also underperforms AlphaRepair on QuixBugs (22 vs 28, Table II). Template limitations for new benchmarks are discussed.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "UniXcoder is specified as 'unixcoder-base' (Section IV-E), and ChatGPT is specified as 'gpt-3.5-turbo-0301, which is the latest version available' (Section V-C). CodeBERT is referred to simply as 'CodeBERT' without a specific checkpoint identifier.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "The ChatGPT prompt is provided verbatim: 'Next token prediction task, the first line is a comment to help prediction, just return 250 possible predictions for <mask> with highest probability:' (Section V-C). UniXcoder input construction (comment line + method with masked tokens) is described in detail (Section III-C).",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Beam size is set to 250 (Section IV-E), the mask number ranges from 1 to 20 for CodeBERT (Section V-C), and a 5-hour running-time limit is applied per bug. The prediction mode (beam search for UniXcoder, sequential for CodeBERT) is described.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "GAMMA does not use agentic scaffolding. It is a pipeline of template selection → mask prediction → patch validation with no tool use, retry logic, or feedback mechanisms.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "The pipeline is documented: (1) AST parsing with Eclipse JDT for template selection (Section III-B), (2) input construction with comment line + method context (Section III-C), (3) mask prediction and candidate patch generation, (4) compilation filtering and test suite validation (Section III-D).",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "The authors release 'source code, experimental results, and correct patches' via GitHub (reference [29]), enabling independent verification of the main results.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Data sources are well-described: Defects4J-v1.2 (395 bugs from 6 open-source Java projects), Defects4J-v2.0 (257 single-location bugs from 17 projects), QuixBugs (40 Java programs). Each benchmark's composition and origin are cited.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants. The data sources are standard public benchmarks (Defects4J, QuixBugs).",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline is documented: buggy program → fault localization → AST-based template selection → mask token insertion → UniXcoder beam search prediction → compilation filtering → test suite validation → manual inspection. Each step is described in Sections III-A through III-D.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "No explicit training data cutoff date is stated for UniXcoder, CodeBERT, or ChatGPT. The paper mentions UniXcoder was pre-trained on CodeSearchNet data (2.3M NL-PL pairs and 4.1M unimodal code) but does not state when this data was collected.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": true,
    303           "justification": "Section VI explicitly addresses overlap: 'we query the pre-training datasets including 2.3M functions paired with comments and 4.1M unimodal code from CodeSearchNet' and found 3 bugs leaked (Closure-73, Closure-126, Time-19). Two authors independently inspected, confirmed by a third.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": true,
    309           "justification": "The paper investigates contamination, finds 3 leaked bugs, shows GAMMA still works on perturbed versions of those bugs, and demonstrates that excluding them still yields better results than baselines (79 vs 68 for TBar, 79 vs 74 for AlphaRepair).",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study. It is a benchmark evaluation of an APR tool.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants. The study evaluates software bugs from public benchmarks.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "A 5-hour time limit per bug is mentioned as a fairness constraint, but no actual inference times, API costs, or per-bug costs are reported. The paper notes CodeBERT 'takes much more time' than UniXcoder without quantifying.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Hardware is mentioned ('one Ubuntu 18.04.3 server with two Tesla V100-SXM2 GPUs') but total GPU-hours, wall-clock time for the full experiment, or total computational cost are not quantified.",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "No discussion of random seed sensitivity. UniXcoder beam search is deterministic, but ChatGPT results would vary across runs and this is not addressed.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "The number of experimental runs is not stated. It appears results are from a single run for each configuration.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "Beam size is set to 250 'due to the limitation of our device' without exploring other values. No hyperparameter search was performed.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": true,
    392           "answer": true,
    393           "justification": "The paper uses a single configuration with beam size 250 justified by device limitations. No configuration selection from multiple candidates occurs, so cherry-picking risk is minimal.",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "GAMMA is compared against 9 baselines across multiple benchmarks without any statistical tests, let alone multiple comparison corrections.",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "The authors reuse baseline results from prior work [18] rather than re-implementing baselines (which partially mitigates Lucic et al.'s concern), but they do not explicitly acknowledge or discuss self-comparison bias.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "No performance-vs-compute analysis. The paper notes beam size 250 is smaller than CURE's and CoCoNuT's 1000, but does not systematically analyze the effect of compute budget on performance.",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": false,
    417           "justification": "The paper uses Defects4J without questioning whether bug count is a valid measure of repair capability or whether the benchmark is representative of real-world bug distributions.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": false,
    422           "answer": false,
    423           "justification": "No agentic scaffolding is involved. GAMMA is a deterministic pipeline (template selection → mask prediction → validation) without scaffolding or tool use.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": true,
    431           "justification": "The authors directly queried UniXcoder's pre-training datasets (CodeSearchNet) for overlap with Defects4J bugs, which is a stronger check than temporal analysis alone. They identified 3 overlapping bugs and verified results hold without them.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": true,
    437           "justification": "Section VI acknowledges that perfect fault localization provides information not available in practice ('the perfect fault localization results are usually unavailable in real practice'), which is a form of feature leakage in the evaluation setup.",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of whether Defects4J bugs share structural similarities, come from the same code patterns, or are otherwise non-independent. Multiple bugs from the same project (e.g., 24 Closure bugs) could share code structure.",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": true,
    449           "justification": "The authors queried UniXcoder's pre-training datasets (2.3M NL-PL pairs, 4.1M unimodal code from CodeSearchNet) for overlap. Two authors independently performed manual inspection, confirmed by a third. They also tested GAMMA on perturbed versions of the 3 leaked bugs.",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "GAMMA fixes 82 bugs on Defects4J-v1.2 under perfect fault localization, outperforming all prior APR approaches including TBar (68) and AlphaRepair (74).",
    458       "evidence": "Table I shows GAMMA = 82/101, TBar = 68/95, AlphaRepair = 74/109 across all six D4J-v1.2 projects.",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "GAMMA achieves 81.19% correct-to-plausible patch ratio, indicating it significantly mitigates the patch overfitting problem.",
    463       "evidence": "Table I reports 82/101 correct/plausible for GAMMA vs 71.6% (68/95) for TBar and 67.9% (74/109) for AlphaRepair.",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "GAMMA fixes 14 unique bugs that no other state-of-the-art APR approach can fix.",
    468       "evidence": "Figure 2 Venn diagram shows GAMMA-exclusive bugs; 14 unique to GAMMA compared to TBar, AlphaRepair, CURE, and Recoder jointly.",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "GAMMA generalizes better than baselines across unseen benchmarks, fixing 45 bugs on Defects4J-v2.0 versus 36 for the best baseline (AlphaRepair).",
    473       "evidence": "Table II reports GAMMA = 45, AlphaRepair = 36 on D4J-v2.0; paper attributes this to not relying on benchmark-specific learned patterns.",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Data leakage of pre-training data affects at most 3 bugs and does not change the main comparative conclusions.",
    478       "evidence": "Section VI identifies Closure-73, Closure-126, Time-19 as leaked; manual perturbation testing shows GAMMA still fixes all three; GAMMA still leads (79 vs 68 for TBar) excluding them.",
    479       "supported": "moderate"
    480     },
    481     {
    482       "claim": "UniXcoder outperforms CodeBERT and ChatGPT for mask-based APR due to alignment between MLM pre-training objective and the fill-in-the-blank task.",
    483       "evidence": "Figure 3 shows UniXcoder=82, CodeBERT=80, ChatGPT=67; attributed to UniXcoder's MLM design vs ChatGPT's conversational fine-tuning.",
    484       "supported": "moderate"
    485     }
    486   ],
    487   "methodology_tags": [
    488     "benchmark-eval"
    489   ],
    490   "key_findings": "GAMMA integrates pre-trained language models (UniXcoder) with fix templates to address the donor code problem in template-based APR, replacing local file search with mask prediction. On Defects4J-v1.2, GAMMA fixes 82 bugs with 81.19% precision, outperforming the prior best (AlphaRepair, 74 bugs) and fixing 14 bugs unique to GAMMA. The approach demonstrates better generalizability on held-out benchmarks (D4J-v2.0, QuixBugs) than both traditional and learning-based APR approaches, suggesting that zero-shot PLM-based donor code retrieval reduces benchmark overfitting. ChatGPT underperforms task-aligned models (UniXcoder, CodeBERT) for mask prediction, pointing to the importance of pre-training objective alignment for code-specific structured prediction.",
    491   "red_flags": [
    492     {
    493       "flag": "Perfect fault localization",
    494       "detail": "All comparisons use perfect fault localization (ground-truth buggy line known), which is unrealistic in practice. The paper acknowledges this but provides no results under automated FL, making reported numbers unrepresentative of real-world performance."
    495     },
    496     {
    497       "flag": "No statistical significance testing",
    498       "detail": "All comparative claims are made based on raw bug counts and percentage improvements, without any statistical significance tests or confidence intervals. With 395 bugs and single-digit to low-double-digit differences between systems, the statistical significance of comparisons is unknown."
    499     },
    500     {
    501       "flag": "Beam size mismatch with baselines",
    502       "detail": "GAMMA uses beam size 250 due to hardware constraints, while CURE and CoCoNuT use beam size 1000. The paper does not analyze whether this puts GAMMA at a disadvantage or advantage, making the comparison potentially unfair."
    503     },
    504     {
    505       "flag": "Reused baseline results",
    506       "detail": "Table I footnote states baselines' results are reused from prior work rather than re-run, meaning experimental conditions (hardware, environment versions) may differ from GAMMA's evaluation run."
    507     }
    508   ],
    509   "cited_papers": [
    510     {
    511       "title": "TBar: Revisiting Template-Based Automated Program Repair",
    512       "relevance": "Primary template-based APR baseline and source of fix patterns adapted for GAMMA"
    513     },
    514     {
    515       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning (AlphaRepair)",
    516       "relevance": "Most directly related work — cloze-style APR without fine-tuning; GAMMA's closest competitor and key differentiator"
    517     },
    518     {
    519       "title": "UniXcoder: Unified Cross-Modal Pre-training for Code Representation",
    520       "relevance": "Core pre-trained model used in GAMMA's mask prediction"
    521     },
    522     {
    523       "title": "A Syntax-Guided Edit Decoder for Neural Program Repair (Recoder)",
    524       "relevance": "State-of-the-art learning-based APR baseline for comparison"
    525     },
    526     {
    527       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    528       "relevance": "Learning-based APR baseline with a pre-trained language model component"
    529     },
    530     {
    531       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    532       "relevance": "Primary evaluation benchmark used throughout all experiments"
    533     },
    534     {
    535       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    536       "relevance": "Alternative PLM evaluated in RQ3 scalability experiments"
    537     },
    538     {
    539       "title": "Where Were the Repair Ingredients for Defects4J Bugs? Exploring the Impact of Repair Ingredient Retrieval",
    540       "relevance": "Motivation for GAMMA — shows that half of D4J bugs cannot be fixed because the donor code is unavailable in the local file"
    541     }
    542   ],
    543   "engagement_factors": {
    544     "practical_relevance": {
    545       "score": 2,
    546       "justification": "GAMMA is a released practical APR tool with source code on GitHub, usable for Java bug repair, but requires Defects4J setup and Java-specific infrastructure."
    547     },
    548     "surprise_contrarian": {
    549       "score": 1,
    550       "justification": "The combination of templates with pre-trained models is a natural and expected direction, not contrarian, though demonstrating zero-shot mask prediction outperforming trained NMT models is somewhat surprising."
    551     },
    552     "fear_safety": {
    553       "score": 0,
    554       "justification": "No AI safety, security, or risk concerns raised; purely a software engineering tool paper."
    555     },
    556     "drama_conflict": {
    557       "score": 0,
    558       "justification": "No controversy; straightforward empirical comparison with existing tools."
    559     },
    560     "demo_ability": {
    561       "score": 2,
    562       "justification": "Source code released on GitHub; a researcher could set up and run GAMMA, though it requires Java development environment and benchmark infrastructure."
    563     },
    564     "brand_recognition": {
    565       "score": 1,
    566       "justification": "Uses ChatGPT (recognizable) and CodeBERT (known in NLP/SE community) but from an academic lab without major brand recognition."
    567     }
    568   },
    569   "hn_data": {
    570     "threads": [
    571       {
    572         "hn_id": "34890787",
    573         "title": "The usability of advanced type systems: Rust as a case study",
    574         "points": 141,
    575         "comments": 162,
    576         "url": "https://news.ycombinator.com/item?id=34890787"
    577       },
    578       {
    579         "hn_id": "36962148",
    580         "title": "WizMap - A tool for the visualization of latent spaces",
    581         "points": 17,
    582         "comments": 1,
    583         "url": "https://news.ycombinator.com/item?id=36962148"
    584       },
    585       {
    586         "hn_id": "34310167",
    587         "title": "The Usability of Advanced Type Systems: Rust as a Case Study",
    588         "points": 2,
    589         "comments": 0,
    590         "url": "https://news.ycombinator.com/item?id=34310167"
    591       },
    592       {
    593         "hn_id": "24598805",
    594         "title": "Michelson Holography: Dual-SLM Holography with Camera-in-the-Loop Optimization",
    595         "points": 2,
    596         "comments": 0,
    597         "url": "https://news.ycombinator.com/item?id=24598805"
    598       },
    599       {
    600         "hn_id": "41688042",
    601         "title": "Efficient Streaming Inference of Multimodal Large Language Models on 1 GPU",
    602         "points": 1,
    603         "comments": 0,
    604         "url": "https://news.ycombinator.com/item?id=41688042"
    605       },
    606       {
    607         "hn_id": "35669201",
    608         "title": "Data-Centric Parallelism in the Chiplet Age",
    609         "points": 1,
    610         "comments": 0,
    611         "url": "https://news.ycombinator.com/item?id=35669201"
    612       },
    613       {
    614         "hn_id": "34879761",
    615         "title": "The Usability of Advanced Type Systems: Rust as a Case Study",
    616         "points": 1,
    617         "comments": 0,
    618         "url": "https://news.ycombinator.com/item?id=34879761"
    619       }
    620     ],
    621     "top_points": 141,
    622     "total_points": 165,
    623     "total_comments": 163
    624   }
    625 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs