scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29098B)
      1 {
      2   "paper": {
      3     "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-shot Learning",
      4     "authors": [
      5       "Chunqiu Steven Xia",
      6       "Lingming Zhang"
      7     ],
      8     "year": 2022,
      9     "venue": "ESEC/FSE 2022",
     10     "arxiv_id": "2207.08281",
     11     "doi": "10.1145/3540250.3549101"
     12   },
     13   "scan_version": 3,
     14   "active_modules": ["experimental_rigor", "data_leakage"],
     15   "methodology_tags": ["benchmark-eval"],
     16   "key_findings": "AlphaRepair introduces cloze-style APR using CodeBERT's masked language model objective in a zero-shot setting, requiring no fine-tuning on bug-fix data. It fixes 74 bugs on Defects4J 1.2 (surpassing the previous best of 68), 36 bugs on Defects4J 2.0 (3.3X more than the best baseline of 11), and achieves state-of-the-art on both Java and Python QuixBugs. The ablation study shows template masking contributes the most (+21 fixes) and the re-ranking step reduces average correct patch rank by 31.7%.",
     17   "checklist": {
     18     "artifacts": {
     19       "code_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "Reference [2] provides a Zenodo archive (https://zenodo.org/record/6819444) containing 'all correct patches for public evaluation along with the code to reproduce our experiments' (Section 6)."
     23       },
     24       "data_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The evaluation uses publicly available benchmarks (Defects4J, QuixBugs), and the Zenodo archive [2] includes correct patches and reproduction code."
     28       },
     29       "environment_specified": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "Section 4.2 lists hardware (Intel i7 10700KF, RTX 3080 Ti, 16GB RAM) and OS (Ubuntu 20.04.3 LTS, OpenJDK 1.8.0_312), but no requirements.txt, Dockerfile, or library version list is provided. Python and PyTorch versions are not specified."
     33       },
     34       "reproduction_instructions": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper references a Zenodo archive with code but does not include step-by-step reproduction instructions in the paper itself. No 'Reproducing Results' section or explicit run commands are provided."
     38       }
     39     },
     40     "statistical_methodology": {
     41       "confidence_intervals_or_error_bars": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "All results in Tables 1–5 are point estimates (counts of correct/plausible patches). No confidence intervals, error bars, or uncertainty quantification is reported."
     45       },
     46       "significance_tests": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Claims of outperformance (e.g., '74 vs 68', '36 vs 11') are based solely on comparing raw counts. No statistical significance tests (p-values, t-tests, etc.) are reported."
     50       },
     51       "effect_sizes_reported": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper reports effect sizes with baseline context: '3.3X more fixes than best baseline' on D4J 2.0 (36 vs 11), '31.7% reduction' in average patch ranking (612→418), and per-project breakdowns in Tables 1 and 4."
     55       },
     56       "sample_size_justified": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "The paper uses standard benchmarks (391 bugs in D4J 1.2, 82 in D4J 2.0, 40 in QuixBugs) without discussing whether these sample sizes are adequate for the statistical claims being made."
     60       },
     61       "variance_reported": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "All results appear to be from single runs. No standard deviations, variance across seeds, or spread measures are reported anywhere in the paper."
     65       }
     66     },
     67     "evaluation_design": {
     68       "baselines_included": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "The paper compares against 18 APR tools including 6 learning-based (Recoder, CURE, CoCoNuT, DLFix, SequenceR, DeepDebug) and 12 traditional (TBar, PraPR, AVATAR, SimFix, etc.) across Tables 1, 2, and 5."
     72       },
     73       "baselines_contemporary": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Baselines include recent tools: Recoder (2021), CURE (2021), CoCoNuT (2020), DeepDebug (2021). These were state-of-the-art at the time of submission."
     77       },
     78       "ablation_study": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Table 3 presents a detailed ablation study showing the contribution of each masking strategy (complete +20, partial begin +13, partial end +15, template +21, comment buggy line +5). Section 5.2 also evaluates the re-ranking component."
     82       },
     83       "multiple_metrics": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "The paper reports both correct patches (manually verified semantic equivalence) and plausible patches (pass all tests) as distinct metrics throughout all tables."
     87       },
     88       "human_evaluation": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Section 4.4 states: 'the correct patches are determined by manually inspecting each plausible patch for semantic equivalency.' The authors manually evaluated all plausible patches to determine correctness."
     92       },
     93       "held_out_test_set": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "There is no explicit dev/test split. Hyperparameters (beam width=25, max 5000 patches) appear set without formal validation. While D4J 2.0 serves as a separate generalizability test, the main D4J 1.2 results likely informed design choices."
     97       },
     98       "per_category_breakdown": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Tables 1 and 4 provide per-project breakdowns (Chart, Closure, Lang, Math, Mockito, Time for D4J 1.2; 11 projects for D4J 2.0). Table 3 breaks down contribution by mask type."
    102       },
    103       "failure_cases_discussed": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "The paper discusses successful example fixes (Figures 7, 9) but does not analyze failure cases or discuss where AlphaRepair fails to generate correct patches."
    107       },
    108       "negative_results_reported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The ablation shows complete mask alone achieves only 20/74 correct patches. Without re-ranking, average patch rank is 612th vs 418th with re-ranking. These demonstrate component limitations."
    112       }
    113     },
    114     "claims_and_evidence": {
    115       "abstract_claims_supported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "Abstract claims are supported: 'outperform state-of-the-art' (Table 1: 74 vs 68), '3.3X more fixes' on D4J 2.0 (Table 4: 36 vs 11), 'state-of-the-art results on both Java and Python' QuixBugs (Table 5: 28/27)."
    119       },
    120       "causal_claims_justified": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Causal claims (component contributions) are supported by an ablation study (Table 3) that incrementally adds components and measures their individual contribution. The re-ranking contribution is separately evaluated via controlled comparison (Section 5.2)."
    124       },
    125       "generalization_bounded": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The title claims 'Revisiting Automated Program Repair' broadly. 'Multilingual' capability is claimed from only 2 languages (Java, Python). The approach is presented as generalizable to 'various pre-trained code models' but only CodeBERT is tested. Single-line focus is stated but not prominently bounded."
    129       },
    130       "alternative_explanations_discussed": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Section 6 discusses CodeBERT training data overlap (16.6% of D4J 1.2), demonstrates AlphaRepair still works on perturbed overlapping bugs, discusses experimental setup differences, and addresses benchmark representativeness."
    134       },
    135       "proxy_outcome_distinction": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper measures 'correct patches' (semantically equivalent to developer patches) and 'plausible patches' (pass tests), and explicitly distinguishes them: 'a plausible patch might still fail under other inputs' (Section 1). Claims match measurement granularity."
    139       }
    140     },
    141     "setup_transparency": {
    142       "model_versions_specified": {
    143         "applies": true,
    144         "answer": false,
    145         "justification": "The paper states 'CodeBERT' and references [23] but does not specify an exact model checkpoint, version hash, or model size. No snapshot date or Hugging Face model ID is provided."
    146       },
    147       "prompts_provided": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "AlphaRepair uses CodeBERT's Masked Language Model objective, not prompting. Inputs are structured code tokens with mask tokens, not natural language prompts."
    151       },
    152       "hyperparameters_reported": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Section 4.2 reports beam width (25 for perfect FL, 5 for non-perfect), max patches (5000), top suspicious lines (40), timeout (5 hours), and mask line length (L+10 tokens)."
    156       },
    157       "scaffolding_described": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "No agentic scaffolding is used. AlphaRepair is a deterministic pipeline: input processing → mask generation → patch generation → re-ranking → validation."
    161       },
    162       "data_preprocessing_documented": {
    163         "applies": true,
    164         "answer": true,
     165         "justification": "Section 3.1 describes input tokenization with BBPE, context extraction, and the buggy-line comment transformation. Section 3.2 details mask generation strategies. Section 4.3 describes bug selection (82 single-line bugs from D4J 2.0; 391 bugs from D4J 1.2 after removing 4 deprecated bugs)."
    166       }
    167     },
    168     "limitations_and_scope": {
    169       "limitations_section_present": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "Section 6 'Threats to Validity' contains substantive discussion organized into Internal and External subsections covering training data overlap, manual analysis reliability, experimental setup, and benchmark representativeness."
    173       },
    174       "threats_to_validity_specific": {
    175         "applies": true,
    176         "answer": true,
     177         "justification": "Section 6 discusses study-specific threats: 16.6% of D4J 1.2 bugs overlap with CodeBERT training data, manual patch inspection by the authors could introduce bias, and machine configuration differences could affect results. As a mitigation, the authors verify that the 15 overlapping correctly-fixed bugs are still fixed after perturbation."
    178       },
    179       "scope_boundaries_stated": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The paper mentions 'we focus on single line patches in this work' and notes benchmark limitations, but does not explicitly state what the results do NOT show. The external threats are generic ('may not translate to other datasets')."
    183       }
    184     },
    185     "data_integrity": {
    186       "raw_data_available": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Reference [2] provides a Zenodo archive with correct patches and reproduction code. The underlying benchmarks (Defects4J, QuixBugs) are publicly available, enabling independent verification."
    190       },
    191       "data_collection_described": {
    192         "applies": true,
    193         "answer": true,
    194         "justification": "Section 4.3 describes each benchmark: Defects4J 1.2 (391 bugs across 6 projects, 4 deprecated removed), D4J 2.0 (438 new bugs, 82 single-line selected), QuixBugs (40 classic algorithm bugs in Java and Python)."
    195       },
    196       "recruitment_methods_described": {
    197         "applies": false,
    198         "answer": false,
    199         "justification": "No human participants. Data sources are standard public benchmarks (Defects4J, QuixBugs)."
    200       },
    201       "data_pipeline_documented": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "The full pipeline is documented: input processing (Section 3.1), mask generation with three strategies (Section 3.2), iterative patch generation with beam search (Section 3.3), re-ranking (Section 3.4), and compilation/validation (Section 3.5)."
    205       }
    206     },
    207     "conflicts_of_interest": {
    208       "funding_disclosed": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "No funding information, acknowledgments section, or grant numbers are mentioned in the provided paper text."
    212       },
    213       "affiliations_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Both authors are from University of Illinois Urbana-Champaign. Affiliations are clearly listed. They are not evaluating their own company's product."
    217       },
    218       "funder_independent_of_outcome": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No funding is disclosed, so independence of funders cannot be assessed. Academic researchers likely have grant support but it is not stated."
    222       },
    223       "financial_interests_declared": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No competing interests statement or financial disclosure is present in the paper."
    227       }
    228     },
    229     "contamination": {
    230       "training_cutoff_stated": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "The paper does not state CodeBERT's training data cutoff date. It checks for overlap with CodeBERT training data but does not specify when that data was collected."
    234       },
    235       "train_test_overlap_discussed": {
    236         "applies": true,
    237         "answer": true,
    238         "justification": "Section 6 explicitly checks: '65 out of 391 (16.6%) Defects4J 1.2 bugs and 9 out of 82 (11.0%) Defects4J 2.0 bugs are present in the original training data.' They verify AlphaRepair works on perturbed versions of the 15 overlapping correctly-fixed bugs."
    239       },
    240       "benchmark_contamination_addressed": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "Section 6 addresses contamination by: (1) calculating exact overlap percentages, (2) manually perturbing overlapping bugs and re-evaluating, (3) showing results still improve over baselines even excluding overlapping bugs (64 vs 63 on D4J 1.2), (4) noting QuixBugs is not in CodeBERT training data."
    244       }
    245     },
    246     "human_studies": {
    247       "pre_registered": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants. This is a benchmark evaluation of an automated program repair tool."
    251       },
    252       "irb_or_ethics_approval": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants. The study evaluates automated tools on code benchmarks."
    256       },
    257       "demographics_reported": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants."
    261       },
    262       "inclusion_exclusion_criteria": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "randomization_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "blinding_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "attrition_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       }
    282     },
    283     "cost_and_practicality": {
    284       "inference_cost_reported": {
    285         "applies": true,
    286         "answer": false,
    287         "justification": "A 5-hour timeout per bug is mentioned, but actual inference time, API costs, or per-bug timing is not reported. No cost breakdown for the mask generation, CodeBERT queries, or validation stages."
    288       },
    289       "compute_budget_stated": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "Hardware is described (8-core i7, 16GB RAM, RTX 3080 Ti) but total GPU hours, wall-clock time for the full evaluation, or total compute budget are not quantified."
    293       }
    294     },
    295     "experimental_rigor": {
    296       "seed_sensitivity_reported": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be from single deterministic runs."
    300       },
    301       "number_of_runs_stated": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The paper does not state how many experimental runs produced the reported results. Results are presented as single values."
    305       },
    306       "hyperparameter_search_budget": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "Hyperparameters (beam width=25, max patches=5000) are stated but no search budget, search method, or number of configurations tried is reported."
    310       },
    311       "best_config_selection_justified": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "The chosen configuration (beam=25, max 5000 patches) is presented without justification for how it was selected or whether a validation set was used."
    315       },
    316       "multiple_comparison_correction": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The paper compares against 18 tools and makes numerous superiority claims without any statistical tests or multiple comparison corrections."
    320       },
    321       "self_comparison_bias_addressed": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "For D4J 1.2, baseline results are taken from prior papers. For D4J 2.0, the authors run TBar and Recoder themselves. No discussion of author-evaluation bias or potential for unintentional disadvantaging of baselines."
    325       },
    326       "compute_budget_vs_performance": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "AlphaRepair generates up to 5000 patches per bug with beam=25, while baselines may use different budgets. This compute difference is not discussed or controlled for."
    330       },
    331       "benchmark_construct_validity": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Defects4J and QuixBugs are used as standard benchmarks without discussion of whether they adequately measure real-world program repair capability or whether they are representative of bugs in practice."
    335       },
    336       "scaffold_confound_addressed": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "Each compared tool has a fundamentally different pipeline (mask generation + re-ranking for AlphaRepair vs. NMT decoder for baselines). Performance differences are attributed to the zero-shot approach but scaffold differences are not disentangled from model differences."
    340       }
    341     },
    342     "data_leakage": {
    343       "temporal_leakage_addressed": {
    344         "applies": true,
    345         "answer": true,
    346         "justification": "Section 6 checks whether Defects4J fixed functions appear in CodeBERT's training data, finding 16.6% overlap for D4J 1.2 and 11.0% for D4J 2.0. They verify the approach works on perturbed versions of overlapping bugs."
    347       },
    348       "feature_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "The default evaluation setting uses perfect fault localization (exact buggy line provided), which gives information unavailable in realistic settings. While they also evaluate without it, the feature leakage implications of perfect FL are not discussed."
    352       },
    353       "non_independence_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether bugs within the same Defects4J projects share structural similarities that could inflate results, or whether CodeBERT training data and Defects4J source repositories overlap beyond the specific fixed functions checked."
    357       },
    358       "leakage_detection_method": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "Section 6 describes a concrete detection method: checking whether the 391 D4J 1.2 and 82 D4J 2.0 fixed functions appear in CodeBERT's training corpus, finding exact overlap numbers, and manually perturbing the 15 overlapping correctly-fixed bugs to verify the approach isn't memorizing."
    362       }
    363     }
    364   },
    365   "claims": [
    366     {
    367       "claim": "AlphaRepair outperforms all existing APR tools on Defects4J 1.2 with 74 correct fixes, improving from the previous best of 68 (TBar).",
    368       "evidence": "Table 1 shows 74 correct / 109 plausible patches. TBar achieves 68 correct, Recoder 65, CURE 57. Per-project breakdowns are provided.",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "AlphaRepair fixes 3.3X more bugs than the best baseline on Defects4J 2.0 (36 vs 11 for Recoder).",
    373       "evidence": "Table 4 shows 36 correct / 50 plausible for AlphaRepair, 11/23 for Recoder, 8/25 for TBar on 82 single-line D4J 2.0 bugs.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "AlphaRepair achieves state-of-the-art on both Java and Python versions of QuixBugs.",
    378       "evidence": "Table 5 shows 28/30 (Java) and 27/32 (Python) for AlphaRepair vs. CURE's 26/35 (Java) and DeepDebug's 21/22 (Python).",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "AlphaRepair fixes 8 unique bugs on D4J 1.2 that no prior tool can fix.",
    383       "evidence": "Figure 6b Venn diagram shows 8 unique correct patches from AlphaRepair across all compared tools.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Patch re-ranking reduces average correct patch rank by 31.7% (from 612th to 418th).",
    388       "evidence": "Section 5.2 and Figure 8 show the ranking comparison; 61/74 correct patches are ranked higher after re-ranking.",
    389       "supported": "moderate"
    390     },
    391     {
    392       "claim": "AlphaRepair can avoid the dataset-overfitting issue of existing techniques.",
    393       "evidence": "D4J 2.0 results (36 vs 11) show larger improvement on an unseen benchmark. The paper also checks CodeBERT training data overlap (16.6% / 11.0%) and verifies with perturbation experiments.",
    394       "supported": "moderate"
    395     }
    396   ],
    397   "red_flags": [
    398     {
    399       "flag": "No statistical tests on comparative claims",
    400       "detail": "All superiority claims (74 vs 68, 36 vs 11, etc.) are based on raw count comparisons without any statistical significance tests, confidence intervals, or uncertainty quantification. Given that results are from single runs, the observed differences could be within random variation."
    401     },
    402     {
    403       "flag": "Single-run results without variance analysis",
    404       "detail": "No multiple runs, seed sensitivity, or variance measures are reported. For a beam-search-based approach where ties could be broken differently, the stability of results is unknown."
    405     },
    406     {
    407       "flag": "Baseline results from different experimental setups",
    408       "detail": "For D4J 1.2, baseline results are taken from prior publications rather than reproduced under identical conditions. Different machines, timeouts, and configurations may affect comparability. The paper acknowledges this: 'one would need rerun the results from all the selected baselines APR tools on the same machine.'"
    409     },
    410     {
    411       "flag": "Perfect fault localization as default setting",
    412       "detail": "The preferred evaluation setting provides the exact buggy line, which is unrealistic in practice. While results are also reported without perfect FL (50 correct, marginal improvement over Recoder's 49), the headline results use an idealized setting."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair",
    418       "authors": ["Zimin Chen", "Steve Kommrusch", "Michele Tufano", "Louis-Noël Pouchet", "Denys Poshyvanyk", "Martin Monperrus"],
    419       "year": 2019,
    420       "relevance": "Learning-based APR tool using seq-to-seq NMT for program repair, one of the key baselines."
    421     },
    422     {
    423       "title": "CoCoNuT: Combining Context-Aware Neural Translation Models Using Ensemble for Program Repair",
    424       "authors": ["Thibaud Lutellier", "Hung Viet Pham", "Lawrence Pang", "Yitong Li", "Moshi Wei", "Lin Tan"],
    425       "year": 2020,
    426       "relevance": "Context-aware NMT approach for APR with separate encoding of context and buggy line, key learning-based baseline."
    427     },
    428     {
    429       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    430       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    431       "year": 2021,
    432       "relevance": "NMT-based APR with pre-training on developer code and static checking for identifier validity, state-of-the-art baseline."
    433     },
    434     {
    435       "title": "A Syntax-Guided Edit Decoder for Neural Program Repair",
    436       "authors": ["Qihao Zhu", "Zeyu Sun", "Yuan-an Xiao", "Wenjie Zhang", "Kang Yuan", "Yingfei Xiong", "Lu Zhang"],
    437       "year": 2021,
     438       "relevance": "Recoder: learning-based APR with syntax-guided decoder; the strongest learning-based baseline on Defects4J 1.2 and the best baseline on Defects4J 2.0."
    439     },
    440     {
    441       "title": "DLFix: Context-Based Code Transformation Learning for Automated Program Repair",
    442       "authors": ["Yi Li", "Shaohua Wang", "Tien N. Nguyen"],
    443       "year": 2020,
    444       "relevance": "Deep learning APR using tree-based RNN for code transformation, another learning-based baseline."
    445     },
    446     {
    447       "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages",
    448       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang", "Nan Duan", "Xiaocheng Feng", "Ming Gong", "Linjun Shou", "Bing Qin", "Ting Liu", "Daxin Jiang", "Ming Zhou"],
    449       "year": 2020,
    450       "arxiv_id": "2002.08155",
    451       "relevance": "The pre-trained code model used by AlphaRepair; foundational work on BERT-based code understanding."
    452     },
    453     {
    454       "title": "Evaluating Large Language Models Trained on Code",
    455       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    456       "year": 2021,
    457       "arxiv_id": "2107.03374",
    458       "relevance": "Codex paper evaluating GPT-based models for code generation, key reference for LLM code capabilities."
    459     },
    460     {
    461       "title": "TBar: Revisiting Template-Based Automated Program Repair",
    462       "authors": ["Kui Liu", "Anil Koyuncu", "Dongsun Kim", "Tegawendé F. Bissyandé"],
    463       "year": 2019,
    464       "relevance": "State-of-the-art template-based APR tool, key traditional baseline."
    465     },
    466     {
    467       "title": "Practical Program Repair via Bytecode Mutation",
    468       "authors": ["Ali Ghanbari", "Samuel Benton", "Lingming Zhang"],
    469       "year": 2019,
    470       "relevance": "PraPR: practical mutation-based APR at bytecode level, traditional baseline."
    471     },
    472     {
    473       "title": "DeepDebug: Fixing Python Bugs Using Stack Traces, Backtranslation, and Code Skeletons",
    474       "authors": ["Dawn Drain", "Colin B. Clement", "Guillermo Serrato", "Neel Sundaresan"],
    475       "year": 2021,
    476       "arxiv_id": "2105.09352",
    477       "relevance": "Learning-based Python APR tool using stack traces, baseline for Python evaluation."
    478     },
    479     {
    480       "title": "Applying CodeBERT for Automated Program Repair of Java Simple Bugs",
    481       "authors": ["Ehsan Mashhadi", "Hadi Hemmati"],
    482       "year": 2021,
    483       "relevance": "Prior work applying CodeBERT to APR but with fine-tuning on bug fixes, contrasting approach to AlphaRepair's zero-shot setting."
    484     },
    485     {
    486       "title": "GraphCodeBERT: Pre-training Code Representations with Data Flow",
    487       "authors": ["Daya Guo", "Shuo Ren", "Shuai Lu"],
    488       "year": 2021,
    489       "arxiv_id": "2009.08366",
    490       "relevance": "Extension of CodeBERT with data flow encoding, relevant pre-trained code model for APR."
    491     }
    492   ],
    493   "engagement_factors": {
    494     "practical_relevance": {
    495       "score": 2,
    496       "justification": "A working APR tool that requires no fine-tuning on bug-fix data, directly applicable to Java/Python repair, though requires setup of CodeBERT infrastructure."
    497     },
    498     "surprise_contrarian": {
    499       "score": 2,
    500       "justification": "Challenges the assumption that learning-based APR needs historical bug-fix training data, showing zero-shot cloze-style repair outperforms fine-tuned approaches."
    501     },
    502     "fear_safety": {
    503       "score": 0,
    504       "justification": "No safety or security concerns raised; the paper is about fixing bugs, not introducing them."
    505     },
    506     "drama_conflict": {
    507       "score": 0,
    508       "justification": "No controversy or conflict; a standard research contribution with incremental improvements."
    509     },
    510     "demo_ability": {
    511       "score": 1,
    512       "justification": "Code and patches released on Zenodo, but not a pip-installable tool or live demo."
    513     },
    514     "brand_recognition": {
    515       "score": 1,
    516       "justification": "From UIUC's well-known SE research group (Lingming Zhang lab), uses Microsoft's CodeBERT, but not a household-name lab."
    517     }
    518   }
    519 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs