ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31906B)


      1 {
      2   "paper": {
      3     "title": "Impact of Code Language Models on Automated Program Repair",
      4     "authors": [
      5       "Nan Jiang",
      6       "Kevin Liu",
      7       "Thibaud Lutellier",
      8       "Lin Tan"
      9     ],
     10     "year": 2023,
     11     "venue": "IEEE/ACM 45th International Conference on Software Engineering (ICSE)",
     12     "arxiv_id": "2302.05020",
     13     "doi": "10.1109/ICSE48619.2023.00125"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "Off-the-shelf code language models (CLMs) already fix 72% more bugs than state-of-the-art DL-based APR techniques, with InCoder-6B fixing 105 bugs vs. KNOD's 61 across four benchmarks. Fine-tuning CLMs with APR training data yields 31%–1,267% improvement, but too much fine-tuning data (beyond 10K–50K instances) can reduce performance. Surprisingly, CLMs fix fewer bugs when buggy lines are explicitly provided, and fine-tuning causes over-reliance on buggy lines similar to existing APR tools. CodeT5 and InCoder show the best size efficiency among the models evaluated.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Section XI states: 'Our replication package, including... the source code for reproduction are available at [62].' Reference [62] points to https://github.com/lin-tan/clm."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Section XI states the replication package includes 'the new APR benchmark HumanEval-Java' and 'the generated patches for all four benchmarks by all CLMs.' The other three benchmarks (Defects4J v1.2, v2.0, QuixBugs) are publicly available standard benchmarks."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper reports model sizes, batch size, learning rate, and GPU memory requirements (Figure 10c), but does not provide a requirements.txt, Dockerfile, or detailed environment specification listing library versions needed to reproduce the experiments."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section XI states the replication package includes 'the source code for reproduction' at the GitHub repository [62]. The experimental design (Section III) describes the full pipeline in sufficient detail."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Tables II and IV report only point estimates (number of correct fixes) with no confidence intervals or error bars. Figure 5 shows box plots for compilation rates but no CIs on the main fixing capability results."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "All comparative claims (e.g., '72% more bugs than KNOD,' '31%–1,267% improvement') are based on raw number comparisons without any statistical significance tests (no p-values, t-tests, or other hypothesis tests)."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper consistently reports both raw numbers and percentage improvements with baseline context: '72% more bugs' (105 vs 61), '31%–1,267% improvement,' '46%–164% more bugs.' Table II and IV provide the raw counts enabling effect size computation."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for why these specific benchmarks or bug counts (130, 108, 40, 164) are sufficient for the claims made. No power analysis is discussed."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper uses 'a fixed random seed when fine-tuning different models to minimize variance for a consistent, fair comparison' (Section III-E), meaning all results are single-run. No variance across seeds is reported."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Four state-of-the-art DL-based APR techniques are included as baselines: CURE, RewardRepair, Recoder, and KNOD (Section III-F). Results are compared across all four benchmarks."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The four baselines are contemporary: CURE (ICSE 2021), RewardRepair (ICSE 2022), Recoder (ESEC/FSE 2021), and KNOD (ICSE 2023). The paper states these are 'the four best open-sourced DL-based APR techniques' (Section III-F)."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple ablation-style experiments: with/without buggy lines (Table III), with/without fine-tuning (Table II vs IV), and varying fine-tuning data sizes (Figure 9, six data sizes from 0 to 129K)."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper reports number of correct fixes (Tables II, IV), compilation rate (Figure 5), generation time per correct fix (Figure 10b), and GPU memory usage (Figure 10c)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section III-G states: 'we finally manually check the correctness of plausible patches to distinguish correct patches (which should be identical or semantically equivalent to developer-written patches).'"
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The four benchmarks (Defects4J v1.2, v2.0, QuixBugs, HumanEval-Java) serve as held-out test sets. Fine-tuning data (129,300 training + 14,366 validation instances) is collected from separate GitHub commits and is explicitly disjoint from the test benchmarks."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down across all four benchmarks individually (Tables II and IV) and across all ten CLMs. Figure 9 further breaks down by fine-tuning data size."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Figures 6, 7, and 8 provide detailed examples of bugs that CLMs fail to fix, including analysis of why (e.g., insufficient context, confusion from buggy lines, over-reliance on buggy lines after fine-tuning)."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Multiple negative findings: CLMs fix fewer bugs when buggy lines are given (Finding 2), fine-tuning causes over-reliance on buggy lines (Finding 4), too much fine-tuning data reduces performance (Finding 5), and fine-tuned PLBART-large fixes 4 fewer bugs on HumanEval-Java."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Abstract claims are supported: 'best CLM fixes 72% more bugs' → Table II (105 vs 61), 'fine-tuning brings 31%–1,267% improvement' → Table IV, 'CLMs cannot make good use of the buggy lines' → Table III and Section IV-B."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Causal claims about fine-tuning improvement are justified through controlled before/after comparisons with a fixed random seed (Section III-E). The ablation on buggy lines and fine-tuning data size (Figure 9) uses controlled single-variable manipulation."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'Impact of Code Language Models on Automated Program Repair' and abstract frame results generally, but all experiments are on Java single-hunk bugs only. The paper states 'We focus on Java single-hunk bugs' (Section III) but the title and findings (e.g., 'CLMs have competitive fixing capabilities') do not bound to Java or single-hunk bugs."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section VIII discusses specific alternative explanations: data leaking threat (CLMs may have seen benchmarks), patch correctness evaluation subjectivity, and BLEU/CodeBLEU metric limitations. They also discuss why data leaking is 'less of a concern' since CLMs don't see buggy-fixed pairs."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper measures 'number of correct fixes' (test-validated and manually checked) and claims 'fixing capabilities.' The measurement directly corresponds to the claimed outcome with no proxy gap. They also acknowledge BLEU score is a poor proxy (Section VIII)."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Table I specifies all ten models with exact names and parameter counts: PLBART-base (140M), PLBART-large (400M), CodeT5-small (60M), CodeT5-base (220M), CodeT5-large (770M), CodeGen-350M/2B/6B, InCoder-1B/6B. These are specific publicly released checkpoints."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Figure 3 provides the actual prompt formats for all four CLM types, both with and without buggy lines, using a concrete bug example (Chart-20). Figure 4 shows the fine-tuning prompt format."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section III-E reports: 'batch size is one,' 'Adam optimizer with a learning rate of 1e−5,' 'only fine-tuned for one epoch,' 'fixed random seed.' Section III-G states 'ten candidate patches for each bug.'"
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. The CLMs are applied directly to generate patches from input prompts without any agent framework, tool use, or iterative refinement."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Fine-tuning data: 143,666 single-hunk fix instances from GitHub commits, split into 129,300 training and 14,366 validation (Section III-E). HumanEval-Java: manually converted from Python HumanEval with injected bugs, 164 bugs (Section III-B). Benchmark selection criteria documented for Defects4J (130/108 single-hunk bugs) and QuixBugs (40 bugs)."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section VIII 'Threats to Validity and Limitations' is a dedicated section discussing data leaking, Codex exclusion, and patch correctness evaluation."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section VIII discusses threats specific to this study: training data of CLMs may contain APR benchmark bugs, Codex is excluded because it's a black box, manual patch correctness checking could be subjective, and BLEU/CodeBLEU metrics are misleading for this task."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "The paper states 'We focus on Java single-hunk bugs' (Section III), excludes Codex and encoder-only models with justification (Section III-C), and Section VII-C calls for 'larger buggy programs' as future work, acknowledging HumanEval-Java contains mostly small programs."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section XI provides a replication package at [62] including: HumanEval-Java benchmark, generated patches for all four benchmarks by all CLMs, fine-tuned CLM models, and source code."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Fine-tuning data collection: 'we use the APR data shared in previous work [10], which is collected from commits of open-sourced GitHub Java projects' (Section III-E). HumanEval-Java: 'We manually convert the Python programs in HumanEval and their test cases into Java programs and Junit test cases, and then inject bugs' (Section III-B)."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. Data sources are standard benchmarks (Defects4J, QuixBugs, HumanEval) and publicly available code corpora."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The pipeline is documented: data source → instance extraction (143,666 single-hunk fixes) → train/validation split → fine-tuning → patch generation (10 per bug) → test case execution → manual correctness check. HumanEval-Java creation from Python to Java conversion is also documented."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgment section states: 'This work is partially supported by a J.P. Morgan AI Faculty Research Award.'"
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All author affiliations are listed: Purdue University (Nan Jiang, Lin Tan), Lynbrook High School (Kevin Liu), University of Alberta (Thibaud Lutellier). Authors are from academic institutions, not from companies whose products are evaluated."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "J.P. Morgan AI Faculty Research Award funds the work. J.P. Morgan has no specific product or model being evaluated in this study and no direct financial stake in which CLM performs best."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is provided. The paper discloses the J.P. Morgan award but does not include an explicit declaration about whether authors hold relevant patents, equity, or other interests."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "While the paper discusses data sources for each CLM (Table I: BigQuery, CodeSearchNet, THEPILE, GitHub/GitLab), it does not state explicit training data cutoff dates for any of the ten models."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section III-B explicitly checks: 'By checking CodeSearchNet and BigQuery... we find that four repositories used by the Defects4J benchmark are also in CodeSearchNet, and the whole Defects4J repository is included by BigQuery.' They create HumanEval-Java to mitigate this."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "The paper creates HumanEval-Java specifically to address contamination: 'Since HumanEval-Java is converted from HumanEval and the bugs are manually injected, none of the CLMs would have seen it before. Thus, it is the fairest benchmark' (Section III-B)."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study. The work evaluates code language models on benchmark datasets."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The study evaluates CLMs on code benchmarks."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Figure 10(b) reports generation time per correct fix (0.70–13.88 seconds across models). Figure 10(c) reports GPU memory requirements (0.5–25 GB). Section VI discusses time and memory efficiency in detail."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The paper reports per-inference time and GPU memory but does not state total computational budget: no total GPU hours, total training time for fine-tuning, or overall experiment cost. Hardware constraints are only mentioned in passing ('batch size is one due to our hardware constraints')."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Section III-E states: 'We set a fixed random seed when fine-tuning different models to minimize variance for a consistent, fair comparison.' Only a single seed is used; no sensitivity analysis across seeds."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The number of independent experimental runs is not explicitly stated. The fixed random seed implies a single run, and 'ten candidate patches for each bug' is stated (Section III-G), but the number of training/evaluation runs is not explicit."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper mentions using 'a validation dataset with 14,366 instances to tune the hyper-parameters (e.g., number of training epochs)' (Section III-E) but does not report how many configurations were tried or the search method used."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Section III-E states hyperparameters were tuned on a validation dataset (14,366 instances), meaning configuration selection was done on validation data, not the test benchmarks."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper compares 10 CLMs × 4 benchmarks × multiple conditions (with/without buggy lines, with/without fine-tuning) without any multiple comparison corrections. No statistical tests are used at all."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Authors Nan Jiang, Thibaud Lutellier, and Lin Tan are co-authors of CURE [7] and KNOD [11], two of the four APR baselines. The paper does not acknowledge the potential bias of authors evaluating their own prior systems as baselines."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Figure 10 explicitly plots fixing capability as a function of model size (parameters), generation time, and GPU memory. Section VI (RQ3) is dedicated to studying these trade-offs."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Section I-A discusses limitations of CodeXGLUE (abstracted code, BLEU-only evaluation) and argues for realistic benchmarks with test cases. Section VIII discusses that 'BLEU and CodeBLEU are indeed misleading in comparing APR techniques.' The creation of HumanEval-Java addresses fairness concerns."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is used. CLMs are applied directly to generate patches from prompts without any agent framework or orchestration."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "Section III-B discusses that CLMs pre-trained on public repositories may have seen APR benchmarks. They verify overlap between data sources (CodeSearchNet, BigQuery) and benchmarks (Defects4J), and create HumanEval-Java as a temporally clean benchmark."
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "The paper studies whether providing buggy lines as input affects results (Section IV-B, Table III). They compare prompts with and without buggy lines for all models, explicitly testing whether input features change evaluation outcomes."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "Section III-B identifies that 'four repositories used by the Defects4J benchmark are also in CodeSearchNet, and the whole Defects4J repository is included by BigQuery,' directly checking for overlap between pre-training data sources and test benchmarks."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": true,
    363         "justification": "The paper applies a concrete detection method: checking CodeSearchNet and BigQuery against Defects4J repositories. As a prevention method, they create HumanEval-Java from scratch to ensure no CLM has seen it during pre-training."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "The best CLM (InCoder-6B) without fine-tuning fixes 72% more bugs than the best DL-based APR technique (KNOD) across four benchmarks.",
    370       "evidence": "Table II shows InCoder-6B fixes 105 bugs total vs. KNOD's 61 bugs across Defects4J v1.2 (16 vs 20), Defects4J v2.0 (15 vs 13), QuixBugs (15 vs 10), and HumanEval-Java (59 vs 18).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "CLMs fix 6%–78% fewer bugs when buggy lines are explicitly provided as input.",
    375       "evidence": "Table III shows all ten CLMs fix fewer bugs with buggy lines. Figures 7(a) and 7(b) show qualitative examples of failures caused by buggy line confusion.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Fine-tuning with APR data improves CLMs' fixing capabilities by 31%–1,267%, with the best fine-tuned model (InCoder-6B) fixing 164% more bugs than the best APR technique.",
    380       "evidence": "Table IV shows fine-tuned InCoder-6B fixes 161 bugs vs. KNOD's 61. PLBART gains 31% improvement (least), CodeT5-small gains 1,267% (most, from 9 to 89).",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Fine-tuning makes CLMs over-rely on buggy lines, causing them to miss bugs fixable without fine-tuning.",
    385       "evidence": "Figure 8(b) shows SORT_NUMBERS bug that CLMs fix without fine-tuning but fail after fine-tuning, generating patches too similar to the buggy line.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "CodeT5 and CodeGen reach best performance with 10,000 fine-tuning instances; more data (up to 129K) reduces performance by 8%–19%.",
    390       "evidence": "Figure 9 shows CodeT5-large peaks at 10K instances (59 fixes) then drops to 52 with full dataset. CodeGen-6B similarly peaks at 10K.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "CodeT5 and InCoder have the best size efficiency, fixing the most bugs relative to their parameter count.",
    395       "evidence": "Figure 10(a) shows CodeT5 and InCoder models consistently above other model types at comparable parameter sizes. CodeT5-large (770M) fixes 125 bugs vs CodeGen-6B (6B params) fixing 131.",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No statistical significance tests",
    402       "detail": "All comparisons across 10 models × 4 benchmarks × multiple conditions use raw count differences and percentages without any statistical tests. Claims like '72% more bugs' have no uncertainty quantification, making it impossible to assess whether differences are robust or due to chance."
    403     },
    404     {
    405       "flag": "Single-seed experiments",
    406       "detail": "All fine-tuning experiments use a single fixed random seed (Section III-E). Seed choice alone can change results substantially — Henderson et al. (2018) showed RL results can vary by 2x across seeds — so single-seed results provide little information about result stability. The differences between models could shift with different seeds."
    407     },
    408     {
    409       "flag": "Author-baseline overlap",
    410       "detail": "Three of four authors (Nan Jiang, Thibaud Lutellier, Lin Tan) are co-authors of CURE, and the same three are also co-authors of KNOD — two of the four baseline APR techniques. This overlap is not acknowledged and creates potential bias in how baselines are set up or evaluated."
    411     },
    412     {
    413       "flag": "Contamination still present for 3 of 4 benchmarks",
    414       "detail": "Despite creating HumanEval-Java, the other three benchmarks (Defects4J v1.2, v2.0, QuixBugs) may still have contamination. The paper finds 4 Defects4J repos in CodeSearchNet and the whole Defects4J repo in BigQuery. Results on these benchmarks should be interpreted cautiously."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Cure: Code-aware neural machine translation for automatic program repair",
    420       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    421       "year": 2021,
    422       "relevance": "State-of-the-art DL-based APR technique used as baseline, showing that specialized models with code-aware search can fix bugs."
    423     },
    424     {
    425       "title": "Neural program repair with execution-based backpropagation",
    426       "authors": ["He Ye", "Matias Martinez", "Martin Monperrus"],
    427       "year": 2022,
    428       "relevance": "RewardRepair baseline that incorporates patch execution information into training, representing APR techniques that leverage test feedback."
    429     },
    430     {
    431       "title": "A syntax-guided edit decoder for neural program repair",
    432       "authors": ["Qihao Zhu", "Zeyu Sun", "Yuan-an Xiao", "Wenjie Zhang", "Kang Yuan", "Yingfei Xiong", "Lu Zhang"],
    433       "year": 2021,
    434       "doi": "10.1145/3468264.3468544",
    435       "relevance": "Recoder baseline that generates AST-level edits for APR, representing structured program repair approaches."
    436     },
    437     {
    438       "title": "Knod: Domain knowledge distilled tree decoder for automated program repair",
    439       "authors": ["Nan Jiang", "Thibaud Lutellier", "Yiling Lou", "Lin Tan", "Dan Goldwasser", "Xiangyu Zhang"],
    440       "year": 2023,
    441       "relevance": "Best-performing DL-based APR baseline using knowledge distillation and graph-transformer, representing SOTA specialized APR."
    442     },
    443     {
    444       "title": "Unified pre-training for program understanding and generation",
    445       "authors": ["Wasi Uddin Ahmad", "Saikat Chakraborty", "Baishakhi Ray", "Kai-Wei Chang"],
    446       "year": 2021,
    447       "relevance": "PLBART code language model evaluated in this study; encoder-decoder architecture pre-trained on masked/deleted span prediction."
    448     },
    449     {
    450       "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation",
    451       "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven Hoi"],
    452       "year": 2021,
    453       "relevance": "CodeT5 code language model evaluated in this study; multi-task pre-trained encoder-decoder for code."
    454     },
    455     {
    456       "title": "A conversational paradigm for program synthesis",
    457       "authors": ["Erik Nijkamp", "Bo Pang", "Hiroaki Hayashi", "Lifu Tu", "Huan Wang", "Yingbo Zhou", "Silvio Savarese", "Caiming Xiong"],
    458       "year": 2022,
    459       "relevance": "CodeGen decoder-only models evaluated in this study; trained on next token prediction for code generation."
    460     },
    461     {
    462       "title": "InCoder: A generative model for code infilling and synthesis",
    463       "authors": ["Daniel Fried", "Armen Aghajanyan", "Jessy Lin", "Sida Wang", "Eric Wallace", "Freda Shi", "Ruiqi Zhong", "Wen-tau Yih", "Luke Zettlemoyer", "Mike Lewis"],
    464       "year": 2022,
    465       "arxiv_id": "2204.05999",
    466       "relevance": "InCoder models evaluated in this study; decoder-only with masked span prediction enabling code infilling, achieving best overall performance."
    467     },
    468     {
    469       "title": "Evaluating large language models trained on code",
    470       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    471       "year": 2021,
    472       "arxiv_id": "2107.03374",
    473       "relevance": "Codex/HumanEval paper; HumanEval dataset is the basis for the new HumanEval-Java benchmark created in this work."
    474     },
    475     {
    476       "title": "Less training, more repairing please: Revisiting automated program repair via zero-shot learning",
    477       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    478       "year": 2022,
    479       "doi": "10.1145/3540250.3549101",
    480       "relevance": "Explores zero-shot LLM application to APR, complementary to this paper's fine-tuning approach."
    481     },
    482     {
    483       "title": "Can OpenAI's Codex fix bugs?: An evaluation on QuixBugs",
    484       "authors": ["Julian Aron Prenner", "Hlib Babii", "Romain Robbes"],
    485       "year": 2022,
    486       "relevance": "Early evaluation of Codex on APR benchmarks, directly related to evaluating CLMs for bug fixing."
    487     },
    488     {
    489       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    490       "authors": ["René Just", "Darioush Jalali", "Michael D. Ernst"],
    491       "year": 2014,
    492       "relevance": "Primary APR benchmark used in this study; most widely used Java bug benchmark in the APR community."
    493     }
    494   ],
    495   "engagement_factors": {
    496     "practical_relevance": {
    497       "score": 2,
    498       "justification": "Fine-tuned CLMs for APR are practically relevant to developers, and the replication package with models is released, though applying it requires ML expertise."
    499     },
    500     "surprise_contrarian": {
    501       "score": 2,
    502       "justification": "Surprising that off-the-shelf CLMs without any APR-specific training fix 72% more bugs than state-of-the-art specialized APR tools, and that providing buggy lines hurts rather than helps."
    503     },
    504     "fear_safety": {
    505       "score": 0,
    506       "justification": "No AI safety or security concerns raised; the work is about improving software reliability through automated bug fixing."
    507     },
    508     "drama_conflict": {
    509       "score": 1,
    510       "justification": "Mildly challenges specialized APR research by showing general-purpose CLMs outperform dedicated tools, and raises data leakage concerns in CLM evaluation."
    511     },
    512     "demo_ability": {
    513       "score": 2,
    514       "justification": "Replication package with fine-tuned models, code, and the HumanEval-Java benchmark is released on GitHub, enabling reproduction but not a one-click demo."
    515     },
    516     "brand_recognition": {
    517       "score": 1,
    518       "justification": "Published at ICSE (top SE venue) by Purdue University researchers, recognized in the SE community but not a household-name lab."
    519     }
    520   }
    521 }

Impressum · Datenschutz