scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32431B)
      1 {
      2   "paper": {
      3     "title": "Practical Program Repair in the Era of Large Pre-trained Language Models",
      4     "authors": [
      5       "Chunqiu Steven Xia",
      6       "Yuxiang Wei",
      7       "Lingming Zhang"
      8     ],
      9     "year": 2022,
     10     "venue": "arXiv",
     11     "arxiv_id": "2210.14179",
     12     "doi": "10.48550/arXiv.2210.14179"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "This paper presents the first extensive evaluation of 9 pre-trained language models (125M to 20B parameters) for automated program repair across 5 benchmarks in 3 languages. PLMs directly applied without finetuning outperform all 20 existing APR tools on Defects4J 1.2, with 109 bugs correctly fixed vs 67 by the best baseline (AlphaRepair). The study demonstrates a consistent scaling effect where larger models fix more bugs with better compilation rates, and shows that infilling-style APR using suffix context significantly improves repair over generative-only approaches. Increasing sampling from 200 to 2000 samples per bug and adding simple repair templates further boosts a single model (INCODER) from 37 to 78 correct fixes on Defects4J 1.2.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Section VI states 'we carefully performed the analysis and released the correct patches and code used to perform the experiments for public evaluation [74]' with a figshare link, though the URL (https://figshare.com/s/temp) appears to be a placeholder."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "All 5 evaluation benchmarks (Defects4J 1.2/2.0, QuixBugs-Java/Python, ManyBugs) are publicly available standard APR datasets. Additionally, the authors claim to release patches via figshare [74]."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "Section IV-B mentions 'Python using PyTorch' and specific hardware (32-Core workstation, Ryzen Threadripper PRO 3975WX, 256GB RAM, NVIDIA RTX A6000, Ubuntu 20.04.4 LTS), but no library versions, requirements.txt, or dependency specifications are provided."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided in the paper. The figshare reference [74] claims to release code but the paper itself contains no reproduction guide or README description."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Tables III-V, VII-VIII, and X report only point estimates (number of correct/plausible patches). No confidence intervals, error bars, or uncertainty measures are provided."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes numerous comparative claims (e.g., 'substantially outperform all existing APR techniques') based solely on comparing raw counts without any statistical significance tests."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper provides absolute and relative differences with context throughout, e.g., 'Codex can fix 32 more bugs than the existing best APR technique,' '40% (62/154) of the total bugs,' and percentage improvements for various comparisons."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No justification is given for the sample sizes. The paper uses standard benchmarks (391/438/40/40/185 bugs) without discussing whether these sizes are sufficient for the claims made or performing any power analysis."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance or standard deviation is reported. Results are from a single experimental run with stochastic sampling (nucleus sampling with temperature 0.8), but no spread measures across repeated experiments are provided."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Section IV-D describes comparison against 20 APR tools: 8 learning-based (AlphaRepair, RewardRepair, Recoder, DeepDebug, CURE, CoCoNuT, DLFix, SequenceR) and 12 traditional (TBar, PraPR, AVATAR, SimFix, FixMiner, CapGen, JAID, SketchFix, NOPOL, jGenProg, jMutRepair, jKali)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include recent state-of-the-art tools: AlphaRepair (2022), RewardRepair (2022), Recoder (2021), CURE (2021), which were all contemporary at the time of writing."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The study systematically varies repair settings (complete function, infilling, single line), compares generative vs infilling models on the same task (Table V), and RQ4 ablates sample size (200 vs 2000) and the addition of repair templates (Table X)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper reports correct patches, plausible patches, compilation rate (Figure 4), generation speed (Table VI), and entropy values (Table IX) as evaluation metrics."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section IV-E states 'we follow the standard practice in APR research and manually inspect each plausible patch for semantic equivalency' to determine correct patches."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The PLMs are applied zero-shot (no finetuning on APR data). The evaluation benchmarks (Defects4J, QuixBugs, ManyBugs) serve as held-out test sets since no model tuning is performed on them."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by dataset (5 benchmarks), by repair setting (3 settings), by model (9 PLMs), and by error type (syntactic vs semantic in Figure 4). Tables III-V provide per-dataset, per-model breakdowns."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Figure 4 analyzes syntactic and semantic error rates across models and settings. The paper discusses that generative models for single-line generation produce high syntactic error rates because they lack suffix context. Section VI discusses ManyBugs reproducibility failures."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several negative findings are reported: smaller models perform poorly (GPT-Neo 125M fixes very few bugs), generative models have high syntax error rates for single-line generation, complete function generation yields lower correct-to-plausible ratios than infilling, and generation speed drastically decreases with model size (71x slower for GPT-NeoX vs GPT-Neo 125M)."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims PLMs 'substantially outperform all existing APR techniques on all our datasets' — supported by Tables VII-VIII, which show 109 bugs fixed vs 67 by the best baseline (AlphaRepair) on Defects4J 1.2. The scaling-effect claim is supported by Tables III-V, and the suffix-importance claim by the Codex comparisons in Table V."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Causal claims like 'suffix code is important' are justified via controlled comparison of Codex with and without suffix on the same data (Table V). The scaling effect claim is supported by comparing multiple models of different sizes trained on the same data (GPT-Neo 125M/1.3B/2.7B, GPT-J, GPT-NeoX). RQ4 ablations use controlled single-variable manipulation."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Claims are generally bounded to tested settings. The abstract qualifies 'on all our datasets.' The external-threats discussion in Section VI states 'our findings may still not generalize to other datasets or languages.' Results are reported separately per dataset and language. The focus on single-function bugs under perfect fault localization is stated."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section VI extensively discusses data leakage as an alternative explanation, finding that 15% (20/128) of developer-matching fixes appear in the accessible training data. The authors also note that Codex outperforms GPT-NeoX despite being smaller because it is 'designed and finetuned for code generation' (Section V-A1), offering an alternative to pure scaling."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures correct/plausible patches on specific benchmarks and frames results in those terms. It does not overclaim beyond 'number of bugs fixed' on the tested benchmarks. The distinction between plausible (test-passing) and correct (semantically equivalent to developer fix) patches is explicit."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Table I specifies model names with parameter counts (GPT-Neo 125M/1.3B/2.7B, GPT-J 6.7B, GPT-NeoX 20B, Codex 12B, CodeT5 220M, INCODER 1.3B/6.7B). Section IV-B specifies Codex uses 'code-davinci-002 engine.' Training datasets are identified."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Figure 1 shows the complete prompt structure for function generation with a concrete Fibonacci example. Figures 2 and 3 show the infilling and single-line generation input formats. The prompt includes task description, examples, and the target bug."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section IV-B reports 'nucleus sampling with top p = 0.95, temperature = 0.8 and 200 samples per bug' and notes this is 'consistent with previous studies on PLMs [26], [28], [30].'"
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. PLMs are directly queried for patch generation without any scaffold, retry logic, or multi-step workflow."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section IV-C describes filtering benchmarks to find bugs matching each repair setting, with Table II showing the number of single-function (#SF), single-hunk (#SH), and single-line (#SL) bugs per dataset. For ManyBugs, they state 'we only use the 91 bugs where the results were reproducible by us' out of 185."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section VI 'Threats to Validity' provides dedicated discussion of both internal and external threats, including data leakage analysis and generalizability concerns."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section VI discusses specific threats: manual validation of patches as an internal threat, a detailed data leakage analysis finding that 15% (20/128) of developer-matching fixes appear in the accessible training data, and the specific limitation to 5 datasets across 3 languages."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper explicitly bounds scope to single-function bugs (Section IV-C), evaluation under perfect fault localization (Section IV-D), 3 programming languages, and 5 specific benchmarks. Section VI states 'our findings may still not generalize to other datasets or languages.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "Only the correct patches and code are claimed to be released [74]. Raw generation outputs (all 200 samples per bug across all models) are not made available for independent verification of the filtering and validation process."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section IV-C describes each benchmark in detail: Defects4J from open-source Java projects (391/438 bugs), QuixBugs from programming challenge (40 bugs each in Java/Python), ManyBugs from C open-source projects (185 bugs, 91 reproducible)."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. All data comes from standard public benchmarks (Defects4J, QuixBugs, ManyBugs)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline from benchmark filtering (Table II showing bug counts per category) through prompt construction (Figures 1-3), generation (200 samples, nucleus sampling), filtering (removing syntactic/semantic errors per Figure 4), and test validation is documented. For ManyBugs, the reduction from 185 to 91 reproducible bugs is explained."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding source or acknowledgments section is present in the paper text provided."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All three authors are listed with their affiliation at the University of Illinois at Urbana-Champaign. They evaluate third-party models and have no apparent affiliation with model providers."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding information is disclosed, making it impossible to assess funder independence."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No explicit training data cutoff dates are stated for any of the models. Table I lists training datasets (The Pile, CodeSearchNet, BigQuery) but not the date ranges. Codex's training data is listed as 'N.R.' (not released)."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section VI extensively discusses potential train/test overlap. The authors check whether fixed functions appear in training data for models with accessible training datasets, finding 15% (20/128) of developer-matching fixes are also found in training data."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section VI addresses contamination for Defects4J 1.2 specifically, analyzing developer-patch matches (66% of fixes differ from developer patches), checking training data for overlap (15% found), and noting QuixBugs is unlikely in training data due to low GitHub stars and synthetic nature."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study. It is a benchmark evaluation of PLMs on automated program repair datasets."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study evaluates PLMs on publicly available benchmark datasets."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants. All evaluation is automated using test suites and manual patch inspection by the authors."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants. Benchmark selection criteria are described in Section IV-C."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants or experimental conditions requiring randomization."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants. Manual patch inspection is not described as blinded."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Table VI reports generation speed (patches/minute) for each locally-run model across datasets and settings. Section V-B1 states 'generating 200 patches for each of the 3 settings (i.e., at most 600 patches in total) costs no more than 2.5 hours for each model.'"
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section IV-B specifies hardware (32-Core Ryzen Threadripper PRO 3975WX, 256GB RAM, NVIDIA RTX A6000, Ubuntu 20.04.4 LTS). Table VI provides generation speeds enabling total compute estimation. The 2.5-hour per-model budget is stated."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No seed sensitivity analysis is reported. Results are from a single experimental run with stochastic sampling, but no variance across seeds or repeated runs is provided."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
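    305         "justification": "Section IV-B states '200 samples per bug' as the default, and RQ4 uses '2000 samples per bug.' The number of generated patches is explicitly stated for all experiments. Note that these are per-bug sample counts within a single experimental run; no independent repeated runs are reported."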
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search was performed. The paper uses default settings from prior work (top_p=0.95, temperature=0.8) without exploring alternatives, despite acknowledging 'How to pick an optimal temperature value is not obvious for a problem such as APR.'"
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "The paper justifies using default generation settings by stating 'This generation setting is consistent with previous studies on PLMs [26], [28], [30]' (Section IV-B). No cherry-picking of configurations is evident."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical significance tests are performed in this paper, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors do not acknowledge the bias of evaluating their own system. They use baseline results 'directly from previous studies [17], [18], [24]' (Section IV-D), which partially mitigates re-implementation bias, but this advantage is not discussed."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "Table VI reports generation speed for each model, and Section V-A2 explicitly discusses the 'trade-off between repair effectiveness and time cost when using large models' (e.g., GPT-NeoX is 71x slower than GPT-Neo 125M). The 2.5-hour budget is compared against baselines' 5-hour timeout."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper uses standard APR benchmarks (Defects4J, QuixBugs, ManyBugs) without questioning whether they adequately measure practical program repair capability. No discussion of construct validity or benchmark limitations beyond the synthetic nature of QuixBugs."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is used. PLMs are directly queried for patch generation, so there is no scaffold confound to address."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "Section VI discusses that Defects4J 1.2 is 'the most widely studied dataset for APR' and checks whether patches appear in training data. They note QuixBugs 'is not part of the training data as it has low number of stars on GitHub and contains synthetic bugs.'"
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The paper does not discuss whether the evaluation setup (e.g., providing perfect fault localization, buggy function context) leaks information that would not be available in realistic repair scenarios. Perfect FL is used without framing it as a form of information leakage."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": true,
    357         "justification": "Section VI checks whether fixed functions appear in the training datasets for models with accessible training data (GPT-Neo, GPT-J, GPT-NeoX, CodeT5), finding 15% (20/128) overlap. This directly addresses train/test non-independence."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": true,
    362         "justification": "The authors apply a concrete detection method: checking whether developer-patched functions appear in training datasets, finding 20/128 matches. They also compare PLM-generated patches against developer patches, finding 66% produce different fixes (234/354)."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Directly applying PLMs can substantially outperform all existing APR techniques, fixing 109 bugs on Defects4J 1.2 vs 67 by the best baseline (AlphaRepair)",
    369       "evidence": "Table VII shows combined PLM results fix 109 single-function bugs. AlphaRepair fixes 67. Codex alone fixes 99. Figure 5 shows Venn diagram with 36 unique fixes by PLMs. (Section V-B1)",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "There is a scaling effect for APR where larger PLMs fix more bugs",
    374       "evidence": "Tables III-V show consistent improvement from GPT-Neo 125M to 1.3B to 2.7B to GPT-J to GPT-NeoX across all 5 datasets and 3 repair settings. Figure 4 shows error rates decrease with model size. (Sections V-A1, V-A3)",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Suffix code (infilling-style) is important for generating more fixes with higher compilation rate",
    379       "evidence": "Table V compares Codex single-line (prefix only) vs Codex suffix (prefix+suffix), showing improvement across all datasets (e.g., 32→39 correct on Defects4J 1.2). Figure 4 shows infilling achieves lowest semantic error rates. (Sections V-A1, V-A3)",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "PLMs consider correct patches to be more natural (lower entropy) than incorrect patches",
    384       "evidence": "Table IX shows mean entropy of correct patches is consistently lower than plausible patches, which are lower than non-plausible patches, across all models and settings. (Section V-C1)",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Entropy-based patch ranking enables faster validation of correct patches compared to random ordering",
    389       "evidence": "Figure 7 shows entropy rankings (both mean and sum) consistently outperform random patch ordering on Defects4J 1.2 across 5 PLMs. (Section V-C2)",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "PLM-based APR can be further boosted by increasing sample size and incorporating repair templates",
    394       "evidence": "Table X shows INCODER goes from 37 (200 samples) to 64 (2000 samples) to 78 (with templates) correct fixes on Defects4J 1.2, surpassing all baselines. (Section V-D)",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "The majority of correct PLM fixes are not from memorizing training data",
    399       "evidence": "Section VI reports 66% of fixes differ from developer patches. Only 15% (20/128) of developer-matching fixes appear in accessible training data. QuixBugs results are strong despite likely not being in training data. (Section VI)",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No statistical significance tests",
    406       "detail": "All comparative claims ('substantially outperform', 'scaling effect') are based on raw count comparisons without any statistical tests, despite stochastic sampling that could produce different results across runs."
    407     },
    408     {
    409       "flag": "No variance across runs reported",
    410       "detail": "Results use nucleus sampling (temperature=0.8) which is stochastic, yet only single-run results are reported. The number of correct fixes could vary across runs, but no variance or confidence intervals are provided."
    411     },
    412     {
    413       "flag": "Data leakage unverifiable for best-performing model",
    414       "detail": "Codex produces the strongest results but its training data is 'N.R.' (not released), making it impossible to verify contamination. The authors can only check leakage for open-source models, which are not the top performers."
    415     },
    416     {
    417       "flag": "Perfect fault localization assumption",
    418       "detail": "All main comparisons use perfect fault localization (ground-truth bug location known), which is unrealistic for practical deployment. The paper's title emphasizes 'Practical Program Repair' but this setting is not practical."
    419     },
    420     {
    421       "flag": "Artifact URL appears to be placeholder",
    422       "detail": "Reference [74] for released code and patches points to 'https://figshare.com/s/temp' — the 'temp' suggests this URL was never replaced with the actual figshare link."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Less training, more repairing please: Revisiting automated program repair via zero-shot learning",
    428       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    429       "year": 2022,
    430       "arxiv_id": "2207.08281",
    431       "relevance": "Proposes AlphaRepair, the zero-shot PLM-based APR approach using CodeBERT that this paper extends and substantially outperforms."
    432     },
    433     {
    434       "title": "Evaluating large language models trained on code",
    435       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    436       "year": 2021,
    437       "arxiv_id": "2107.03374",
    438       "relevance": "Introduces Codex, the best-performing PLM in this study, foundational work on LLMs for code generation."
    439     },
    440     {
    441       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    442       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    443       "year": 2020,
    444       "arxiv_id": "2002.08155",
    445       "relevance": "Pre-trained code model used by AlphaRepair, demonstrating feasibility of pre-trained models for code understanding tasks."
    446     },
    447     {
    448       "title": "InCoder: A generative model for code infilling and synthesis",
    449       "authors": ["Daniel Fried", "Armen Aghajanyan", "Jessy Lin"],
    450       "year": 2022,
    451       "arxiv_id": "2204.05999",
    452       "relevance": "Code infilling model evaluated in this study, demonstrating strong APR results using causal masking for bidirectional generation."
    453     },
    454     {
    455       "title": "Language models are few-shot learners",
    456       "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"],
    457       "year": 2020,
    458       "arxiv_id": "2005.14165",
    459       "relevance": "Introduces GPT-3 and the few-shot learning paradigm that underlies the zero-shot/few-shot PLM application strategy for APR."
    460     },
    461     {
    462       "title": "Patch generation with language models: Feasibility and scaling behavior",
    463       "authors": ["Sophia D. Kolak", "Ruben Martins", "Claire Le Goues", "Vincent J. Hellendoorn"],
    464       "year": 2022,
    465       "relevance": "Prior work evaluating Codex and smaller PLMs for patch generation, demonstrating scaling behavior on a small synthetic dataset."
    466     },
    467     {
    468       "title": "Can OpenAI's Codex fix bugs? An evaluation on QuixBugs",
    469       "authors": ["Julian A. Prenner", "Hlib Babii", "Romain Robbes"],
    470       "year": 2022,
    471       "relevance": "Prior evaluation of Codex for bug fixing on QuixBugs, one of the first studies applying commercial PLMs to APR."
    472     },
    473     {
    474       "title": "A syntax-guided edit decoder for neural program repair (Recoder)",
    475       "authors": ["Qihao Zhu", "Zeyu Sun", "Yuan-an Xiao"],
    476       "year": 2021,
    477       "relevance": "State-of-the-art learning-based APR tool using syntax-guided generation, one of the main baselines outperformed by PLMs."
    478     },
    479     {
    480       "title": "CURE: Code-aware neural machine translation for automatic program repair",
    481       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    482       "year": 2021,
    483       "relevance": "Learning-based APR tool using code-aware NMT, a key baseline demonstrating limits of supervised APR approaches."
    484     },
    485     {
    486       "title": "Neural program repair with execution-based backpropagation (RewardRepair)",
    487       "authors": ["He Ye", "Matias Martinez", "Martin Monperrus"],
    488       "year": 2022,
    489       "relevance": "Learning-based APR tool using execution feedback, represents the state-of-the-art in supervised program repair."
    490     },
    491     {
    492       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    493       "authors": ["René Just", "Darioush Jalali", "Michael D. Ernst"],
    494       "year": 2014,
    495       "relevance": "The most widely-used APR benchmark, central to the evaluation in this paper and most APR research."
    496     },
    497     {
    498       "title": "Scaling laws for neural language models",
    499       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    500       "year": 2020,
    501       "arxiv_id": "2001.08361",
    502       "relevance": "Establishes scaling laws for language models, providing theoretical basis for the scaling effect observed in this paper's APR results."
    503     }
    504   ],
    505   "engagement_factors": {
    506     "practical_relevance": {
    507       "score": 3,
    508       "justification": "Directly actionable — practitioners can use off-the-shelf PLMs via HuggingFace or Codex API for automated bug fixing with the described prompt strategies."
    509     },
    510     "surprise_contrarian": {
    511       "score": 2,
    512       "justification": "Shows general-purpose PLMs without any APR-specific training substantially outperform all 20 dedicated APR tools, challenging the value of domain-specific APR engineering."
    513     },
    514     "fear_safety": {
    515       "score": 0,
    516       "justification": "No safety or security concerns raised; the paper focuses on beneficial application of PLMs for fixing software bugs."
    517     },
    518     "drama_conflict": {
    519       "score": 1,
    520       "justification": "Mild tension: years of specialized APR tool development are outperformed by generic language models used zero-shot, potentially undermining the APR research agenda."
    521     },
    522     "demo_ability": {
    523       "score": 2,
    524       "justification": "Uses publicly available models from HuggingFace (GPT-Neo, INCODER) and Codex API, though no standalone tool or demo is released."
    525     },
    526     "brand_recognition": {
    527       "score": 2,
    528       "justification": "Features OpenAI's Codex prominently as the best performer; GPT-Neo/J/NeoX and INCODER are well-known in the ML community."
    529     }
    530   }
    531 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs