scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33313B)
      1 {
      2   "paper": {
      3     "title": "Exploring Generalizable Automated Program Repair with Large Language Models",
      4     "authors": [
      5       "Viola Campos",
      6       "Ridwan Shariffdeen",
      7       "Adrian Ulges",
      8       "Yannic Noller"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv",
     12     "arxiv_id": "2506.03283"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "No single LLM generalizes well across programming languages for automated program repair; different models excel on different language-specific benchmarks (Java, JavaScript, Python, PHP). Combining models in a committee improves pass@5 by 2–5 percentage points over the best individual model. Adding test case information to prompts yields substantial improvements (up to +47% pass@1), while automated fault localization causes dramatic accuracy drops (7.83–16.20 percentage points of pass@1 on Defects4J) compared to perfect localization, with only 28/100 bugs having the correct fix location in the top-3 candidates. Open models are catching up to closed models, with DeepSeek R1 (distilled) surpassing some closed models.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "Section 7 states 'Upon acceptance, we will update the artifact with a reproduction package including the scripts for prompting the LLMs as well for the data analysis.' This is a promise of future release, not a current release. Only results and generated patches are currently available on figshare."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Section 7 states results and generated patches are 'openly available in our supplemental material' on figshare. Additionally, all four benchmarks (Defects4J, BugsInPy, BugsJS, BugsPHP) are publicly available datasets."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper mentions model names and temperature but not the computational environment for running experiments."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. Scripts are promised for future release upon acceptance (Section 7)."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Tables 2, 4, 6, and 7 report pass@k as point estimates without confidence intervals or error bars. No ± notation or CI notation appears anywhere in the results."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Section 3.4 states: 'we apply the Wilcoxon signed-rank test at a significance level of α = 0.05.' Results tables use bold for best and underline for results not significantly different from the best."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Effect sizes are reported as absolute percentage differences with baselines for context. For example, Table 4 shows base vs. test prompt differences (e.g., 19.02% → 45.33% for Claude 3.7 on Java), and the text reports specific improvements like '+47% pass@1' and '-16.20%' for FL comparisons."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The number of generations per bug (n=15) is justified by citing prior work: 'Based on the standard deviation analysis of pass@1 for LLM-based APR from [27], we use n = 15 as a reasonable, yet manageable number of generations.' The 100-bug subsets are justified through stratified sampling preserving complexity distributions (Section 4.1.2)."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Results are reported as pass@k point estimates computed from 15 generations (3 runs × 5 samples). No standard deviation, IQR, or variance across the 3 independent runs is reported. The reader cannot assess result stability."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "13 LLMs are compared against each other, including both open and closed models. The base prompt serves as a baseline condition, with test and line-level localization prompts as variations."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Models include recent frontier models: Claude 3.7 Sonnet, OpenAI o3-mini, DeepSeek R1, Gemini 2.0 Flash, and Qwen 2.5 Coder. Model selection was based on recent code-focused leaderboards (Section 3.2)."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The study systematically varies prompt components: base (code only), test (adding test info), and line-level localization (adding fix location hints). This functions as an ablation of prompt ingredients, with Tables 4 and 6 showing the impact of each component."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Two metrics are reported throughout: pass@1 (success with one candidate) and pass@5 (success among five candidates), following the Chen et al. formulation."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "Section 3.4 states 'a manual review of patches is infeasible at scale.' Evaluation is entirely automated using test-suite plausibility. No human review of patch quality was conducted."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "Section 3.5 mentions preliminary experiments to select the prompt format: 'In preliminary experiments, we evaluated different methods for integrating such line-level localization information into APR prompts.' It is unclear whether these preliminary experiments used separate data from the reported results, creating potential data snooping."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by programming language (4 benchmarks), prompt type (base/test/LL), patch complexity (single-line/single-hunk/multi-hunk in Table 7), and model type (open vs. closed). Figure 2 shows per-model unique fix contributions via Venn diagrams."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Figure 1 analyzes Python indentation errors as a systematic failure mode. Section 4.2.2 discusses cases where automated FL fails to identify the correct function (72/100 bugs). The paper also discusses cases where plausible patches are produced despite incorrect localization (attributed to test overfitting)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several negative results are reported: line-level localization hurts accuracy for 4/6 LLMs on PHP (Section 4.2.1); automated FL causes dramatic accuracy drops (Table 6); no single model generalizes across languages; Python performance is surprisingly poor due to indentation issues."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract's three key claims are all supported: (1) 'Different LLMs tend to perform best for different languages' — Table 2 shows 4 different top models across 4 benchmarks. (2) 'Combining models by pooling repairs adds value' — Table 5 and Figure 3 show ensemble gains. (3) 'significant drops in accuracy from imperfect FL' — Table 6 shows pass@1 drops of 7.83–16.20 percentage points."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Causal claims like 'including information about the failing test case significantly boosted the repair performance' are supported by controlled comparisons where only one prompt component changes while all else remains equal (same models, same bugs, same evaluation). This is a valid single-variable manipulation design."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 5 explicitly states: 'Our broad selection of models and benchmarks reduces the threat to external validity; however, we cannot claim generality beyond our experiments.' The paper appropriately scopes findings to the tested models, languages, and benchmarks."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 5 discusses several alternative explanations: data leakage (with specific leak ratios from Zhou et al.), output variability of LLMs, the distinction between plausibility and correctness (test overfitting), prompt optimization effects, and repair time differences between models."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section 3.4 explicitly distinguishes between plausible patches (passing tests) and correct patches (satisfying intended requirements): 'A patch may pass all tests but still fail to implement the intended functionality.' They acknowledge using plausibility as a proxy and discuss test overfitting at length."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "All 13 models are specified with identifiable names and sizes: Claude 3.7 Sonnet, Claude 3.5 Haiku, Gemini 1.5 Pro, Gemini 2.0 Flash, GPT-4o (Nov 11 2024 version), o3-mini (Jan 31 2025 version), CodeLlama 13B/70B, DeepSeek Coder 33B, DeepSeek R1 distilled to Llama 70B, Qwen 2.5 Coder 32B, Llama 3.3 70B, CodeGemma 7B. Version dates are given for continuously-updated OpenAI models."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Full prompt templates are provided in Listings 1–4, showing system messages and user prompts with placeholders for benchmark code. The fill values (buggy functions, test cases, error messages) come from the public benchmarks, making prompts fully reconstructible. Supplemental material is also referenced."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 3.4 states: 'each respective model's standard setting with a temperature of 1.0.' Temperature is the key sampling hyperparameter. Other settings are stated to use defaults."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. Section 3.5 states: 'the prompts are not iterative, meaning that each model processes a sample/query without follow-up interactions.' All experiments use single-prompt zero-shot setups."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The filtering pipeline is documented: benchmarks are filtered for reproducibility, single-function bugs, and single-hunk bugs (Table 1 shows counts at each stage). For the 100-bug subsets, stratified sampling by complexity level is described (Section 4.1.2, Table 3), ensuring all repositories are represented."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5 'Discussion & Threats to Validity' provides substantial discussion of limitations across multiple paragraphs covering data leakage, LLM variability, prompt optimization, repair time, and plausibility vs. correctness."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 5 discusses specific threats: Defects4J has a leaked ratio of 0.41% and BugsInPy 11.0% (citing Zhou et al.); plausibility vs. correctness as a construct validity concern (citing Petke et al.); DeepSeek R1's longer inference time creating unfair comparison; prompt optimization was intentionally avoided to prevent model-specific biases."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The paper explicitly states scope boundaries: single-function fixes only (excluding multi-function), zero-shot prompts only (excluding iterative/agentic workflows), no prompt optimization was performed, repair time was not analyzed rigorously, and 'we cannot claim generality beyond our experiments' (Section 5)."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 7 states: 'The data that support the findings of this study, including all our results and the generated patches, are openly available in our supplemental material: https://figshare.com/s/947fd7030f10a67a1c9f.' This includes raw patch outputs."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 3.3 describes benchmark selection criteria (diverse languages, real bugs, executable with test suites, human ground-truth patches, sufficient reproducible bugs). Patch generation is described: 15 patches per bug via 3 independent runs of 5 generations each at temperature 1.0."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard public benchmarks (Defects4J, BugsInPy, BugsJS, BugsPHP)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Table 1 shows the filtering pipeline with counts at each stage (all bugs → single-file → single-function → single-hunk → single-line). Section 4.1.2 describes stratified sampling for 100-bug subsets with the distribution shown in Table 3. Reproducibility checks are described per benchmark."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding sources, grants, or sponsors are mentioned anywhere in the paper. There is no acknowledgments section listing funding."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly listed: RheinMain University of Applied Sciences, SonarSource (Singapore), and Ruhr University Bochum. Section 8 provides a disclaimer regarding SonarSource."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, making independence impossible to verify. One author is affiliated with SonarSource, a company in the code quality space, which could have a commercial interest in APR outcomes. Section 8 provides a disclaimer but no formal independence statement."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No formal competing interests or financial interests statement is provided. Section 8 has a disclaimer about SonarSource but does not constitute a standard financial interests declaration covering patents, equity, or other commercial interests."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "Model version dates are given for GPT-4o (Nov 11 2024) and o3-mini (Jan 31 2025), but these are release dates, not training data cutoff dates. Training cutoff dates are not stated for any of the 13 models evaluated."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "Section 5 explicitly discusses overlap: 'we have to assume that some of the benchmark data may have been included in the training corpora of some of the LLMs.' They cite Zhou et al. showing Defects4J has 0.41% leaked ratio and BugsInPy 11.0%, and Ramos et al. on memorization in open-source models."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section 5 addresses contamination by citing Zhou et al.'s LessLeak-Bench analysis of leaked ratios and Ramos et al.'s study of memorization. They note 'BugsInPy obviously has some data leakage issues' yet was the most challenging benchmark, providing an empirical counterpoint to contamination concerns."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants. This is a benchmark evaluation of LLMs on code repair tasks."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The study evaluates LLM performance on public code benchmarks."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference costs, API costs, or per-example latencies are reported despite generating approximately 195,000 patches. Section 5 mentions timing differences between models ('spanning between seconds and several minutes') but provides no specific cost data."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No total computational budget is stated. No GPU hours, total API spend, or hardware specifications are reported. Section 5 acknowledges: 'Our study methodology did not limit the time for model inference or patch validation.'"
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "Three independent runs of 5 generations each are performed, but results are aggregated into a single pass@k estimate. No breakdown by run or analysis of how results vary across the 3 independent runs is provided."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Section 3.4 states: 'We distributed the generation across three independent runs, each generating five candidate patches using each respective model's standard setting with a temperature of 1.0.' Total n=15 per bug."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Section 3.5 mentions 'preliminary experiments' to evaluate prompt integration methods for line-level localization, but does not report how many configurations were tried or the compute spent on this search."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The LL prompt format was selected from preliminary experiments ('The most effective strategy was found to be a simple comment \"TODO: Fix here:\"') but the paper does not report all configurations tried, what data was used for selection, or the selection criterion beyond effectiveness."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "Wilcoxon signed-rank tests are applied across 13 models × 4 languages (52+ comparisons per table) at α = 0.05 with no mention of Bonferroni, Holm, or other multiple comparison correction."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "The authors do not propose their own APR system; they evaluate 13 third-party LLMs using standardized prompts and public benchmarks. No self-comparison bias exists since they are not comparing their own tool against baselines."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "Section 5 acknowledges 'Different resource demands across the models, in particular with respect to timing, can lead to unfair comparison' but does not report performance as a function of compute budget. DeepSeek R1 is noted as slower but no quantification is provided."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "Section 3.4 discusses construct validity at length: plausibility vs. correctness of patches, test overfitting as a proxy issue, and comparison of metrics (plausibility, TCE, SYE). They cite Petke et al. who found overfitting 'may be less problematic than previously assumed.'"
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is used. All models are evaluated using the same zero-shot prompt templates, eliminating scaffold confounds by design."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": true,
    347         "justification": "Section 5 discusses temporal leakage: 'Related works, e.g., RepairBench used a dataset with more recent data, whose time period is now also included in the cut-off dates of the latest models.' They cite Zhou et al. on leaked ratios of the specific benchmarks used."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks answer information through context. Different prompt variants provide different amounts of information (base vs test vs LL) but this is studied as a variable, not as a leakage concern."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether bugs from the same projects within benchmarks create non-independence. Defects4J contains bugs from 17 projects and BugsInPy from 17 projects, but potential correlations within projects are not addressed."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection method is applied by the authors. They cite external analyses (Zhou et al.'s leaked ratios, Ramos et al.'s memorization study) but do not apply their own canary strings, membership inference, or decontamination techniques."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "Different LLMs tend to perform best for different programming languages, making cross-platform single-LLM repair difficult.",
    369       "evidence": "Table 2 shows four different models leading on the four benchmarks: Claude 3.7 Sonnet on Java (22.69% pass@1), Claude 3.5 Haiku on JavaScript (14.48%), DeepSeek R1 distilled on PHP (18.43%), and Gemini 2.0 Flash on Python (12.51%).",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Including test case information significantly boosts LLM-based repair performance across all models and languages.",
    374       "evidence": "Table 4 (Section 4.1.2) shows consistent improvements with the test prompt, reaching up to +47% pass@1 (DeepSeek R1 on Python). Python shows the strongest average improvement (+34.7% pass@1 averaged over all models).",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Combining models by pooling repairs adds value, with a committee of expert models outperforming any single model.",
    379       "evidence": "Table 5 and Figure 3 show ensemble gains: pass@5 improved in 14/16 experiments with the test prompt. Best improvements include JavaScript from 68.00% to 71.68% (o3-mini & DeepSeek R1) and PHP from 58.44% to 63.12%.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Automated fault localization leads to significant drops in repair accuracy compared to perfect localization.",
    384       "evidence": "Table 6 (Section 4.2.2) shows pass@1 drops of 7.83–16.20 percentage points when using FLACOCO instead of perfect FL on Defects4J. The actual fix location appeared in the top-3 candidates for only 28/100 faults.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Open models are catching up to closed models in APR capabilities.",
    389       "evidence": "Figure 4 (Section 4.4) shows a positive trend for open models over time, with DeepSeek R1 (distilled) achieving pass@5 of 25.85% (Table 2), surpassing several closed models.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "LLMs show good robustness when repairing bugs of higher patch complexity.",
    394       "evidence": "Table 7 shows average pass@1 dropping from 45.06% (single-line) to 34.37% (single-hunk) to 27.50% (multi-hunk). Performance improves from single-line to single-hunk in 9/48 cases and from single-hunk to multi-hunk in 10/48 cases.",
    395       "supported": "moderate"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "Plausibility as proxy for correctness",
    401       "detail": "The paper uses test-passing (plausibility) as the primary evaluation metric for ~195,000 patches without any manual correctness verification. Test overfitting means patches may pass tests without implementing intended fixes. While acknowledged and discussed (Section 3.4), this may inflate apparent fix rates relative to true correctness."
    402     },
    403     {
    404       "flag": "No multiple comparison correction",
    405       "detail": "Wilcoxon signed-rank tests are applied at α = 0.05 across many model-language-prompt comparisons (potentially hundreds of tests) without any family-wise error rate correction such as Bonferroni or Holm."
    406     },
    407     {
    408       "flag": "No cost or compute budget reported",
    409       "detail": "Approximately 195,000 patches were generated across 13 models and 4 benchmarks, but no API costs, compute time, or total budget is reported. This limits assessment of practical viability and fairness of model comparisons."
    410     },
    411     {
    412       "flag": "Potential benchmark contamination",
    413       "detail": "Defects4J and BugsInPy are well-known benchmarks likely present in LLM training data. The paper acknowledges Defects4J has 0.41% and BugsInPy 11.0% leaked ratios, but applies no decontamination or detection method of its own."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Evaluating Large Language Models Trained on Code",
    419       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    420       "year": 2021,
    421       "arxiv_id": "2107.03374",
    422       "relevance": "Introduces the pass@k metric and HumanEval benchmark used for evaluating LLM code generation; foundational for this paper's evaluation methodology."
    423     },
    424     {
    425       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    426       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    427       "year": 2022,
    428       "doi": "10.1145/3540250.3549101",
    429       "relevance": "Pioneered zero-shot LLM-based program repair, establishing the approach this paper extends across languages and models."
    430     },
    431     {
    432       "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each using ChatGPT",
    433       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    434       "year": 2024,
    435       "doi": "10.1145/3650212.3680323",
    436       "relevance": "Demonstrates LLM-based APR with conversational feedback and cost analysis, directly relevant to evaluating LLM repair capabilities."
    437     },
    438     {
    439       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    440       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    441       "year": 2023,
    442       "doi": "10.1109/ICSE48619.2023.00129",
    443       "relevance": "Extensive prior evaluation of 9 LLMs on APR benchmarks across languages, directly compared with this paper's expanded study."
    444     },
    445     {
    446       "title": "RepairBench: Leaderboard of Frontier Models for Program Repair",
    447       "authors": ["André Silva", "Martin Monperrus"],
    448       "year": 2024,
    449       "arxiv_id": "2409.18952",
    450       "relevance": "Provides a standardized APR leaderboard for frontier models; this paper adopts RepairBench's test prompt template."
    451     },
    452     {
    453       "title": "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair",
    454       "authors": ["André Silva", "Sen Fang", "Martin Monperrus"],
    455       "year": 2024,
    456       "arxiv_id": "2312.15698",
    457       "relevance": "Explores fine-tuned LLMs for APR with different input/output representations, evaluating systematic prompt and model choices."
    458     },
    459     {
    460       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    461       "authors": ["Islem Bouzenia", "Premkumar T. Devanbu", "Michael Pradel"],
    462       "year": 2024,
    463       "arxiv_id": "2403.17134",
    464       "relevance": "Demonstrates agentic LLM-based APR with iterative repair cycles, contrasting with this paper's single-prompt approach."
    465     },
    466     {
    467       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    468       "authors": ["John Yang", "Carlos E Jimenez", "Alexander Wettig", "Kilian Lieret"],
    469       "year": 2024,
    470       "relevance": "Key agentic APR system demonstrating LLM-driven software engineering, related to the paper's discussion of future agentic workflows."
    471     },
    472     {
    473       "title": "AutoCodeRover: Autonomous Program Improvement",
    474       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    475       "year": 2024,
    476       "doi": "10.1145/3650212.3680384",
    477       "relevance": "Autonomous LLM-based program improvement agent relevant to the paper's discussion of agentic APR approaches."
    478     },
    479     {
    480       "title": "The Fact Selection Problem in LLM-Based Program Repair",
    481       "authors": ["Nikhil Parasaram", "Huijie Yan", "Boyu Yang"],
    482       "year": 2024,
    483       "arxiv_id": "2404.05520",
    484       "relevance": "Studies how different information in prompts affects LLM-based APR; directly informs this paper's prompt design and the finding that test information is a key signal."
    485     },
    486     {
    487       "title": "Impact of Code Language Models on Automated Program Repair",
    488       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    489       "year": 2023,
    490       "doi": "10.1109/ICSE48619.2023.00125",
    491       "relevance": "Studies fine-tuned code LLMs for APR across benchmarks, providing a baseline comparison for this paper's zero-shot evaluation."
    492     },
    493     {
    494       "title": "LessLeak-Bench: A First Investigation of Data Leakage in LLMs Across 83 Software Engineering Benchmarks",
    495       "authors": ["Xin Zhou", "Martin Weyssow", "Ratnadira Widyasari"],
    496       "year": 2025,
    497       "arxiv_id": "2502.06215",
    498       "relevance": "Quantifies data leakage rates for SE benchmarks including Defects4J and BugsInPy, directly cited to address contamination threats in this paper."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 2,
    504       "justification": "APR practitioners can use the model comparison insights and model committee strategy, but no tool or code is currently released."
    505     },
    506     "surprise_contrarian": {
    507       "score": 1,
    508       "justification": "The finding that no single LLM dominates across languages is somewhat expected; the dramatic impact of automated fault localization is noteworthy but not shocking."
    509     },
    510     "fear_safety": {
    511       "score": 0,
    512       "justification": "No security or safety concerns raised; the paper focuses on bug-fixing capabilities of LLMs."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "No controversy or conflict; a straightforward empirical comparison study."
    517     },
    518     "demo_ability": {
    519       "score": 0,
    520       "justification": "No demo, tool, or released code; scripts are promised upon acceptance but not yet available."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "From academic labs (RheinMain, Ruhr University) and SonarSource; evaluates well-known models (GPT-4o, Claude, Gemini) but the paper itself is not from a major AI lab."
    525     }
    526   }
    527 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs