ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32585B)


      1 {
      2   "paper": {
      3     "title": "Empirical Evaluation of Large Language Models in Automated Program Repair",
      4     "authors": [
      5       "Jiajun Sun",
      6       "Fengjie Li",
      7       "Xinzhu Qi",
      8       "Hongyu Zhang",
      9       "Jiajun Jiang"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2506.13186",
     14     "doi": "10.48550/arXiv.2506.13186"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "CodeLlama-7B (code-specialized, fine-tuned) consistently outperforms the larger general-purpose LLaMA-2-13B on APR benchmarks, demonstrating that domain adaptation matters more than parameter scale. Correct patches overwhelmingly appear in the first 30 generations across all models, suggesting practical strategies for reducing computational cost. Prompt design significantly affects repair rates—adding repair examples improves all models, while integrating LLM-generated bug analysis helps weaker models but can hurt stronger models due to inaccurate diagnostics. LLMs perform substantially better on algorithmic assignment bugs than enterprise-grade project bugs, and better on Java than C/C++.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper states 'we release all generated patches, evaluation scripts, prompt templates, and dataset configurations at our homepage' but provides no URL or repository link anywhere in the paper text. A vague reference to a 'homepage' without an actual link does not count as released."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All six evaluation benchmarks (Defects4J, BugsCpp, IntroClass, IntroClass-Java, ConDefects-Java, ConDefects-Py) are publicly available datasets with citations. The authors also claim to release generated patches, though no URL is provided."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No environment specifications, dependency lists, Docker files, or hardware details are provided. The paper does not describe what GPU hardware was used or any software environment details."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided. While the paper describes the experimental methodology, there are no commands, scripts, or README-level instructions that would allow replication."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All results in Tables IV, V, and VI are reported as point estimates (e.g., '15.7%' repair rate) with no confidence intervals, error bars, or uncertainty measures."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper makes numerous comparative claims (e.g., 'DeepSeek-Coder achieves the best repair performance') based solely on comparing raw numbers without any statistical significance tests (no p-values, t-tests, or bootstrap tests)."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports percentage improvements with baseline context throughout, e.g., 'CodeLlama's repair count increases by 206.7%', 'LLaMA's correct repairs rising from 1 to 32 (a 3100% increase)', and absolute repair rates across all conditions, providing sufficient magnitude context."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No justification for why these specific benchmark sizes are adequate for the claims made. No power analysis or discussion of whether, e.g., 106 bugs in BugsCpp is sufficient for reliable conclusions."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single experimental runs with no assessment of result stability across seeds or repeated experiments."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Four LLMs are compared against each other across all datasets, and multiple prompt settings (zero-shot, one-shot, two-shot, analysis-augmented) serve as baseline comparisons."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The evaluated models (CodeLlama 2023, LLaMA-2 2023, StarCoder 2023, DeepSeek-Coder 2024) are contemporary and widely used in APR research. However, notable omissions include proprietary models like GPT-4 and Claude which had been used in recent APR work."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "RQ4 systematically ablates prompt components: zero-shot vs one-shot (removing the example), and the impact of adding bug analysis. These controlled comparisons isolate the effect of prompt design choices."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Three metrics are reported: C/P (correct/plausible count), Repair Rate (RRate), and Precision. These capture different aspects of repair effectiveness."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Section III-E states 'All plausible patches are then manually inspected by the first two authors. If a patch is semantically equivalent to the ground truth patch, it is classified as a correct patch.' This two-stage validation (automated test suite + manual inspection) is a strength."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The authors use established benchmarks without any tuning. The reduction from 200 to 30 patches was informed by preliminary results on Defects4J/BugsCpp and then applied to separate algorithmic datasets, maintaining separation."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by dataset (Tables IV, V, VI), programming language (C vs Java vs Python comparisons), model, prompt setting, bug length (Figure 5), and repair action types (Figure 6)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Figure 2 shows a specific failure case on BugsCpp with analysis of why LLMs failed (bug length). Figure 7 shows how incorrect bug analysis misleads models. Section IV-A2 discusses compilation failures (20%) and incorrect location modifications (76%) on DeepSeek-Coder."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Several negative results are reported: all LLMs fail badly on BugsCpp (3.5% average RRate), bug analysis integration hurts DeepSeek-Coder (drops 46.6% on ConDefects-Java), and LLaMA shows very poor performance across most benchmarks."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "Abstract claims about CodeLlama outperforming LLaMA (Tables IV, V), diminishing returns from model size (Tables IV, V), early correct patches (Figure 4), and prompt design impact (Table VI) are all supported by corresponding experimental results."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The paper claims 'fine-tuning on code-related tasks significantly enhances LLMs' repair capabilities' (Finding 2) by comparing CodeLlama-7B vs LLaMA-2-13B. This conflates fine-tuning with parameter count differences (7B vs 13B) and potentially different base model training. The prompt ablation studies (RQ4) are better controlled but still lack formal causal identification."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title 'Empirical Evaluation of Large Language Models in Automated Program Repair' implies broad coverage of LLMs, but the study tests only 4 open-source models (7B-33B) with no proprietary models (GPT-4, Claude). The abstract claims about 'modern, large-scale LLMs' are not well-bounded to the tested subset."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": true,
    136         "justification": "Section V-A discusses data leakage as an alternative explanation for model performance. RQ3 investigates bug length as a confounding factor for cross-language performance differences. The analysis of incorrect bug analysis (Figure 7) provides alternative explanations for the negative effect on stronger models."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures repair rate and precision directly on benchmark bugs and does not overclaim these as broader metrics like 'software quality' or 'developer productivity.' Claims match the measurement granularity."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Table II specifies CodeLlama-7B, LLaMA-2-13B, StarCoderBase (15.5B), and DeepSeek-Coder-33B-instruct with providers and years. These are specific enough to identify exact model weights for open-source models."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Figure 1 shows the full prompt structure including the actual guidance text ('// Provide a fix for the buggy function', 'You are a code analysis tool...'), the GCD example with actual code, and the template structure. Table III summarizes all four prompt configurations used."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "No generation hyperparameters are reported — temperature, top-p, top-k, max tokens, and other sampling parameters are not mentioned anywhere in the paper, despite generating over 600,000 patches."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The approach is direct LLM inference with prompts — no tool use, retry logic, feedback loops, or multi-step reasoning pipelines."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section III-B documents dataset filtering: selecting single-function bugs from Defects4J (255+228), BugsCpp (106), IntroClass (297 each), and ConDefects (563 each after filtering for cross-language assignment overlap and random selection of one submission per language)."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section V-B 'Limitation' and Section V-C 'Threats to Validity' (with Internal and External subsections) provide substantive discussion of study limitations."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Specific threats include: only 4 models tested with parameter sizes up to 33B while larger models exist; manual verification by two authors as a potential internal validity concern; datasets may not represent real-world bug complexity; and limited programming language coverage."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section V-B explicitly states: 'the capabilities of even larger or more recent models remain unexplored', 'real-world bugs may be more complex than those in the datasets', and 'additional programming languages not included in this study may pose unique challenges.'"
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "The paper claims to release 'all generated patches, evaluation scripts, prompt templates, and dataset configurations' but provides no URL. Without an accessible link, the raw data cannot be verified."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section III-B describes dataset selection in detail: sources, years, languages, number of bugs, and filtering criteria (single-function bugs, cross-language overlap for ConDefects, random selection of one submission per assignment)."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. Data sources are standard public benchmarks (Defects4J, BugsCpp, IntroClass, ConDefects)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The pipeline is documented: benchmark selection → single-function bug filtering (with counts: 255, 228, 106, 297, 297, 563, 563) → patch generation (200 or 30 per bug) → deduplication → test suite validation → manual inspection of plausible patches → classification as correct or not."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: Tianjin University, University of Electronic Science and Technology of China, and Chongqing University. They evaluate third-party open-source models, not their own products."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, so independence cannot be assessed. The authors evaluate open-source models from Meta, BigCode, and DeepSeek — no apparent conflict, but the lack of any funding disclosure is a gap."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No training data cutoff dates are stated for any of the four evaluated models. This is critical since Defects4J (2014) and IntroClass (2015) have been public for years before these models were trained."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "Section V-A acknowledges the risk that 'some of the buggy or fixed code used in selected benchmarks may partially or fully exist in the model's training data' but performs no actual analysis of train/test overlap (no membership inference, n-gram checks, or temporal analysis)."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "Defects4J (2014), IntroClass (2015), and the IntroClass-derived IntroClass-Java benchmark have been publicly available on GitHub for years before the 2023-2024 models were trained. The paper's only mitigation is using 'multiple datasets that differ in collection time' without concretely analyzing contamination risk for specific benchmarks."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants. This is a benchmark evaluation study using LLMs on bug datasets."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. Manual patch verification by authors does not constitute a human subjects study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "Despite generating over 600,000 patches and discussing computational cost as a key theme (RQ2), no actual inference costs (time, GPU hours, tokens, or monetary cost) are reported."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No total computational budget, GPU type, GPU hours, or hardware specifications are mentioned. The paper discusses reducing patches from 200 to 30 for efficiency but never quantifies the actual compute consumed."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of random seeds, seed sensitivity, or results across multiple seeds. All results appear to be from single experimental runs."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": true,
    307         "justification": "The paper explicitly states the number of patches generated per bug: 200 for Defects4J and BugsCpp (Section III-E), 30 for subsequent experiments (Section IV-B). Each patch generation constitutes a run."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "No hyperparameter search is described. Generation parameters (temperature, etc.) are not even reported, let alone any search over them."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "All four prompt configurations are reported with full results in Table VI. No cherry-picking of configurations — the paper systematically compares all settings and reports both improvements and degradations."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. The paper makes dozens of comparative claims across models, datasets, and prompt settings without any statistical testing."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": true,
    327         "justification": "The authors evaluate existing open-source LLMs rather than their own system, which mitigates the primary self-comparison bias concern. They do not re-implement baselines — they use the same generation pipeline for all models."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "While the paper discusses model parameter size vs performance (diminishing returns theme), it does not report actual compute costs. No GPU hours, inference time, or matched-compute comparisons are provided."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper uses six benchmarks without discussing whether they actually measure real-world repair capability. No analysis of construct validity — e.g., whether fixing single-function bugs from student assignments generalizes to real software maintenance."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is used. All models receive the same direct prompt-based generation pipeline, so there is no scaffold confound."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "Defects4J (2014) and IntroClass (2015) have been publicly available for nearly a decade before the 2023-2024 models were trained. The paper acknowledges diversity in 'collection time' but does not specifically analyze temporal leakage for older benchmarks."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": true,
    354         "justification": "Section III-E states 'we provided method-level perfect fault localization, eliminating potential confounding factors introduced by localization inaccuracies.' This is explicitly acknowledged as a deliberate design choice to isolate repair capability, not a leaked feature."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "No discussion of whether training corpora for the evaluated models include code from the same projects as the benchmarks (e.g., Apache Commons projects in Defects4J are widely available on GitHub)."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection or prevention methods are used. Section V-A discusses leakage risk conceptually ('it is plausible that some of the buggy or fixed code...may exist in the model's training data') but applies no detection techniques (no canary strings, membership inference, or n-gram overlap analysis)."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Fine-tuned CodeLlama-7B consistently outperforms the general-purpose LLaMA-2-13B across all evaluated datasets despite having fewer parameters.",
    371       "evidence": "Tables IV and V show CodeLlama fixing 40 vs 19 bugs on Defects4J v1.2, 34 vs 18 on v2.0, 5 vs 1 on BugsCpp, and similar advantages across all algorithmic benchmarks (Section IV-A, IV-B).",
    372       "supported": "strong"
    373     },
    374     {
    375       "claim": "Increasing model parameter size yields diminishing returns in repair effectiveness for code-specialized models.",
    376       "evidence": "DeepSeek-Coder (33B, 4.71x CodeLlama's parameters) repaired only 4 more bugs than CodeLlama on Defects4J v1.2 (44 vs 40). StarCoder (15.5B) achieves performance competitive with DeepSeek-Coder (33B) on Defects4J (Table IV, Section IV-A).",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Correct patches are predominantly generated in early iterations — 76.59% of DeepSeek-Coder's correct patches appear within the top 30 candidates.",
    381       "evidence": "Figure 4(a) shows cumulative distribution of correct patch rankings on Defects4J. For LLaMA and StarCoder, only one correct patch each falls outside the top 30. Figure 4(b) confirms 89.6% of StarCoder's correct patches on IntroClass are within top 30.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Providing repair examples in prompts substantially improves repair performance across all LLMs.",
    386       "evidence": "Table VI shows consistent drops when the repair example is removed (one-shot to zero-shot): LLaMA drops 85.7% on ConDefects-Java (7 to 1 fixes), CodeLlama drops 44%, StarCoder drops 29.3%. Average RRate falls from 11.5% to 8.9% on ConDefects-Java (Section IV-D1).",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "Incorporating LLM-generated bug analysis helps weaker models but hurts stronger models.",
    391       "evidence": "Table VI: LLaMA increases from 1 to 32 correct fixes on ConDefects-Java (+3100%), CodeLlama increases 53.6%. But DeepSeek-Coder drops 46.6% (118 to 63). Figure 7 shows an example of incorrect analysis misleading DeepSeek-Coder (Section IV-D2).",
    392       "supported": "moderate"
    393     },
    394     {
    395       "claim": "LLMs achieve significantly better repair performance on enterprise-grade Java bugs than C/C++ bugs.",
    396       "evidence": "Table IV: average RRate on Defects4J is 15.1% vs 3.5% on BugsCpp — a 76.5% reduction. Analysis in Section IV-A2 attributes this to longer bug lengths in C/C++ code.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "Smaller models exhibit complementary repair capabilities — each model generates unique correct patches not found by other models.",
    401       "evidence": "Figure 3 shows Venn diagrams: CodeLlama fixes 9 unique bugs on Defects4J v1.2 that no other model fixes, LLaMA contributes 1 unique fix despite being the weakest model overall (Section IV-A3).",
    402       "supported": "strong"
    403     }
    404   ],
    405   "red_flags": [
    406     {
    407       "flag": "No statistical testing",
    408       "detail": "All comparative claims across 4 models, 6 datasets, and 4 prompt settings are based on raw number comparisons with no significance tests, p-values, or bootstrap confidence intervals. With over 600,000 patches, generation stochasticity could easily account for small observed differences."
    409     },
    410     {
    411       "flag": "No uncertainty quantification",
    412       "detail": "Results appear to be from single experimental runs with no variance, standard deviation, or confidence intervals reported. LLM patch generation is inherently stochastic, yet no assessment of result stability is provided."
    413     },
    414     {
    415       "flag": "Missing generation hyperparameters",
    416       "detail": "Temperature, top-p, top-k, max tokens, and other sampling parameters are never reported despite generating 600,000+ patches. These parameters significantly affect generation diversity and quality."
    417     },
    418     {
    419       "flag": "Contamination risk unaddressed",
    420       "detail": "Defects4J (2014) and IntroClass (2015) have been publicly available on GitHub for nearly a decade. The evaluated models were trained in 2023-2024 on massive code corpora likely including these benchmarks. No contamination detection methods (membership inference, n-gram overlap) are applied."
    421     },
    422     {
    423       "flag": "Confounded fine-tuning comparison",
    424       "detail": "The claim that 'fine-tuning on code-related tasks significantly enhances repair capabilities' is based on comparing CodeLlama-7B vs LLaMA-2-13B, which confounds fine-tuning with parameter count (7B vs 13B) and potentially different base training. A fair comparison would require CodeLlama-7B vs LLaMA-2-7B."
    425     },
    426     {
    427       "flag": "No compute costs despite cost-effectiveness theme",
    428       "detail": "The paper identifies 'Cost-Effectiveness Tradeoffs' as a major gap in prior work and discusses reducing patch counts from 200 to 30, yet never reports actual compute costs (GPU type, GPU hours, inference time, or energy consumption)."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Code Llama: Open Foundation Models for Code",
    434       "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"],
    435       "year": 2023,
    436       "arxiv_id": "2308.12950",
    437       "relevance": "Code-specialized LLM derived from LLaMA via fine-tuning on code tasks, one of four models evaluated in this study."
    438     },
    439     {
    440       "title": "LLaMA 2: Open Foundation and Fine-Tuned Chat Models",
    441       "authors": ["H. Touvron", "L. Martin", "K. Stone"],
    442       "year": 2023,
    443       "arxiv_id": "2307.09288",
    444       "relevance": "General-purpose open-source LLM used as the non-code-specialized baseline in this APR evaluation."
    445     },
    446     {
    447       "title": "StarCoder: May the Source Be with You!",
    448       "authors": ["R. Li", "L. B. Allal", "Y. Zi"],
    449       "year": 2023,
    450       "arxiv_id": "2305.06161",
    451       "relevance": "Open-source code LLM evaluated for automated program repair capability across multiple benchmarks."
    452     },
    453     {
    454       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming — the Rise of Code Intelligence",
    455       "authors": ["D. Guo", "Q. Zhu", "D. Yang"],
    456       "year": 2024,
    457       "arxiv_id": "2401.14196",
    458       "relevance": "Largest code-specialized model evaluated (33B parameters), achieving best overall repair performance in this study."
    459     },
    460     {
    461       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    462       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    463       "year": 2023,
    464       "relevance": "First study applying LLMs to APR tasks, evaluating earlier-generation models on Defects4J, ManyBugs, and QuixBugs — a key predecessor to this work."
    465     },
    466     {
    467       "title": "Keep the Conversation Going: Fixing 162 out of 337 Bugs for $0.42 Each Using ChatGPT",
    468       "authors": ["C. S. Xia", "L. Zhang"],
    469       "year": 2023,
    470       "arxiv_id": "2304.00385",
    471       "relevance": "ChatRepair approach using conversational LLM interaction for APR, demonstrating cost-effective repair with commercial LLMs."
    472     },
    473     {
    474       "title": "ThinkRepair: Self-Directed Automated Program Repair",
    475       "authors": ["X. Yin", "C. Ni", "S. Wang"],
    476       "year": 2024,
    477       "relevance": "LLM-based APR technique using self-directed reasoning, part of the newer generation of LLM APR approaches."
    478     },
    479     {
    480       "title": "How Far Can We Go with Practical Function-Level Program Repair?",
    481       "authors": ["J. Xiang", "X. Xu", "F. Kong"],
    482       "year": 2024,
    483       "arxiv_id": "2404.12833",
    484       "relevance": "SRepair: function-level APR with LLMs evaluated primarily on Defects4J, a close comparison point for this study."
    485     },
    486     {
    487       "title": "RepairLlama: Efficient Representations and Fine-Tuned Adapters for Program Repair",
    488       "authors": ["A. Silva", "S. Fang", "M. Monperrus"],
    489       "year": 2023,
    490       "arxiv_id": "2312.15698",
    491       "relevance": "LoRA-based fine-tuning of CodeLlama specifically for APR tasks, directly relevant to the fine-tuning vs general-purpose model comparison."
    492     },
    493     {
    494       "title": "The Plastic Surgery Hypothesis in the Era of Large Language Models",
    495       "authors": ["C. S. Xia", "Y. Ding", "L. Zhang"],
    496       "year": 2023,
    497       "relevance": "FitRepair approach fine-tuning CodeT5 for APR, exploring the hypothesis that repair ingredients exist in nearby code."
    498     },
    499     {
    500       "title": "Impact of Code Language Models on Automated Program Repair",
    501       "authors": ["N. Jiang", "K. Liu", "T. Lutellier"],
    502       "year": 2023,
    503       "relevance": "Empirical study evaluating earlier code language models (CodeBERT, CodeT5) on APR, a predecessor to this work's evaluation of larger LLMs."
    504     },
    505     {
    506       "title": "ConDefects: A New Dataset to Address the Data Leakage Concern for LLM-based Fault Localization and Program Repair",
    507       "authors": ["Y. Wu", "Z. Li", "J. M. Zhang"],
    508       "year": 2023,
    509       "arxiv_id": "2310.16253",
    510       "relevance": "Benchmark dataset specifically designed to mitigate data leakage concerns in LLM-based APR evaluation, used as two of the six benchmarks in this study."
    511     }
    512   ],
    513   "engagement_factors": {
    514     "practical_relevance": {
    515       "score": 2,
    516       "justification": "Provides actionable guidance on prompt design and model selection for APR practitioners, plus the finding that 30 patches suffice instead of 200."
    517     },
    518     "surprise_contrarian": {
    519       "score": 1,
    520       "justification": "The finding that smaller models generate unique patches not found by larger models is mildly surprising, but most results confirm expected trends."
    521     },
    522     "fear_safety": {
    523       "score": 0,
    524       "justification": "No AI safety or security concerns raised; this is a purely technical evaluation of code repair capabilities."
    525     },
    526     "drama_conflict": {
    527       "score": 0,
    528       "justification": "No controversy or provocative claims — a straightforward empirical comparison."
    529     },
    530     "demo_ability": {
    531       "score": 0,
    532       "justification": "Claims to release code and patches but provides no URL; nothing is demonstrably available to try."
    533     },
    534     "brand_recognition": {
    535       "score": 1,
    536       "justification": "Evaluates recognizable models (LLaMA, DeepSeek-Coder) but the research group and venue are not high-profile."
    537     }
    538   }
    539 }

Impressum · Datenschutz