scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (33195B)
      1 {
      2   "paper": {
      3     "title": "The Impact of Fine-tuning Large Language Models on Automated Program Repair",
      4     "authors": [
      5       "Roman Macháček",
      6       "Anastasiia Grishina",
      7       "Max Hort",
      8       "Leon Moonen"
      9     ],
     10     "year": 2025,
     11     "venue": "41st IEEE International Conference on Software Maintenance and Evolution (ICSME 2025)",
     12     "arxiv_id": "2507.19909",
     13     "doi": "10.1109/ICSME64153.2025.00042"
     14   },
     15   "scan_version": 3,
     16   "active_modules": ["experimental_rigor", "data_leakage"],
     17   "methodology_tags": ["benchmark-eval"],
     18   "key_findings": "Full fine-tuning of LLMs on bug-fix pairs improves poorly-performing models (CodeT5, Bloom) but degrades the best-performing ones (DeepSeekCoder, StarCoder) due to data distribution mismatch and overfitting. Parameter-efficient fine-tuning with LoRA consistently outperforms both full fine-tuning and IA3, achieving up to 225% improvement over base models while using less than 1% of trainable parameters. LoRA hyperparameters (rank and scaling factor) have negligible impact on performance, suggesting default values are sufficient.",
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "A replication package with code and results is provided via Zenodo (doi:10.5281/zenodo.16359186), referenced in Section VI and footnote 1. The authors state their code 'builds on the code provided by Jiang et al., adapting it to accommodate the benchmarking of additional models and parameter-efficient fine-tuning.'"
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "All three benchmarks (QuixBugs, Defects4J, HumanEval-Java) are publicly available. The CLM fine-tuning dataset is available via the clm GitHub repository. The Zenodo replication package includes results data."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The paper mentions using A100 and V100 GPUs and the Hugging Face library for models and PEFT, but does not specify library versions, Python version, or provide a requirements.txt or Dockerfile. Notably, the authors criticize Jiang et al. for not specifying the Java version used, yet do not specify their own Java version either."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "The Zenodo replication package (doi:10.5281/zenodo.16359186) is explicitly described as enabling 'replication and verification of our work' (Section VI). The paper provides detailed preprocessing in Listing 1 and describes the experimental setup thoroughly in Section III."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results are reported as point estimates (counts of plausible patches, single CodeBLEU/exact-match values) with no confidence intervals or error bars anywhere in the paper."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The paper makes numerous comparative claims (e.g., 'LoRA performs better than IA3 in 21 out of 24 cases') based solely on comparing raw numbers without any statistical significance tests."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper provides baseline context for improvements: 'by using LoRA for CodeGen-2B, we used only 0.09% of trainable parameters of the full model, while achieving performance gains of 172%, 225%, 153% on the QuixBugs, HumanEval-Java and Defects4J benchmarks.' Tables consistently show base vs. fine-tuned numbers for comparison."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No justification is given for generating exactly 10 patches per problem (only cited as convention from related works), nor for the selection of benchmark sizes. The choice of models and benchmarks follows prior work without independent justification."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "All results appear to be from single experimental runs with no variance, standard deviation, or spread measures reported. The paper does not state whether experiments were repeated or report any measure of result stability."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Each RQ uses the non-fine-tuned base models as baselines (Table I). RQ3 compares PEFT against both base and full fine-tuning (Table V). Results are also compared with Jiang et al. [8] and Li et al. [58]."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Baselines include Jiang et al. (ICSE 2023) and Li et al. (ASE 2024), which are contemporary works. The models used (DeepSeekCoder, CodeLlama2, StarCoder) are recent code LLMs."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "RQ4 systematically varies LoRA hyperparameters (rank and scaling factor) across 7 values each to measure their individual contribution. The study also compares with/without buggy lines (RQ1), and base vs. full FT vs. LoRA vs. IA3 (RQ2-3), which constitute controlled ablations of the training regimen."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper uses plausible patch count, CodeBLEU, exact match, and training/validation loss as evaluation metrics (Section III-C and Table II)."
     89       },
     90       "human_evaluation": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Evaluation is entirely automated through test suite execution, CodeBLEU, and exact match. The authors acknowledge this limitation: 'plausibility shows whether a patch passes all available tests but is not a guarantee of its correctness' and note that 'manual correctness verification is prone to subjectivity' (Section IV-E)."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The three APR benchmarks (QuixBugs, HumanEval-Java, Defects4J) are strictly held-out from the CLM fine-tuning dataset. Defects4J-related patches were explicitly removed from CLM using AST comparison (Section III-A). The CLM dataset itself is split roughly 90/10 for training/validation (129,300 / 14,366 of 143,666 instances)."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Results are broken down per model (6 families, 15 model sizes), per benchmark (QuixBugs, HumanEval-Java, Defects4J), per training method (base, full FT, LoRA, IA3), and per epoch (Tables I-V). Patch outcome categories (Plausible, Timeout, Uncompilable, Wrong, Unknown) are also defined."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "A major finding is that full fine-tuning degrades best-performing models (RQ2): 'DeepSeekCoder and StarCoder show deterioration after fine-tuning compared to using the base models.' The paper discusses causes including overfitting and data distribution mismatch."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper prominently reports that full fine-tuning worsens performance of strong models (Table III). IA3 underperforms LoRA contrary to expectations from prior work. Including buggy lines fails to improve performance in 29/45 cases. These are substantive negative findings."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims that full fine-tuning decreases performance due to data distributions and overfitting, and that PEFT achieves better results. Both claims are supported by Tables III and V respectively, with detailed discussion in RQ2 and RQ3."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The study uses controlled single-variable manipulation: same models, same benchmarks, same datasets, varying only the training regimen (none, full FT, LoRA, IA3). This is adequate for claims that training method affects performance. The causal attribution to 'overfitting' is supported by training/validation loss divergence in Table II."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The title 'The Impact of Fine-tuning Large Language Models on Automated Program Repair' and abstract speak broadly about 'LLMs' and 'APR' without bounding to the tested setting: Java only, 6 specific model families, 3 specific benchmarks, and one fine-tuning dataset (CLM). The conclusions do not explicitly restrict generalization scope."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "Section IV-E discusses multiple alternative explanations: data leakage from pre-training, data distribution mismatch between CLM and benchmarks, Java version effects, library versioning issues, benchmark non-representativeness, and plausibility vs. correctness concerns."
    136       },
    137       "proxy_outcome_distinction": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "The paper explicitly acknowledges that plausibility is a proxy for correctness: 'Plausibility shows whether a patch passes all available tests but is not a guarantee of its correctness. One way to assess the correctness of patches is by manually checking' (Section IV-E)."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Specific model names and sizes are provided: CodeGen-Multi 350M/2B/6B, CodeT5-small (60M)/base (220M)/large (770M), StarCoderBase 1B/3B, DeepSeekCoder-Base 1.3B/6.7B (v1), Bloom 560M/1B7/7B1, CodeLlama2 7B. DeepSeekCoder version is explicitly stated as 'v1' (Section III-B)."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Listing 1 provides the complete input preprocessing format for all six model families, showing the actual prompt structure with concrete code examples including special tokens (<FILL_ME>, <|fim_begin|>, <fim_prefix>, etc.)."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "LoRA hyperparameters are reported (r=8, α=16 defaults) and RQ4 systematically varies them. However, core training hyperparameters for full fine-tuning (learning rate, batch size, optimizer, weight decay) are not reported. Inference parameters (temperature, sampling strategy) for generating the 10 patches are also absent."
    158       },
    159       "scaffolding_described": {
    160         "applies": false,
    161         "answer": false,
    162         "justification": "No agentic scaffolding is used. Models directly generate patches from code input without any multi-step reasoning, tool use, or agent framework."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Listing 1 documents model-specific preprocessing. Section III-A describes CLM dataset construction: 1,083,185 commits filtered to single-hunk patches, Defects4J patches removed via AST comparison, resulting in 143,666 instances split roughly 90/10 (129,300 training / 14,366 validation). Two input scenarios (with/without buggy line) are described in Section III-D."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section IV-E 'Threats to Validity' provides a dedicated multi-paragraph discussion of limitations covering benchmark representativeness, data leakage, library versioning, dataset distribution mismatch, and evaluation methodology."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Threats are specific to this study: 'benchmarks like HumanEval-Java and QuixBugs were created from simple projects and consist of bugs that are not representative of complex real-world bugs,' 'versioning of various libraries caused by internal dependencies leads to internal problems of benchmarks,' and specific discussion of data leakage risks for the models used."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "The paper discusses what might limit its results (benchmark representativeness, data leakage) but does not explicitly state what the results do NOT show or which settings/claims are excluded. The title and conclusions speak broadly about 'LLMs' and 'APR' without bounding conclusions to Java, the specific model families tested, or the specific benchmarks."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The Zenodo replication package (doi:10.5281/zenodo.16359186) contains 'code and results.' All benchmarks are publicly available (Defects4J, QuixBugs, HumanEval-Java) and the CLM dataset is available via the clm GitHub repository."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "Section III-A describes all datasets: Defects4J 2.0.1 with 835 active bugs (219 used), QuixBugs with 40 Java programs, HumanEval-Java with 163 bugs manually created from HumanEval, and CLM dataset from 1,083,185 GitHub commits (March 2011–2018) filtered to 143,666 instances."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": false,
    200         "answer": false,
    201         "justification": "No human participants. The study uses standard public benchmarks and a publicly available fine-tuning dataset."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "The CLM pipeline is documented: 1,083,185 commits → filtered to single-hunk statement updates/insertions → Defects4J patches removed via AST comparison → 143,666 instances → 129,300 training (~90%) / 14,366 validation (~10%). Model-specific preprocessing is documented in Listing 1."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Acknowledgments section discloses: Research Council of Norway secureIT project (IKTPLUSS #288787), EU Horizon Europe Marie Skłodowska-Curie Actions (#101151798), and eX3 infrastructure (contract #270053)."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "All author affiliations are clearly listed: University of Bern and Simula Research Laboratory. None of the authors are affiliated with the companies that created the evaluated models."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "The funders (Research Council of Norway, EU Horizon Europe) are public research funding bodies with no commercial interest in which fine-tuning method performs best."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests or financial interests statement is present in the paper."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper does not state the training data cutoff dates for any of the six model families (CodeGen, CodeT5, StarCoder, DeepSeekCoder, Bloom, CodeLlama2), despite acknowledging that data leakage from pre-training is a concern."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "The paper discusses overlap at two levels: (1) for fine-tuning, 'all patches related to projects in Defects4J were removed based on a comparison using abstract syntax trees'; (2) for pre-training, 'pre-training of models uses large datasets, usually mined from GitHub. This may cause data leakage, where the benchmarks may have been seen by models at some point during pre-training.'"
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": true,
    244         "answer": true,
    245         "justification": "HumanEval-Java was specifically created to 'eliminate the threat that models have already seen the dataset during pre-training.' The paper acknowledges contamination risk for other benchmarks and notes that 'None of the models achieved close to the 100% benchmark performance, making it clear that the effect of data leakage is not the only factor.' The CLM dataset was decontaminated of Defects4J entries."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. This is a benchmark evaluation study of LLM fine-tuning methods."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants. The study evaluates models on public code benchmarks."
    258       },
    259       "demographics_reported": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "blinding_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       },
    279       "attrition_reported": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No inference cost, latency, or wall-clock time is reported for generating the 10 patches per problem across 15 models and 3 benchmarks."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "The paper mentions using 'A100, and V100 GPUs' and that hardware limitations prevented fine-tuning some larger models, but does not quantify total GPU hours, training time, or energy consumption for any experiment."
    295       }
    296     },
    297     "experimental_rigor": {
    298       "seed_sensitivity_reported": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "No mention of random seeds, seed sensitivity, or results across multiple seeds anywhere in the paper. All results appear to be from single runs."
    302       },
    303       "number_of_runs_stated": {
    304         "applies": true,
    305         "answer": true,
    306         "justification": "The paper explicitly states 'we let them generate 10 patches for each problem, as done in related works [8, 53, 56–58]' (Section III-C). This defines the evaluation protocol."
    307       },
    308       "hyperparameter_search_budget": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "For the main experiments (RQ1-3), core training hyperparameters (learning rate, batch size, optimizer) are not reported, so it is unknown whether any search was performed. LoRA defaults were taken from Hugging Face. RQ4 reports a systematic sensitivity analysis but this is separate from the main results."
    312       },
    313       "best_config_selection_justified": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "For LoRA and IA3, the paper explicitly states they used 'default parameters recommended by Hugging Face PEFT' (r=8, α=16). For RQ4, all configurations are reported (Figures 3-6), not just the best. For RQ3, they chose epoch 1 based on validation metric stagnation observed in Table IV."
    317       },
    318       "multiple_comparison_correction": {
    319         "applies": false,
    320         "answer": false,
    321         "justification": "No statistical significance tests are performed at all, so multiple comparison correction is not applicable."
    322       },
    323       "self_comparison_bias_addressed": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The authors implement and evaluate their own pipeline. They compare with Jiang et al. and Li et al. but note result discrepancies without discussing author-evaluation bias or the possibility that their implementation of baselines may underperform."
    327       },
    328       "compute_budget_vs_performance": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Table VI reports trainable parameter counts for PEFT methods, but performance is not analyzed as a function of compute budget. Full fine-tuning and PEFT are compared on outcomes without normalizing for or reporting their compute costs."
    332       },
    333       "benchmark_construct_validity": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "Section IV-E explicitly discusses construct validity: 'benchmarks like HumanEval-Java and QuixBugs were created from simple projects and consist of bugs that are not representative of complex real-world bugs,' and plausibility vs. correctness is acknowledged as a measurement limitation."
    337       },
    338       "scaffold_confound_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "No scaffolding is used. Models directly generate patches from code input without any agent framework or multi-step pipeline."
    342       }
    343     },
    344     "data_leakage": {
    345       "temporal_leakage_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "The paper discusses that pre-training datasets may include benchmark solutions since benchmarks are public GitHub repositories. HumanEval-Java was specifically created to mitigate temporal leakage: 'The reason for doing so is to eliminate the threat that models have already seen the dataset during pre-training.'"
    349       },
    350       "feature_leakage_addressed": {
    351         "applies": true,
    352         "answer": true,
    353         "justification": "RQ1 systematically tests whether including buggy lines (additional information that may not be available in real-world scenarios) affects performance. The paper analyzes 'how including or omitting the buggy line(s) affects the LLM's performance,' which is a form of feature leakage analysis."
    354       },
    355       "non_independence_addressed": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "The CLM fine-tuning dataset was explicitly decontaminated: 'all patches related to projects in Defects4J were removed based on a comparison using abstract syntax trees' (Section III-A) to ensure independence between fine-tuning and benchmark data."
    359       },
    360       "leakage_detection_method": {
    361         "applies": true,
    362         "answer": true,
    363         "justification": "AST-based comparison was used as a concrete decontamination method to remove Defects4J-related patches from the CLM fine-tuning dataset: 'all patches related to projects in Defects4J were removed based on a comparison using abstract syntax trees' (Section III-A, citing Zhu et al.)."
    364       }
    365     }
    366   },
    367   "claims": [
    368     {
    369       "claim": "Larger models generally perform better on APR benchmarks without fine-tuning (in 34 out of 48 cases).",
    370       "evidence": "Table I shows per-model, per-benchmark results. For example, CodeGen-6B outperforms CodeGen-350M in all cases, and DeepSeekCoder-6.7b achieves the best results (33/40 QuixBugs, 107/163 HumanEval-Java, 89/219 Defects4J). Section IV-A summarizes the trend.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "Full fine-tuning improves poorly-performing models (CodeT5, Bloom) but degrades the best-performing models (DeepSeekCoder, StarCoder).",
    375       "evidence": "Table III: CodeT5-base improves from 0/4/8 to 17/39/75 (QB/HE/D4J), while DeepSeekCoder-1.3b goes from 33/94/72 to 15/64/80 (dropping on QB and HE, though rising on D4J). StarCoder-3b goes from 32/94/63 to 11/37/63 after 3 epochs of full fine-tuning (dropping on QB and HE, unchanged on D4J).",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "PEFT with LoRA outperforms full fine-tuning and IA3 in most cases, achieving up to 225% improvement over base models while using <1% of trainable parameters.",
    380       "evidence": "Table V: LoRA outperforms IA3 in 21/24 cases. CodeGen-2B with LoRA achieves 19/81/98 (QB/HE/D4J) vs. base 13/44/20, using only 0.09% of parameters (Table VI). Section IV-C provides detailed analysis.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "LoRA hyperparameters (rank and scaling factor) have negligible impact on APR performance.",
    385       "evidence": "Figures 3-6 show CodeBLEU varies only between 0.60-0.64 and exact match shows moderate variation across 7 values of rank and scaling factor for CodeGen-2B. Section IV-D concludes 'either rank or scaling factor does not affect the performance of models significantly.'",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Including buggy lines in the prompt does not consistently improve APR performance.",
    390       "evidence": "Table I: adding buggy lines fails to improve performance in 29 out of 45 cases. It helps for the best-performing models on Defects4J (all six model sizes improved) but rarely helps on QuixBugs.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "PEFT-trained models reach a performance plateau after the first epoch, with metrics stagnating.",
    395       "evidence": "Table IV shows that CodeBLEU, loss, and exact match stagnate after epoch 1 for nearly all models under both LoRA and IA3. This motivated selecting epoch 1 for the benchmark evaluation in Table V.",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No uncertainty quantification",
    402       "detail": "All results across all RQs are from single experimental runs with no error bars, confidence intervals, standard deviations, or repeated trials. Given that LLM generation is stochastic, results could vary across runs. This is particularly concerning for close comparisons."
    403     },
    404     {
    405       "flag": "Missing core training hyperparameters",
    406       "detail": "Learning rate, batch size, optimizer, weight decay, and warmup schedule are not reported for full fine-tuning experiments. Inference parameters (temperature, sampling strategy) for generating the 10 patches are also absent. These can significantly affect results."
    407     },
    408     {
    409       "flag": "Hyperparameter sensitivity tested on only one model",
    410       "detail": "RQ4 systematically varies LoRA hyperparameters but only for CodeGen-2B. The conclusion that 'rank or scaling factor does not affect the performance of models significantly' is generalized from a single model, which may not hold for other architectures."
    411     },
    412     {
    413       "flag": "No statistical significance tests",
    414       "detail": "Comparative claims ('LoRA performs better than IA3 in 21 out of 24 cases') are based purely on comparing raw numbers. Without significance testing, some observed differences may be within noise margins, especially given the absence of variance reporting."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Impact of Code Language Models on Automated Program Repair",
    420       "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"],
    421       "year": 2023,
    422       "doi": "10.1109/ICSE48619.2023.00125",
    423       "relevance": "Core prior work evaluating code LLMs for APR with full fine-tuning; framework and dataset used in this study."
    424     },
    425     {
    426       "title": "Exploring Parameter-Efficient Fine-Tuning of Large Language Model on Automated Program Repair",
    427       "authors": ["G. Li", "C. Zhi", "J. Chen", "J. Han", "S. Deng"],
    428       "year": 2024,
    429       "doi": "10.1145/3691620.3695066",
    430       "relevance": "Closest related work investigating PEFT techniques for APR; results compared directly with this study."
    431     },
    432     {
    433       "title": "RepairLLaMA: Efficient Representations and Fine-Tuned Adapters for Program Repair",
    434       "authors": ["A. Silva", "S. Fang", "M. Monperrus"],
    435       "year": 2024,
    436       "arxiv_id": "2312.15698",
    437       "relevance": "Demonstrated LoRA-based fine-tuning of CodeLlama-7b for APR, finding LoRA outperformed full fine-tuning."
    438     },
    439     {
    440       "title": "Comprehensive Fine-Tuning Large Language Models of Code for Automated Program Repair",
    441       "authors": ["K. Huang", "J. Zhang", "X. Bao", "X. Wang", "Y. Liu"],
    442       "year": 2025,
    443       "doi": "10.1109/TSE.2025.3532759",
    444       "relevance": "Concurrent work studying fine-tuning of code LLMs for APR."
    445     },
    446     {
    447       "title": "Automated Program Repair in the Era of Large Pre-Trained Language Models",
    448       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    449       "year": 2023,
    450       "doi": "10.1109/ICSE48619.2023.00129",
    451       "relevance": "Studied LLMs for APR without fine-tuning, showing promising zero-shot repair results."
    452     },
    453     {
    454       "title": "LoRA: Low-Rank Adaptation of Large Language Models",
    455       "authors": ["E. J. Hu", "Y. Shen", "P. Wallis", "Z. Allen-Zhu", "Y. Li", "S. Wang", "L. Wang", "W. Chen"],
    456       "year": 2022,
    457       "relevance": "Foundational PEFT method used extensively in this study; core technique for parameter-efficient fine-tuning of LLMs."
    458     },
    459     {
    460       "title": "Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation with Large Language Models",
    461       "authors": ["M. Weyssow", "X. Zhou", "K. Kim", "D. Lo", "H. Sahraoui"],
    462       "year": 2025,
    463       "doi": "10.1145/3714461",
    464       "relevance": "Investigated PEFT for code generation, found PEFT outperforms ICL and RAG across diverse LLMs."
    465     },
    466     {
    467       "title": "Astraios: Parameter-Efficient Instruction Tuning Code Large Language Models",
    468       "authors": ["T. Y. Zhuo", "A. Zebaze", "N. Suppattarachai", "L. von Werra", "H. de Vries", "Q. Liu", "N. Muennighoff"],
    469       "year": 2024,
    470       "arxiv_id": "2401.00788",
    471       "relevance": "One of the first studies of PEFT for code LLMs, found LoRA offers the most competitive cost-performance trade-offs for 16B models."
    472     },
    473     {
    474       "title": "A Systematic Literature Review of Parameter-Efficient Fine-Tuning for Large Code Models",
    475       "authors": ["M. Z. Haque", "S. Afrin", "A. Mastropaolo"],
    476       "year": 2025,
    477       "arxiv_id": "2504.21569",
    478       "relevance": "Recent systematic review of PEFT techniques for code LLMs, providing broader context for this study."
    479     },
    480     {
    481       "title": "Multi-Objective Fine-Tuning for Enhanced Program Repair with LLMs",
    482       "authors": ["B. Yang", "H. Tian", "J. Ren", "H. Zhang", "J. Klein", "T. F. Bissyandé", "C. Le Goues", "S. Jin"],
    483       "year": 2024,
    484       "arxiv_id": "2404.12636",
    485       "relevance": "Investigated multi-objective fine-tuning approaches for LLM-based APR using PEFT."
    486     },
    487     {
    488       "title": "Automated Repair of Programs from Large Language Models",
    489       "authors": ["Z. Fan", "X. Gao", "M. Mirchev", "A. Roychoudhury", "S. Tan"],
    490       "year": 2023,
    491       "doi": "10.1109/ICSE48619.2023.00128",
    492       "relevance": "Studied automated repair of programs generated by large language models, part of the growing body of work on LLM-based APR."
    493     },
    494     {
    495       "title": "Few-Shot Parameter-Efficient Fine-Tuning Is Better and Cheaper than In-Context Learning",
    496       "authors": ["H. Liu", "D. Tam", "M. Muqeeth", "J. Mohta", "T. Huang", "M. Bansal", "C. A. Raffel"],
    497       "year": 2022,
    498       "relevance": "Introduced the IA3 adapter method used in this study; demonstrated PEFT advantages over ICL."
    499     }
    500   ],
    501   "engagement_factors": {
    502     "practical_relevance": {
    503       "score": 2,
    504       "justification": "Practitioners fine-tuning LLMs for code repair can directly use the finding that LoRA with default settings outperforms full fine-tuning at a fraction of the compute cost."
    505     },
    506     "surprise_contrarian": {
    507       "score": 1,
    508       "justification": "The finding that full fine-tuning degrades performance of strong models is somewhat surprising but has been observed in adjacent domains."
    509     },
    510     "fear_safety": {
    511       "score": 0,
    512       "justification": "No safety, security, or risk implications in the findings."
    513     },
    514     "drama_conflict": {
    515       "score": 0,
    516       "justification": "No controversy; straightforward empirical comparison."
    517     },
    518     "demo_ability": {
    519       "score": 1,
    520       "justification": "Replication package available on Zenodo but it is a research reproduction setup, not a ready-to-use tool."
    521     },
    522     "brand_recognition": {
    523       "score": 1,
    524       "justification": "Uses recognizable models (DeepSeekCoder, CodeLlama, StarCoder) but authors are from academic institutions (Simula, University of Bern)."
    525     }
    526   }
    527 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs