scan.json (30493B)
1 { 2 "paper": { 3 "title": "MergeRepair: An Exploratory Study on Merging Task-Specific Adapters in Code LLMs for Automated Program Repair", 4 "authors": [ 5 "Meghdad Dehghan", 6 "Jie JW Wu", 7 "Fatemeh H. Fard", 8 "Ali Ouni" 9 ], 10 "year": 2024, 11 "venue": "Empirical Software Engineering (submitted); arXiv", 12 "arxiv_id": "2408.09568", 13 "doi": "10.48550/arXiv.2408.09568" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "Merging task-specific LoRA adapters can improve Automated Program Repair performance by up to 2.38% pass@1 and 4.01% pass@10 on HumanEvalFix without additional training. Performance depends more on which tasks are merged than how many adapters are combined. Non-APR merged adapters can generalize to APR, sometimes outperforming the APR-specific adapter. In continual merging, placing the strongest individual adapter last in the merge sequence yields the best results due to its higher weight influence.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "A replication package is provided at https://github.com/mqddd/mergerepair (footnote 1, Section 1: 'Our code and replication package are made public')." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The study uses publicly available datasets: CommitPackFT from OctoPack [39] and HumanEvalFix benchmark [39]. Both are publicly downloadable." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "Section 3.8 states the GPU (NVIDIA Tesla V100 32GB) and Table 2 lists hyperparameters, but no Python version, library versions, requirements.txt, or Dockerfile is provided in the paper." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": true, 39 "justification": "A replication package is explicitly provided at the GitHub URL. 
Section 1 states 'Our code and replication package are made public to support open data and open science principles.' Section 8 reaffirms 'We include all scripts used to obtain the results.'" 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results in Tables 3-11 are reported as point estimates (e.g., '28.66%') without confidence intervals or error bars." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 3.7 describes pairwise t-tests with p-values reported at three significance levels (* p<0.1, ** p<0.05, *** p<0.01) throughout Tables 4-11." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Cliff's delta effect sizes are reported throughout Tables 4-11 (S for small >=0.11, M for medium >=0.28, L for large >=0.43), as described in Section 3.7." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No power analysis or justification for the sample size. HumanEvalFix has 164 problems; no discussion of whether this is sufficient for the number of comparisons being made." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "Results are single-run point estimates. While 20 samples are generated per problem for pass@k computation, no standard deviation or variance across independent experimental runs is reported." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Individual task-specific adapters (especially the APR adapter) serve as baselines. Table 3 provides baseline scores for all individual adapters. Equal-weight merging serves as baseline for continual merging (RQ3)."
74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Models (StarCoder2, Granite, both 2024) and merging methods (TIES-Merging, DARE, weight averaging) are contemporary. The CommitPackFT dataset and HumanEvalFix benchmark are from 2023." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "The entire study systematically ablates adapter combinations: testing all subsets of 2, 3, 4, and 5 merged adapters (RQ1-RQ2), and all permutations of merge order (RQ3). The effect of each task and merging method is isolated." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Three metrics are used: pass@1, pass@10 (reported for all experiments), and RobustPass@1 (RP5@1, reported for selected experiments in Section 5.2, Table 14)." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "All evaluation is automated through pass@k execution-based metrics using the bigcode-evaluation-harness library. No human evaluation of generated patches is performed." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "Training uses CommitPackFT dataset while evaluation uses the separate HumanEvalFix benchmark (Section 3.7). The instruction-tuned versions of the base models were deliberately avoided because they were trained on the same data (Section 3.5)." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down by task combination, merging method (dare-ties, ties, weight-averaging), and model (StarCoder2, Granite) across all tables. Per-task individual performance is shown in Table 3." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section 5.1 shows an example case where individual adapters fail but the merged adapter succeeds (Table 12). 
Section 5.5 analyzes solved/unsolved problems via Venn diagrams (Figure 11). Section 5.6 tracks problems that became incorrect after merging." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Many merged adapters show degraded performance, reported transparently. For StarCoder2, only 19/45 pass@1 experiments improved (RQ1 summary). T5 (Test & QA) consistently degrades performance. Continual merging with suboptimal order worsens results." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "The abstract claims '2.38% performance improvement over APR task in terms of pass@1' and '4.01% for pass@10' are confirmed in Tables 4 and 5 (T1-T2-T3 with ties method: 31.04% vs 28.66% = +2.38% pass@1; weight-averaging: 43.12% vs 39.11% = +4.01% pass@10)." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper makes causal claims like 'merging adapters can enhance performance' and 'the order of merging affects performance.' The study design uses controlled manipulation (systematic combination of adapters, all permutations of merge order) evaluated on the same benchmark, which is adequate for these causal claims." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 7 (Threats to External Validity) explicitly states: 'we conducted experiments only on automatic program repair, along with the other four tasks for Python language... the obtained results are limited to the used tasks and programming language.' The title also bounds scope to 'Code LLMs for Automated Program Repair.'" 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": true, 135 "justification": "Section 5.4 analyzes Fraction of Sign Difference as an alternative factor explaining performance. 
Section 7 discusses dataset labeling noise from GPT-4. Section 6.1 notes HumanEvalFix contains diverse samples beyond APR, potentially explaining why non-APR adapters perform well on it." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper measures pass@k on HumanEvalFix and frames claims in terms of 'performance on the APR benchmark.' Section 3.7 explicitly discusses why pass@k is more reliable than similarity-based metrics (BLEU, CodeBLEU) for measuring functional correctness. Claims match measurement granularity." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Specific model identifiers are provided: 'starcoder2-3b' and 'granite-3b-code-base' with references to their respective papers [32, 37]. Section 3.5 describes selecting 'base checkpoints (pre-trained only)' of both models." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": false, 152 "justification": "Section 3.2 describes the data structure (commit messages, old contents, new contents) and Table 12 shows one evaluation example, but the actual prompt template used to format instruction-tuning data for the models is not provided. The evaluation prompt template from bigcode-evaluation-harness is not shown." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Table 2 reports learning rate, scheduler, weight decay, max steps, warmup steps, batch size, gradient accumulation, and LoRA rank for both models. Section 3.8 states temperature=0.2, n=20 samples for evaluation, and lists the specific model layers where LoRA is applied." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. The approach involves fine-tuning LoRA adapters and merging their weight parameters." 
163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 3.2 describes the CommitPackFT dataset origin (filtered from CommitPack by its publishers), task classification via GPT-4 1-shot prompting, and Python split selection. Table 1 shows task distribution across 59,113 samples. Section 3.8 describes perturbation generation for HumanEvalFixPerturbed." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 7 'Threats to Validity' provides substantive discussion organized into internal, external, construct, and conclusion validity categories." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 7 discusses specific threats: GPT-4 task labeling may have incorrect labels, experiments limited to two models of the same size (3B), model architecture effects not studied, results limited to Python and APR tasks, docstring perturbation incompatibility with bigcode-evaluation-harness." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section 7 (External Validity): 'we conducted experiments only on automatic program repair, along with the other four tasks for Python language... the obtained results are limited to the used tasks and programming language and might not be applicable to other areas or programming languages.' Also acknowledges model size limitation." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": true, 191 "justification": "CommitPackFT and HumanEvalFix are publicly available datasets. The replication package at GitHub includes scripts to reproduce results. 
Section 8 states 'Our code and datasets are shared publicly.'" 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 3.2 describes CommitPackFT origin (GitHub repositories via CommitPack, filtered by OctoPack publishers), its structure (commit messages, old/new contents), task classification process (GPT-4 1-shot prompting), and dataset size (59,113 samples across 5 tasks)." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data comes from standard public benchmarks (CommitPackFT, HumanEvalFix)." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline is documented: CommitPackFT → task-specific splits (Table 1 with counts) → LoRA training per task → merging via three methods → evaluation on HumanEvalFix. Figure 1 provides an overview. Section 3.8 describes the perturbation pipeline for robustness evaluation." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Funding is disclosed: 'This research is supported by a grant from the Natural Sciences and Engineering Research Council of Canada RGPIN-2019-05175, as well as support from computational resources and services provided by Advanced Research Computing at the University of British Columbia.'" 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: University of British Columbia (Dehghan, Wu, Fard) and École de technologie supérieure (Ouni). No conflict with evaluated products." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": true, 223 "justification": "NSERC is a Canadian government funding agency with no financial stake in the outcomes of this research on adapter merging." 
224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": true, 228 "justification": "The paper explicitly states: 'The authors declare that they have no conflict of interest.'" 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "No training data cutoff dates are stated for StarCoder2 or Granite. Section 3.5 describes their training data sources (Stack v2, SWH repositories) but not temporal cutoffs." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether HumanEvalFix problems could appear in StarCoder2 or Granite pre-training data. HumanEval was published in 2021 and both models were trained on large code corpora that could include it." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "HumanEvalFix is based on HumanEval (published 2021). Both StarCoder2 and Granite were trained after 2021 on large-scale code data. No discussion of whether benchmark problems or their solutions leaked into pre-training data." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. It is a computational experiment on adapter merging." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 
273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No inference cost, latency, or wall-clock time is reported for the main experiments. Section 5.2 mentions evaluation takes 'at least 8-28x time' for perturbed datasets and 'approximately 1 hour' per model for pass@k, but this is incidental and not a systematic cost report." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Section 3.8 states experiments used 'one NVIDIA Tesla V100 32GB GPU' but does not report total GPU hours, training time per adapter, or total compute budget for the study." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No results across multiple random seeds are reported. Each adapter is trained once and evaluated once (generating 20 samples per problem at temperature 0.2)." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": true, 306 "justification": "Section 3.8 states 'generate 20 samples to calculate pass@1 and pass@10 scores.' For RobustPass@k, '5 different seeds' are used for perturbation generation (Section 5.2). The statistical test uses '400 times' random sampling (Section 3.7)." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "Section 3.8 states 'The hyperparameters are set following the previous works [39, 37]' but no search budget is reported. No indication of how many configurations were tried or whether any tuning was done." 
312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "Hyperparameters are justified by following the configurations of previous works [39, 37] (Section 3.8). The study reports all adapter combinations rather than cherry-picking the best." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "The study performs many pairwise t-tests (hundreds of comparisons across Tables 4-11) but no correction for multiple comparisons (Bonferroni, Holm, etc.) is applied or mentioned." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "All adapters (individual and merged) are trained and evaluated by the authors. No discussion of self-comparison bias or independent evaluation is provided." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": false, 330 "answer": false, 331 "justification": "All models are the same size (3B parameters) and merging methods do not require additional training. Compute differences between merging methods are negligible." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "Section 3.7 argues pass@k is more reliable than similarity metrics, but there is no discussion of whether HumanEvalFix adequately represents real-world APR scenarios. Section 6.1 notes the benchmark 'contained a more diverse set of samples, including instances from tasks beyond APR' but does not discuss construct validity implications." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No scaffolding is involved. Models are evaluated directly on benchmark tasks through code generation." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of temporal leakage.
HumanEvalFix is based on HumanEval (2021), and both models were trained on data that could include solutions. CommitPackFT comes from GitHub repositories with no temporal filtering mentioned." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the evaluation setup leaks information. The HumanEvalFix prompt provides the buggy function with test cases, but no analysis of whether this format introduces leakage." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether CommitPackFT training examples and HumanEvalFix test problems share structural similarities or come from overlapping code repositories." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference, or n-gram overlap analysis." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "Merging APR with Improvement and Misc task-adapters achieves up to 2.38% pass@1 improvement and 4.01% pass@10 improvement on StarCoder2.", 370 "evidence": "Table 4 shows T1-T2-T3 with ties method achieves 31.04% pass@1 vs 28.66% APR baseline (+2.38%). Table 5 shows T1-T2-T3 with weight-averaging achieves 43.12% pass@10 vs 39.11% (+4.01%).", 371 "supported": "strong" 372 }, 373 { 374 "claim": "The performance of merged adapters depends more on which task-adapters are merged than on the number of adapters.", 375 "evidence": "Across Tables 4-7, no monotonic trend with number of merged adapters. T1-T2-T3 (3 tasks) outperforms T1-T2-T3-T4-T5 (5 tasks) on StarCoder2. 
The type of task (especially presence of T3/Misc) consistently matters more than count.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Merged adapters from non-APR tasks can generalize to APR, sometimes outperforming the APR-specific adapter.", 380 "evidence": "Tables 6-7 (RQ2): T2-T3 achieves 31.95% pass@1 on StarCoder2 with ties (+3.29% over APR). For Granite, 29/33 experiments show improvement over APR adapter (RQ2 summary).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "In continual merging, placing the best-performing individual adapter last yields the highest overall performance.", 385 "evidence": "Tables 8-11 (RQ3): For StarCoder2, T4-T2-T1 (APR last) consistently outperforms other orderings of the same tasks. The pattern is consistent across merging methods and both models.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "The Misc (T3) task-specific adapter outperforms the APR adapter on the HumanEvalFix benchmark across both models.", 390 "evidence": "Table 3: T3 achieves 31.40% vs T1's 28.66% pass@1 on StarCoder2, and 18.63% vs 16.16% on Granite.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Continual merging can be more effective than equal-weight merging when merge order is optimized.", 395 "evidence": "Section 5.3 and Figures 2-5 show that for every equal-weight merged adapter, there exists a continual merging variant that outperforms it. This is demonstrated across both models and all merging methods.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "Small benchmark size", 402 "detail": "HumanEvalFix contains only 164 Python problems. With hundreds of pairwise statistical comparisons across Tables 4-11, the statistical power is limited, and no multiple comparison correction is applied." 403 }, 404 { 405 "flag": "No contamination analysis", 406 "detail": "Both StarCoder2 and Granite were trained on large-scale code data after HumanEval was published (2021).
No analysis of whether benchmark problems or solutions leaked into pre-training data." 407 }, 408 { 409 "flag": "Marginal practical improvements", 410 "detail": "The best improvements (2.38% pass@1, 4.01% pass@10) are small in absolute terms. Many merging configurations degrade performance. On StarCoder2, only 19/45 experiments improved pass@1. The practical significance beyond statistical significance is unclear." 411 }, 412 { 413 "flag": "No seed sensitivity analysis", 414 "detail": "Each adapter is trained once with one seed. Given that LoRA training and code generation with temperature 0.2 are stochastic, the reported differences may fall within run-to-run variance." 415 }, 416 { 417 "flag": "GPT-4-labeled task categories", 418 "detail": "The task classification in CommitPackFT was done via 1-shot GPT-4 prompting, introducing potential label noise. The authors acknowledge this threat in Section 7 but do not quantify its impact." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "RepairLlama: Efficient Representations and Fine-Tuned Adapters for Program Repair", 424 "authors": ["André Silva", "Sen Fang", "Martin Monperrus"], 425 "year": 2023, 426 "arxiv_id": "2312.15698", 427 "relevance": "Studies LoRA adapters specifically for program repair, directly relevant to adapter-based code LLM fine-tuning for APR." 428 }, 429 { 430 "title": "OctoPack: Instruction Tuning Code Large Language Models", 431 "authors": ["Niklas Muennighoff", "Qian Liu", "Armel Zebaze"], 432 "year": 2023, 433 "relevance": "Source of CommitPackFT dataset and HumanEvalFix benchmark used in this study; foundational work on instruction-tuning code LLMs." 434 }, 435 { 436 "title": "StarCoder 2 and The Stack v2: The Next Generation", 437 "authors": ["Anton Lozhkov", "Raymond Li", "Loubna Ben Allal"], 438 "year": 2024, 439 "arxiv_id": "2402.19173", 440 "relevance": "One of the two base code LLMs used in this study; key open-source code generation model." 
441 }, 442 { 443 "title": "Granite Code Models: A Family of Open Foundation Models for Code Intelligence", 444 "authors": ["Mayank Mishra", "Matt Stallone", "Gaoyuan Zhang"], 445 "year": 2024, 446 "arxiv_id": "2405.04324", 447 "relevance": "Second base code LLM used in this study; IBM's open foundation model for code." 448 }, 449 { 450 "title": "LoRA: Low-Rank Adaptation of Large Language Models", 451 "authors": ["Edward J. Hu", "Yelong Shen", "Phillip Wallis"], 452 "year": 2022, 453 "relevance": "Foundational adapter method used for all fine-tuning in this study; key parameter-efficient fine-tuning technique." 454 }, 455 { 456 "title": "TIES-Merging: Resolving Interference When Merging Models", 457 "authors": ["Prateek Yadav", "Derek Tam", "Leshem Choshen"], 458 "year": 2024, 459 "relevance": "One of three merging techniques evaluated; proposes trim-resolve-merge approach for model parameters." 460 }, 461 { 462 "title": "Language Models are Super Mario: Absorbing Abilities from Homologous Models as a Free Lunch", 463 "authors": ["Le Yu", "Bowen Yu", "Haiyang Yu"], 464 "year": 2023, 465 "arxiv_id": "2311.03099", 466 "relevance": "Proposes the DARE merging method evaluated in this study; demonstrates parameter dropping and rescaling for model merging." 467 }, 468 { 469 "title": "Arcee's MergeKit: A Toolkit for Merging Large Language Models", 470 "authors": ["Charles Goddard", "Shamane Siriwardhana", "Malikeh Ehghaghi"], 471 "year": 2024, 472 "arxiv_id": "2403.13257", 473 "relevance": "Toolkit for merging LLMs; demonstrated merged models outperforming constituents on general and medical benchmarks." 474 }, 475 { 476 "title": "ReCode: Robustness Evaluation of Code Generation Models", 477 "authors": ["Shiqi Wang", "Zheng Li", "Haifeng Qian"], 478 "year": 2023, 479 "doi": "10.18653/v1/2023.acl-long.773", 480 "relevance": "Proposes RobustPass@k metric and perturbation framework adapted in this study for APR robustness evaluation." 
481 }, 482 { 483 "title": "Evaluating Large Language Models Trained on Code", 484 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 485 "year": 2021, 486 "arxiv_id": "2107.03374", 487 "relevance": "Introduces HumanEval benchmark (basis of HumanEvalFix used here); foundational work on code generation evaluation." 488 }, 489 { 490 "title": "Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation with Large Language Models", 491 "authors": ["Martin Weyssow", "Xin Zhou", "Kisub Kim"], 492 "year": 2023, 493 "arxiv_id": "2308.10462", 494 "relevance": "Empirical study of parameter-efficient fine-tuning for code generation; directly relevant to understanding adapter effectiveness for code tasks." 495 }, 496 { 497 "title": "Astraios: Parameter-Efficient Instruction Tuning Code Large Language Models", 498 "authors": ["Terry Yue Zhuo", "Armel Zebaze", "Nitchakarn Suppattarachai"], 499 "year": 2024, 500 "relevance": "Studies parameter-efficient instruction tuning for code LLMs; relevant baseline for understanding adapter fine-tuning in the code domain." 501 } 502 ], 503 "engagement_factors": { 504 "practical_relevance": { 505 "score": 2, 506 "justification": "Adapter merging is a practical technique for combining task-specific models without retraining, applicable in resource-constrained settings." 507 }, 508 "surprise_contrarian": { 509 "score": 1, 510 "justification": "The finding that non-APR merged adapters can match or exceed APR-specific adapters is mildly surprising but not deeply contrarian." 511 }, 512 "fear_safety": { 513 "score": 0, 514 "justification": "No AI safety or security concerns are raised by this work on adapter merging for program repair." 515 }, 516 "drama_conflict": { 517 "score": 0, 518 "justification": "No controversy or conflict; straightforward empirical study with measured claims." 
519 }, 520 "demo_ability": { 521 "score": 2, 522 "justification": "Replication package released on GitHub with scripts; uses publicly available models and datasets, though setup requires GPU resources." 523 }, 524 "brand_recognition": { 525 "score": 0, 526 "justification": "Academic paper from University of British Columbia and École de technologie supérieure; not from a major AI lab." 527 } 528 } 529 }