scan.json (29840B)
1 { 2 "paper": { 3 "title": "Adapting Knowledge Prompt Tuning for Enhanced Automated Program Repair", 4 "authors": [ 5 "Xuemeng Cai", 6 "Lingxiao Jiang" 7 ], 8 "year": 2025, 9 "venue": "arXiv preprint", 10 "arxiv_id": "2504.01523" 11 }, 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": true, 17 "justification": "The paper states in the Conclusion: 'Our source code and experimental data are publicly available at: https://github.com/Cxm211/k-prompt'. A working GitHub URL is provided." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": true, 22 "justification": "The paper uses six publicly available APR benchmark datasets (BugsInPy, Code Refinement, Defects4J, ManySStuBs4J-SStuBs, TFix, xCodeEval-APR-C), all of which are standard public benchmarks. The GitHub repository also claims to include experimental data." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "The paper mentions using OpenPrompt and PyTorch as frameworks and specifies hardware (NVIDIA A40 48GB GPU and H100 80GB GPU), but does not provide a requirements.txt, Dockerfile, or library version specifications sufficient to recreate the environment." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "While source code is available on GitHub, the paper itself contains no step-by-step reproduction instructions. The paper describes the experimental design but does not provide commands or a README walkthrough in the paper text." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": false, 39 "justification": "Results throughout the paper (Tables VI, VII, VIII, IX, X) are reported as single point estimates with no confidence intervals or error bars. The paper averages results over three runs but does not report standard deviations or any spread measure." 
40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "No statistical significance tests are applied. The paper compares prompt tuning vs. fine-tuning by reporting raw numbers and percentages without any p-values, t-tests, or other statistical tests to verify that observed differences are not due to chance." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": false, 49 "justification": "The paper reports raw percentage improvements (e.g., '87.33% on average') but these are computed as relative improvements rather than standardized effect sizes (Cohen's d, etc.). No proper effect size measures are provided." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper selects 1% samples from large datasets to simulate data scarcity but does not justify why these particular sample sizes (260-1082 training instances) are sufficient for drawing conclusions. No power analysis is presented." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "The paper runs experiments three times with different seeds and takes averages, but never reports standard deviations or any spread measure for these runs. Tables only show mean/average values without any variance information." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "Fine-tuning is used as the baseline comparison throughout. Additionally, a 'Naive Copy' baseline (copying buggy code as the fix) is included. Results are compared across all six datasets." 67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "The baseline is fine-tuning of the same pre-trained models (CodeT5+ 220M, CodeT5+ 770M, GPT-Neo 1.3B), which is the current standard approach for APR. Fine-tuning is the appropriate contemporary baseline for prompt tuning." 
72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": true, 76 "justification": "RQ2 and RQ3 serve as ablation studies. RQ2 compares hard vs. soft prompts and different prompt templates (BP1-BP7). RQ3 ablates individual types of domain knowledge (repair action, repair pattern, bug type, AST, error message, tags) and their combinations." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": true, 81 "justification": "Three evaluation metrics are used: Exact Match (EM), Syntactically Correct Patch (SC), and CodeBLEU. Multiple metrics address different aspects of repair quality." 82 }, 83 "human_evaluation": { 84 "applies": true, 85 "answer": false, 86 "justification": "Human evaluation IS relevant for APR. Exact Match only catches patches identical to a reference fix; there can be multiple valid fixes. Human evaluation of generated patches would strengthen the claims about repair effectiveness. The paper itself acknowledges EM may not capture the full fix ratio (Construct Validity section). The criterion is applicable — the paper could reasonably have included human evaluation of patches — but chose not to. The schema says applies=false only when 'human evaluation is clearly irrelevant to the claims.' For APR, where correctness of patches matters and automated metrics are known to be incomplete, human evaluation is relevant." 87 }, 88 "held_out_test_set": { 89 "applies": true, 90 "answer": true, 91 "justification": "The paper explicitly splits data into training, validation, and test sets in an 8:1:1 ratio (Table IV). Results in RQ1-RQ3 are reported on held-out test sets. RQ4 uses a fixed 500-instance test set." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Results are broken down across all six individual datasets (BugsInPy, Code Refinement, Defects4J, ManySStuBs4J-SStuBs, TFix, xCodeEval), across four programming languages (Python, Java, JavaScript, C), and across three models separately." 
97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper discusses cases where prompt tuning helps less or where added domain knowledge reduces effectiveness (e.g., Code Refinement dataset where variable anonymization limits performance, xCodeEval where noisy domain knowledge reduces effectiveness). Section V.C discusses failure modes of combining multiple knowledge types." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "Several negative findings are reported: combining multiple domain knowledge types often does not lead to incremental gains (RQ3/Table IX); certain prompt templates (BP1, BP5, BP6, BP7) consistently underperform for GPT-Neo; Code Refinement dataset shows lower improvement due to variable anonymization." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The abstract claim of '87.33% average improvement over fine-tuning in data scarcity scenarios' is supported by Table VI results across the six datasets for the two CodeT5+ models (220M and 770M), which is the population over which the 87.33% average is stated (Section V.A); GPT-Neo, whose fine-tuned EM is 0%, is reported separately as improving to 1.03%-54.17% EM. The claim about comprehensive evaluation across three LLMs, six datasets, and four languages is confirmed by the experimental design." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper makes causal claims (prompt tuning 'improves' performance over fine-tuning). The ablation design is adequate for this — they control for model architecture, datasets, and evaluation protocol, varying only the tuning method. The controlled single-variable manipulation supports the causal interpretation." 119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": true, 123 "justification": "The paper explicitly acknowledges scope limitations: 'Our experimental results may be applicable to specific datasets and models only' (Section VI.B, External Validity). 
The conclusion also acknowledges evaluation is limited to 'four datasets, three relatively small pre-trained code models, and limited prompt templates.'" 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": false, 128 "justification": "The Threats to Validity section (Section VI.B) discusses generic threats: randomness in training (Internal Validity), EM metric limitations (Construct Validity), and limited generalizability (External Validity). However, these are methodological limitations, not alternative explanations for the observed results. The paper does not discuss specific alternative explanations such as: whether prompt tuning's advantage is simply due to having more trainable parameters per effective sample, whether the improvement is driven by the prompt providing a stronger inductive bias rather than domain knowledge per se, or whether the 87.33% improvement figure is inflated by the choice of data scarcity threshold. The Code Refinement variable anonymization discussion is a dataset-specific observation, not an alternative explanation for the main findings. Per the schema: 'A threats-to-validity section counts only if it discusses specific alternative explanations for the observed results, not just generic methodological limitations.'" 129 } 130 }, 131 "setup_transparency": { 132 "model_versions_specified": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper specifies model names and parameter counts (CodeT5+ 220M, CodeT5+ 770M, GPT-Neo 1.3B) and states they are loaded from 'official versions available on Hugging Face.' However, it does not specify exact Hugging Face model identifiers (e.g., Salesforce/codet5p-220m) or commit hashes. Per the schema: 'Marketing names like Gemini-2.5 or GPT-4o without a snapshot date or API version do NOT count as specified versions.' 
CodeT5+ 220M is a model family name with size, not a precise version — model weights on Hugging Face can be updated without changing the model name. These are open-weight models with specific sizes, which makes them more identifiable than API models, but the schema explicitly requires version-level specificity." 136 }, 137 "prompts_provided": { 138 "applies": true, 139 "answer": false, 140 "justification": "Tables I and III show prompt TEMPLATES with placeholders ([X], [mask], [bugType], [repairAction], [repairPattern], [SOFT], etc.). Per the schema: 'A prompt TEMPLATE with placeholders (e.g., [Task Description]) does NOT count unless the actual fill values are also provided — the reader must be able to reconstruct every prompt sent to the model.' While [X] (buggy code) and [mask] (output slot) are structural model placeholders, the knowledge prompt templates use domain knowledge placeholders like [bugType], [repairAction], [repairPattern], [AST], [errorMessage], [tags] whose actual fill values are dataset-derived and not fully enumerated. The paper shows one example (Fig. 4) but does not provide the actual domain knowledge values used across all experimental instances. A reader cannot reconstruct every prompt sent to the model." 141 }, 142 "hyperparameters_reported": { 143 "applies": true, 144 "answer": true, 145 "justification": "Table V provides a complete hyperparameter table including optimizer (AdamW), Adam epsilon (1e-8), initial learning rate (5e-5), LR scheduler (Linear), training epochs (10), temperature (1.0), top-p (0.9), beam number (5), and repetition penalty (1.0)." 146 }, 147 "scaffolding_described": { 148 "applies": false, 149 "answer": false, 150 "justification": "No agentic scaffolding is used. The paper performs standard prompt tuning and fine-tuning with LLMs; there is no agent, tool-use framework, or complex scaffolding involved." 
151 }, 152 "data_preprocessing_documented": { 153 "applies": true, 154 "answer": true, 155 "justification": "Section IV.C describes preprocessing: single-hunk fixes are treated as separate instances, data is split 8:1:1 into train/val/test, and for large datasets 1% is sampled using three distinct seeds with averages taken. The criteria for the 1% sampling and the single-hunk restriction are stated." 156 } 157 }, 158 "limitations_and_scope": { 159 "limitations_section_present": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section VI.B 'Threats to Validity' contains three subsections: Construct Validity, Internal Validity, and External Validity, each discussing specific methodological limitations with substantive content." 163 }, 164 "threats_to_validity_specific": { 165 "applies": true, 166 "answer": true, 167 "justification": "The threats are specific to this study: Construct Validity discusses how EM may not capture all valid fixes and that prompt templates may not be optimal; Internal Validity discusses the impact of randomness in training and sampling; External Validity acknowledges that results may only apply to the tested datasets and models." 168 }, 169 "scope_boundaries_stated": { 170 "applies": true, 171 "answer": true, 172 "justification": "The conclusion explicitly states what the results do not show: evaluation is limited to 'four datasets, three relatively small pre-trained code models, and limited prompt templates'. 
External Validity also notes results 'may be applicable to specific datasets and models only' and that 'we may need more evaluation in the future to conclude that our results could be generalizable to all datasets and models.'" 173 } 174 }, 175 "data_integrity": { 176 "raw_data_available": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper uses publicly available benchmark datasets (BugsInPy, Code Refinement, Defects4J, ManySStuBs4J, TFix, xCodeEval) and states experimental data is available at the GitHub repository. Original benchmark data can be independently verified from the cited sources." 180 }, 181 "data_collection_described": { 182 "applies": true, 183 "answer": true, 184 "justification": "Section IV.C describes how datasets were selected (six commonly used APR benchmarks from literature), how they were processed (single-hunk fixes extracted, 8:1:1 split, 1% sampling for large datasets), and dataset statistics are provided in Table IV." 185 }, 186 "recruitment_methods_described": { 187 "applies": false, 188 "answer": false, 189 "justification": "There are no human participants. The study uses pre-existing benchmark datasets from real-world software projects. This is a purely computational study with no participant recruitment." 190 }, 191 "data_pipeline_documented": { 192 "applies": true, 193 "answer": true, 194 "justification": "The data pipeline is documented: starting from six benchmark datasets, applying the single-hunk restriction, splitting 8:1:1, and sampling 1% from large datasets using three seeds. Table IV shows final dataset statistics. The steps are clear and traceable." 
195 } 196 }, 197 "conflicts_of_interest": { 198 "funding_disclosed": { 199 "applies": true, 200 "answer": true, 201 "justification": "The Acknowledgments section states: 'This research is supported by the Ministry of Education, Singapore under its Academic Research Fund Tier 3 (Award ID: MOET32020-0004).'" 202 }, 203 "affiliations_disclosed": { 204 "applies": true, 205 "answer": true, 206 "justification": "Both authors are affiliated with Singapore Management University (SMU), which is clearly stated on the title page. No commercial affiliations are present; the authors are not evaluating a product they have financial stake in." 207 }, 208 "funder_independent_of_outcome": { 209 "applies": true, 210 "answer": true, 211 "justification": "The funder is the Ministry of Education, Singapore, a government academic funding body with no financial interest in prompt tuning vs. fine-tuning for APR. The disclaimer in Acknowledgments states the views expressed do not reflect those of the Ministry." 212 }, 213 "financial_interests_declared": { 214 "applies": true, 215 "answer": false, 216 "justification": "The paper contains no competing interests statement or declaration of financial interests. While no obvious conflict exists (academic researchers at a university), the absence of an explicit competing interests declaration means this criterion is not met." 217 } 218 }, 219 "contamination": { 220 "training_cutoff_stated": { 221 "applies": true, 222 "answer": false, 223 "justification": "The paper does not state training data cutoff dates for the models used (CodeT5+, GPT-Neo). Since these models were trained on web-scraped code data, the cutoff dates are relevant for assessing whether the benchmark tasks could have appeared in training data." 
224 }, 225 "train_test_overlap_discussed": { 226 "applies": true, 227 "answer": false, 228 "justification": "No discussion of potential overlap between the pre-training data of CodeT5+ or GPT-Neo and the test sets of the APR benchmarks used (Defects4J, etc.). The benchmarks are publicly available and could have been in pre-training corpora." 229 }, 230 "benchmark_contamination_addressed": { 231 "applies": true, 232 "answer": false, 233 "justification": "Some benchmarks like Defects4J and Code Refinement were published before the training cutoffs of CodeT5+ and GPT-Neo, making contamination plausible. The paper does not discuss whether the test instances could have appeared in pre-training data." 234 } 235 }, 236 "human_studies": { 237 "pre_registered": { 238 "applies": false, 239 "answer": false, 240 "justification": "No human participants. This is a computational study evaluating LLMs on APR benchmark datasets." 241 }, 242 "irb_or_ethics_approval": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human participants. This is a computational study with no human subjects." 246 }, 247 "demographics_reported": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants. This is a computational study with no human subjects." 251 }, 252 "inclusion_exclusion_criteria": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants. This is a computational study with no human subjects." 256 }, 257 "randomization_described": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants. This is a computational study with no human subjects." 261 }, 262 "blinding_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants. This is a computational study with no human subjects." 266 }, 267 "attrition_reported": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants. 
This is a computational study with no human subjects." 271 } 272 }, 273 "cost_and_practicality": { 274 "inference_cost_reported": { 275 "applies": true, 276 "answer": false, 277 "justification": "No API costs, tokens consumed, or cost-per-example are reported. The paper mentions hardware used (A40 and H100 GPUs) but does not report inference latency or computational cost of running the approach." 278 }, 279 "compute_budget_stated": { 280 "applies": true, 281 "answer": false, 282 "justification": "The paper specifies the hardware used (NVIDIA A40 48GB GPU for CodeT5+ and H100 80GB GPU for GPT-Neo) but does not report total GPU hours, training time, or total compute budget. Given the comprehensive experiments across many configurations, the compute expenditure is non-trivial but unquantified." 283 } 284 } 285 }, 286 "claims": [ 287 { 288 "claim": "Prompt tuning achieves an average improvement of 87.33% over fine-tuning in data scarcity scenarios for APR tasks.", 289 "evidence": "Table VI reports prompt tuning vs. fine-tuning EM rates across six datasets and three models. The 87.33% is stated as the average improvement for CodeT5+ models (Section V.A). GPT-Neo improves from 0% to 1.03%-54.17% EM rate.", 290 "supported": "moderate" 291 }, 292 { 293 "claim": "Prompt tuning consistently outperforms fine-tuning across all three base models (CodeT5+ 220M, CodeT5+ 770M, GPT-Neo 1.3B) and all six APR datasets.", 294 "evidence": "Table VI shows prompt tuning outperforming fine-tuning on every dataset/model combination. Finding 1 and Finding 2 summarize these results.", 295 "supported": "strong" 296 }, 297 { 298 "claim": "Soft basic prompts (SBP) generally outperform hard basic prompts (HBP) in prompt tuning for APR.", 299 "evidence": "Table VII compares HBP, SBPinitialized, and SBPrandom across multiple datasets and models. SBP variants generally outperform HBP. 
Finding 3 summarizes this result.", 300 "supported": "moderate" 301 }, 302 { 303 "claim": "Incorporating bug- and code-related domain knowledge in knowledge prompts generally improves prompt tuning performance.", 304 "evidence": "Table IX compares basic prompts with various knowledge prompts for Defects4J, ManySStuBs4J, TFix, and xCodeEval. Most knowledge prompts outperform basic prompts. Finding 5 summarizes this.", 305 "supported": "moderate" 306 }, 307 { 308 "claim": "Combining multiple types of domain knowledge does not lead to incremental gains and may reduce performance compared to single knowledge type.", 309 "evidence": "Table IX shows that 'Repair Action + Repair Pattern' for Defects4J and 'Bug Type + AST' for ManySStuBs4J do not outperform single-knowledge prompts. The discussion in Section V.C attributes this to conflicting or noisy information.", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "Prompt tuning is particularly advantageous in extreme data scarcity (1-32 shot) scenarios.", 314 "evidence": "Table X compares prompt tuning and fine-tuning with 1, 8, 16, and 32 training instances. In the 1-shot scenario, fine-tuned models achieve 0% EM while prompt-tuned models achieve non-zero repair rates. Fig. 5 shows trends across training sizes.", 315 "supported": "strong" 316 } 317 ], 318 "methodology_tags": [ 319 "benchmark-eval" 320 ], 321 "key_findings": "Prompt tuning consistently outperforms fine-tuning for Automated Program Repair (APR) in data scarcity scenarios, achieving an average 87.33% improvement over fine-tuning across CodeT5+ 220M and 770M models on six APR benchmarks. The approach, called knowledge prompt tuning, integrates bug- and code-related domain knowledge (repair actions, ASTs, bug types, error messages) into prompt templates, which generally enhances performance beyond basic prompt tuning. Soft prompts (with learnable tokens) outperform hard prompts (fixed natural language tokens) in APR generation tasks. 
The advantage of prompt tuning is especially pronounced in extreme data scarcity settings (1-32 training instances), where fine-tuned models often achieve 0% Exact Match while prompt-tuned models successfully repair some bugs.", 322 "red_flags": [ 323 { 324 "flag": "No statistical significance testing", 325 "detail": "The paper makes comparative claims ('outperforms', 'achieves improvement') based solely on point estimates without any statistical tests. Given the small test set sizes (32-135 instances per dataset) and multiple comparisons across 18+ model-dataset-prompt combinations, the lack of significance testing makes it impossible to distinguish real improvements from chance variation." 326 }, 327 { 328 "flag": "No variance reported for multi-run experiments", 329 "detail": "The paper explicitly runs experiments three times with different seeds and takes averages, but never reports standard deviations or any spread measure. This makes it impossible to assess the stability of results or whether differences between methods are meaningful." 330 }, 331 { 332 "flag": "No benchmark contamination discussion", 333 "detail": "Several benchmarks (Defects4J, Code Refinement) were published before the training cutoffs of CodeT5+ and GPT-Neo. If benchmark instances appeared in pre-training data, this would inflate performance measures and bias comparisons. The paper does not address this threat." 334 }, 335 { 336 "flag": "Small test sets limit reliability", 337 "detail": "Some test sets are very small (ManySStuBs4J: 32 instances, Code Refinement: 65 instances, Defects4J: 64 instances). A 1% EM improvement on 32 instances represents less than one additional correctly repaired bug. Claims of consistent outperformance are fragile at these sample sizes without confidence intervals." 
338 }, 339 { 340 "flag": "Compute cost not reported", 341 "detail": "The paper proposes prompt tuning as a practical alternative to fine-tuning, but does not report training time, GPU hours, or total compute budget despite running hundreds of experiments across multiple models, datasets, and configurations on high-end GPUs (A40 and H100)." 342 } 343 ], 344 "cited_papers": [ 345 { 346 "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation", 347 "authors": [ 348 "Y. Wang", 349 "W. Wang", 350 "S. Joty", 351 "S. C. Hoi" 352 ], 353 "year": 2021, 354 "relevance": "Core LLM for code used in the APR experiments; directly relevant to LLM-based code generation and program repair." 355 }, 356 { 357 "title": "Automated program repair in the era of large pre-trained language models", 358 "authors": [ 359 "C. S. Xia", 360 "Y. Wei", 361 "L. Zhang" 362 ], 363 "year": 2023, 364 "relevance": "Evaluates LLMs for APR across benchmarks, directly relevant to the survey scope on LLM program repair methodology." 365 }, 366 { 367 "title": "Impact of code language models on automated program repair", 368 "authors": [ 369 "N. Jiang", 370 "K. Liu", 371 "T. Lutellier", 372 "L. Tan" 373 ], 374 "year": 2023, 375 "relevance": "Empirical study fine-tuning LLMs for APR tasks; directly related to the methodology being compared in this paper." 376 }, 377 { 378 "title": "An empirical study on fine-tuning large language models of code for automated program repair", 379 "authors": [ 380 "K. Huang", 381 "X. Meng", 382 "J. Zhang", 383 "Y. Liu", 384 "W. Wang", 385 "S. Li", 386 "Y. Zhang" 387 ], 388 "year": 2023, 389 "relevance": "Comprehensive fine-tuning study for LLMs in APR; baseline methodology this paper compares against." 390 }, 391 { 392 "title": "No more fine-tuning? An experimental evaluation of prompt tuning in code intelligence", 393 "authors": [ 394 "C. Wang", 395 "Y. Yang", 396 "C. Gao", 397 "Y. Peng", 398 "H. Zhang", 399 "M. R. 
Lyu" 400 ], 401 "year": 2022, 402 "relevance": "Directly evaluates prompt tuning for code intelligence tasks; foundational comparison work for the paper's approach." 403 }, 404 { 405 "title": "RAP-Gen: Retrieval-augmented patch generation with CodeT5 for automatic program repair", 406 "authors": [ 407 "W. Wang", 408 "Y. Wang", 409 "S. Joty", 410 "S. C. Hoi" 411 ], 412 "year": 2023, 413 "relevance": "Uses retrieval-augmented prompts for APR with CodeT5; directly related to knowledge-enhanced approaches for program repair." 414 }, 415 { 416 "title": "The power of scale for parameter-efficient prompt tuning", 417 "authors": [ 418 "B. Lester", 419 "R. Al-Rfou", 420 "N. Constant" 421 ], 422 "year": 2021, 423 "relevance": "Foundational work on prompt tuning methodology that this paper extends to APR tasks." 424 }, 425 { 426 "title": "Assessing the effectiveness of vulnerability detection via prompt tuning: An empirical study", 427 "authors": [ 428 "G. Lu", 429 "X. Ju", 430 "X. Chen", 431 "S. Yang", 432 "L. Chen", 433 "H. Shen" 434 ], 435 "year": 2023, 436 "relevance": "Evaluates prompt tuning for security vulnerability detection in code; relevant to LLM-based program analysis." 437 }, 438 { 439 "title": "VulRepair: A T5-based automated software vulnerability repair", 440 "authors": [ 441 "M. Fu", 442 "C. Tantithamthavorn", 443 "T. Le", 444 "V. Nguyen", 445 "D. Phung" 446 ], 447 "year": 2022, 448 "relevance": "T5-based fine-tuning for vulnerability repair; directly related to LLM-based APR methodology." 449 }, 450 { 451 "title": "Prefix-tuning: Optimizing continuous prompts for generation", 452 "authors": [ 453 "X. L. Li", 454 "P. Liang" 455 ], 456 "year": 2021, 457 "relevance": "Foundational work on soft/continuous prompt tuning methods that underpin the approach evaluated in this paper." 458 }, 459 { 460 "title": "TBar: Revisiting template-based automated program repair", 461 "authors": [ 462 "K. Liu", 463 "A. Koyuncu", 464 "D. Kim", 465 "T. F. 
Bissyande" 466 ], 467 "year": 2019, 468 "relevance": "Template-based APR baseline system that LLM-based approaches are compared against." 469 }, 470 { 471 "title": "How effective are neural networks for fixing security vulnerabilities", 472 "authors": [ 473 "Y. Wu", 474 "N. Jiang", 475 "H. V. Pham", 476 "T. Lutellier", 477 "J. Davis", 478 "L. Tan" 479 ], 480 "year": 2023, 481 "relevance": "Evaluates neural APR for security vulnerability repair; relevant to LLM-based program repair methodology." 482 } 483 ] 484 }