scan.json (29206B)
1 { 2 "paper": { 3 "title": "A Multi-Dataset Evaluation of Models for Automated Vulnerability Repair", 4 "authors": [ 5 "Zanis Ali Khan", 6 "Aayush Garg", 7 "Qiang Tang" 8 ], 9 "year": 2025, 10 "venue": "ARES", 11 "arxiv_id": "2506.04987", 12 "doi": "10.48550/arXiv.2506.04987" 13 }, 14 "scan_version": 3, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "CodeT5 generally outperforms CodeBERT on complex vulnerability datasets (e.g., Vul4J CodeBLEU 0.94 vs 0.37) while CodeBERT handles fragmented or incomplete code better (Go, PHP). Fine-tuning improves in-distribution performance but both models struggle significantly on out-of-distribution data, with same-language cross-dataset transfer performing better than cross-language. Only BLEU-based similarity metrics were used—no functional correctness evaluation of generated patches.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "Section 9 (Data Availability): 'the source code and datasets used in our study are publicly available on Zenodo [24].' Reference [24] provides DOI 10.5281/zenodo.15599983." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "All six datasets are publicly available: Go/PHP on Zenodo, MegaVul on GitHub, Vul4J on GitHub, CodeParrot on HuggingFace. The authors also released their processed data via Zenodo [24]." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "Section 4 mentions 'PyTorch 2.0.1 framework with CUDA 12 compatibility' and hardware specs (Xeon Silver 4210, V100 GPUs), but no requirements.txt, Dockerfile, or comprehensive dependency list is provided in the paper. The Python version is not stated." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions are provided in the paper. 
The Zenodo artifact is referenced but no README contents or reproduction steps are described." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Table 2 reports only point estimates (e.g., CodeBLEU 0.7641, CrystalBLEU 0.6557) with no confidence intervals, error bars, or uncertainty measures." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "No statistical significance tests are used. Claims that 'CodeT5 outperforms CodeBERT' are based solely on comparing raw score values without any statistical test." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": false, 55 "justification": "No formal effect sizes are reported. Results are presented as raw scores in Table 2 without Cohen's d, relative improvement calculations with baseline context, or other structured magnitude measures." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "No justification is given for dataset sizes or power analysis. The choice of 85/15 split is justified only by citation to a computer vision paper [42], not by any analysis specific to this domain." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "Section 5.2 acknowledges 'non-deterministic behavior (e.g., small variations in accuracy)' but no standard deviations, variance, or spread measures across runs are reported anywhere." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "CodeBERT and CodeT5 serve as mutual baselines, compared against each other across all six datasets in Table 2. However, no external baselines from prior vulnerability repair work (e.g., VRepair, VulRepair) are included." 
73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": false, 77 "justification": "CodeBERT (2020) and CodeT5 (2021) are both 4-5 years old at the time of publication. More recent code LLMs (CodeT5+, StarCoder, DeepSeek-Coder, CodeLlama) and vulnerability-specific repair tools are not compared." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": false, 82 "justification": "No ablation study is performed. The preprocessing pipeline has multiple components (token filtering, comment removal, normalization) whose individual contributions are not measured." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Two metrics are used: CodeBLEU and CrystalBLEU, as described in Section 4.3 and reported in Table 2." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "No human evaluation of generated patches is performed. All evaluation is automated via CodeBLEU and CrystalBLEU similarity metrics." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "Section 3.2: 'The datasets were partitioned into 85% for training and 15% for testing... all overlapping or duplicate instances were excluded.'" 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table 2 provides per-dataset results for both models and both metrics. Figures 1 and 2 show cross-dataset heatmaps for the out-of-distribution evaluation." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": true, 107 "justification": "Section 5.1 discusses why models fail on fragmented/incomplete code (Go, PHP datasets). Section 5.2 discusses the performance drop on out-of-distribution data. CodeBERT's poor Vul4J performance is analyzed." 
108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper reports that both models struggle with OOD generalization, that fine-tuning doesn't transfer across languages, and CodeBERT's very low Vul4J score (0.37 CodeBLEU, 0.12 CrystalBLEU)." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "Abstract claims about CodeBERT handling fragmented context better, CodeT5 excelling on complex patterns, and generalization struggles are all supported by Table 2 and the heatmap results in Sections 5.1-5.2." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": false, 124 "justification": "The paper makes causal claims like 'CodeT5's broad pre-training excels' and 'fine-tuning improves in-distribution performance' without controlling for confounds such as model size, architecture differences, or training data composition. No controlled experiments isolate these factors." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The abstract claims to 'provide actionable insights to advance automated vulnerability patching for real-world security applications' but evaluation uses only BLEU-based similarity metrics, not functional correctness or security evaluation. The title and conclusions generalize beyond the tested setting." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": true, 134 "justification": "Section 5.1 considers that dataset characteristics (fragmented code, incomplete functions) explain performance differences. Section 7 discusses model randomness, hyperparameter sensitivity, and metric limitations as alternative factors." 
135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section 7 (Construct Validity) explicitly acknowledges: 'We evaluate correct patches using CodeBLEU and CrystalBLEU, which primarily gauge syntactic and limited semantic cues... they may overlook deeper security implications and potential exploit vectors.'" 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper refers only to 'CodeBERT' and 'CodeT5' without specifying exact model checkpoints, sizes, or HuggingFace model IDs (e.g., microsoft/codebert-base, Salesforce/codet5-base)." 147 }, 148 "prompts_provided": { 149 "applies": false, 150 "answer": false, 151 "justification": "The paper uses fine-tuning (not prompting). CodeBERT and CodeT5 are fine-tuned on vulnerability datasets in an encoder-decoder setup, not prompted." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "No hyperparameters are reported. Section 3.3 mentions fine-tuning but does not state learning rate, batch size, number of epochs, optimizer, or any training configuration." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. The models are directly fine-tuned and evaluated in a standard sequence-to-sequence setup." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 3.1 documents three preprocessing steps (token length filtering at 512 tokens, comment removal via language-specific regex, normalization). Table 1 shows the number of rows affected at each stage." 
167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 7 'Threats to Validity' is a dedicated section with three subsections: Construct Validity, Internal Validity, and External Validity." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 7 discusses specific threats: CodeBLEU/CrystalBLEU may overlook security implications, token truncation may remove vital context, model randomness affects comparative outcomes, OOD performance drop reflects limited generalizability." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "While Section 7 acknowledges limitations, it does not explicitly state what the results do NOT show. Statements like 'real-world projects frequently rely on specialized libraries and domain-specific coding styles' are generic rather than listing specific untested scenarios or claims the authors are not making." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": true, 190 "justification": "All datasets are publicly available (Go/PHP on Zenodo, MegaVul on GitHub, Vul4J on GitHub, CodeParrot on HuggingFace). The authors' processed data is released via Zenodo [24]." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 4.1 describes the six datasets used, their sources, and programming languages covered. Section 3.1 describes preprocessing steps." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. All data sources are standard public vulnerability datasets and code corpora." 
201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "Table 1 documents the pipeline: initial rows (Irows) → rows removed by tokenization (Rtok.) → rows removed by comment removal (Rcomm.) → rows removed by normalization (Rnorm.) → final rows (Trows) for each dataset." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Acknowledgments section discloses funding from the European Commission Horizon Europe Programme, LAZARUS project (Grant Agreement No. 101070303)." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All authors are listed as affiliated with Luxembourg Institute of Science and Technology (LIST). They do not evaluate their own product." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": true, 222 "justification": "EU Horizon Europe funding has no financial stake in whether CodeBERT or CodeT5 performs better on vulnerability repair tasks." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests statement or financial disclosure is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": true, 233 "answer": false, 234 "justification": "No mention of when CodeBERT's or CodeT5's pre-training data was collected. The models were pre-trained on large code corpora but the training cutoff dates are not discussed." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": true, 238 "answer": false, 239 "justification": "Section 3.2 addresses overlap in their own train/test split but does not discuss whether CodeBERT's or CodeT5's pre-training data overlaps with the vulnerability datasets used for evaluation." 
240 }, 241 "benchmark_contamination_addressed": { 242 "applies": true, 243 "answer": false, 244 "justification": "Several datasets (MegaVul, Vul4J, CodeParrot) were publicly available before or around the time CodeBERT and CodeT5 were trained. No discussion of whether these models may have seen the evaluation data during pre-training." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. It is a benchmark evaluation of automated models." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants. The study evaluates models on publicly available code datasets." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No inference cost, latency, or per-example time is reported despite fine-tuning and evaluating models on an HPC cluster." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "Hardware is described (Xeon Silver 4210, V100 GPUs) but total GPU hours, training time, or compute budget are not stated." 
294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "Section 5.2 acknowledges 'non-deterministic behavior (e.g., small variations in accuracy)' but no results across multiple seeds are reported." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The number of experimental runs is never stated. Results appear to be from single runs." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search is described. It is unclear whether any tuning was performed or what configurations were tried." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "No description of how final model configurations were selected. No validation set selection procedure is described." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "No statistical tests are performed at all, so no correction for multiple comparisons is applied despite comparing across 6 datasets × 2 models × 2 metrics." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors create their own fine-tuning pipeline and preprocessing but do not acknowledge potential bias in their implementation choices or evaluation setup." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of compute budgets for either model. CodeBERT and CodeT5 differ in architecture and size but performance is not normalized by compute cost." 
331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": true, 335 "justification": "Section 7 (Construct Validity) explicitly discusses that CodeBLEU and CrystalBLEU 'primarily gauge syntactic and limited semantic cues' and 'may overlook deeper security implications and potential exploit vectors.'" 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No scaffolding is involved. Models are fine-tuned and evaluated directly in a standard sequence-to-sequence setup." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether CodeBERT/CodeT5's pre-training data includes code from after the vulnerability datasets were created, or vice versa." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the input format leaks information about the expected patch, or whether vulnerability labels provide unintended signal." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": true, 357 "justification": "Section 3.2 states 'all overlapping or duplicate instances were excluded' between train and test splits, partially addressing non-independence." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No leakage detection method is applied (no canary strings, membership inference, n-gram overlap analysis, or temporal splits)." 
363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "CodeT5 consistently outperforms CodeBERT on datasets with complex vulnerability patterns (Vul4J, CodeParrot, MegaVul).", 369 "evidence": "Table 2: Vul4J CodeBLEU CodeT5=0.9373 vs CodeBERT=0.3737; CodeParrot CodeBLEU CodeT5=0.9973 vs CodeBERT=0.997; MegaVul_C CodeBLEU CodeT5=0.8549 vs CodeBERT=0.8396 (Section 5.1).", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "CodeBERT performs better than CodeT5 on datasets with fragmented or incomplete code (Go, PHP).", 374 "evidence": "Table 2: Go CodeBLEU CodeBERT=0.7641 vs CodeT5=0.6499; PHP CodeBLEU CodeBERT=0.7351 vs CodeT5=0.6924. Authors attribute this to CodeBERT's robustness with incomplete context (Section 5.1).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Fine-tuning improves in-distribution performance but models fail to generalize to out-of-distribution datasets.", 379 "evidence": "Figures 1 and 2 show heatmaps where diagonal (in-distribution) scores are high but off-diagonal (OOD) scores drop significantly. Section 5.2 discusses this pattern.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Models trained on one programming language transfer better to datasets in the same language than across languages.", 384 "evidence": "Section 5.2: 'when a model trained on a specific programming language is tested on the same language—for example, trained on Megavul_C_2023 and tested on Megavul_C_2024—the accuracy remains high.' 
Similar for Vul4J/CodeParrot (both Java).", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "CodeT5 demonstrates superior scalability compared to CodeBERT.", 389 "evidence": "Abstract and Section 5.1 state this but no scalability experiments (varying dataset size, model size, or compute) are presented to support it.", 390 "supported": "weak" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "Duplicate datasets inflating diversity", 396 "detail": "MegaVul_C_2023 and MegaVul_C_2024 have identical row counts (17,975 initial, 14,526 final), identical preprocessing stats, and nearly identical results (CodeBLEU: 0.8396/0.8395 CodeBERT, 0.8549/0.8549 CodeT5; CrystalBLEU: 0.7893/0.7893, 0.8131/0.8131). These appear to be the same dataset counted twice, inflating the claimed '6 datasets' to effectively 5." 397 }, 398 { 399 "flag": "No error bars despite acknowledged non-determinism", 400 "detail": "Section 5.2 acknowledges 'non-deterministic behavior' and 'small variations in accuracy' but all results are single-point estimates. Without variance measures, it is impossible to determine whether observed differences are statistically meaningful." 401 }, 402 { 403 "flag": "Outdated models with no contemporary comparisons", 404 "detail": "CodeBERT (2020) and CodeT5 (2021) are 4-5 years old. The paper does not compare against newer code LLMs (CodeT5+, StarCoder, DeepSeek-Coder, CodeLlama) or specialized vulnerability repair tools, limiting the practical relevance of the findings." 405 }, 406 { 407 "flag": "No functional correctness evaluation", 408 "detail": "Evaluation uses only text similarity metrics (CodeBLEU, CrystalBLEU). Generated patches are never checked for compilation, test passage, or actual vulnerability remediation. A patch that is textually similar to the ground truth may not compile or may still be vulnerable." 
409 }, 410 { 411 "flag": "Missing hyperparameters prevent reproduction", 412 "detail": "No learning rate, batch size, number of epochs, optimizer, warmup schedule, or other training hyperparameters are reported, making the experiments unreproducible even with the released code." 413 }, 414 { 415 "flag": "Scalability claim without evidence", 416 "detail": "The abstract claims 'CodeT5 also demonstrates superior scalability' but no scalability experiments are presented—no varying of dataset size, model size, or compute budget." 417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "APR4Vul: An Empirical Study of Automatic Program Repair Techniques on Real-World Java Vulnerabilities", 422 "authors": ["Quang-Cuong Bui", "Ranindya Paramitha", "Duc-Ly Vu", "Fabio Massacci", "Riccardo Scandariato"], 423 "year": 2024, 424 "doi": "10.1007/s10664-023-10415-7", 425 "relevance": "Empirical evaluation of APR techniques on real-world Java vulnerabilities, directly relevant to the survey's focus on automated vulnerability repair." 426 }, 427 { 428 "title": "CodeBERT: A Pre-Trained Model for Programming and Natural Languages", 429 "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"], 430 "year": 2020, 431 "arxiv_id": "2002.08155", 432 "relevance": "Foundational pre-trained code model evaluated in this study and widely used in LLM-based code understanding research." 433 }, 434 { 435 "title": "CodeT5: Identifier-Aware Unified Pre-Trained Encoder-Decoder Models for Code Understanding and Generation", 436 "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven C.H. Hoi"], 437 "year": 2021, 438 "arxiv_id": "2109.00859", 439 "relevance": "Key pre-trained code generation model evaluated in this study, representative of encoder-decoder approaches to code tasks." 
440 }, 441 { 442 "title": "Enhanced Automated Code Vulnerability Repair Using Large Language Models", 443 "authors": ["David de-Fitero-Dominguez", "Eva García-López", "Antonio García-Cabot", "José-Javier Martínez-Herráiz"], 444 "year": 2024, 445 "doi": "10.1016/j.engappai.2024.109291", 446 "relevance": "LLM-based vulnerability repair work that motivates the need for generalizable patch generation across vulnerability types." 447 }, 448 { 449 "title": "BERTVRepair: On the Adoption of CodeBERT for Automated Vulnerability Code Repair", 450 "authors": ["N.N.H. Dang", "T.Q. Thanh", "A. Nguyen-Duc"], 451 "year": 2024, 452 "doi": "10.1007/978-3-031-55642-5_8", 453 "relevance": "Directly studies CodeBERT adoption for vulnerability repair, closely related to this paper's evaluation scope." 454 }, 455 { 456 "title": "How Effective Are Neural Networks for Fixing Security Vulnerabilities", 457 "authors": ["Yi Wu", "Nan Jiang", "Hung Viet Pham", "Thibaud Lutellier", "Jordan Davis", "Lin Tan", "Petr Babkin", "Sameena Shah"], 458 "year": 2023, 459 "doi": "10.1145/3597926.3598135", 460 "relevance": "Evaluates neural network effectiveness for security vulnerability repair, directly relevant to LLM-based automated program repair." 461 }, 462 { 463 "title": "Uncovering the Limits of Machine Learning for Automatic Vulnerability Detection", 464 "authors": ["Niklas Risse", "Marcel Böhme"], 465 "year": 2024, 466 "relevance": "Investigates fundamental limits of ML for vulnerability detection, relevant to understanding generalization challenges in AI-based security." 467 }, 468 { 469 "title": "T5APR: Empowering Automated Program Repair Across Languages Through Checkpoint Ensemble", 470 "authors": ["Reza Gharibi", "Mohammad Hadi Sadreddini", "Seyed Mostafa Fakhrahmad"], 471 "year": 2024, 472 "doi": "10.1016/j.jss.2024.112083", 473 "relevance": "T5-based multi-language program repair approach, directly relevant to cross-language vulnerability repair evaluation." 
474 }, 475 { 476 "title": "Patching as Translation: The Data and the Metaphor", 477 "authors": ["Yangruibo Ding", "Baishakhi Ray", "Premkumar Devanbu", "Vincent J. Hellendoorn"], 478 "year": 2021, 479 "doi": "10.1145/3324884.3416587", 480 "relevance": "Frames program repair as machine translation, foundational work for the sequence-to-sequence vulnerability patching approach used in this study." 481 }, 482 { 483 "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair", 484 "authors": ["Zimin Chen", "Steve Kommrusch", "Michele Tufano", "Louis-Noël Pouchet", "Denys Poshyvanyk", "Martin Monperrus"], 485 "year": 2021, 486 "doi": "10.1109/TSE.2019.2940179", 487 "relevance": "Early neural machine translation approach to program repair, directly relevant to the NMT lineage of automated code fixing." 488 }, 489 { 490 "title": "MegaVul: A C/C++ Vulnerability Dataset with Comprehensive Code Representations", 491 "authors": ["Chao Ni", "Liyu Shen", "Xiaohu Yang", "Yan Zhu", "Shaohua Wang"], 492 "year": 2024, 493 "doi": "10.1145/3643991.3644886", 494 "relevance": "One of the primary evaluation datasets used in this study; large-scale vulnerability dataset relevant to benchmarking code repair tools." 495 }, 496 { 497 "title": "GraphCodeBERT: Pre-Training Code Representations with Data Flow", 498 "authors": ["Daya Guo", "Shuo Ren", "Shuai Lu"], 499 "year": 2021, 500 "relevance": "Pre-trained code model that extends CodeBERT with data flow analysis, relevant to understanding the landscape of code understanding models." 501 } 502 ], 503 "engagement_factors": { 504 "practical_relevance": { 505 "score": 1, 506 "justification": "Evaluates models for vulnerability patching but provides no ready-to-use tool or clear actionable workflow for practitioners." 507 }, 508 "surprise_contrarian": { 509 "score": 0, 510 "justification": "Confirms expected results: fine-tuning helps in-distribution, OOD generalization is hard, larger pre-training helps on complex tasks." 
511 }, 512 "fear_safety": { 513 "score": 1, 514 "justification": "Security-relevant topic (vulnerability repair) but no novel attack, no demonstration of dangerous capability gaps." 515 }, 516 "drama_conflict": { 517 "score": 0, 518 "justification": "No controversy, no provocative claims, straightforward benchmark comparison." 519 }, 520 "demo_ability": { 521 "score": 1, 522 "justification": "Zenodo artifact released but it's a research reproduction package, not a pip-installable tool or live demo." 523 }, 524 "brand_recognition": { 525 "score": 0, 526 "justification": "Authors from LIST (Luxembourg), not a well-known AI lab. Models evaluated (CodeBERT, CodeT5) are established but not headline-generating." 527 } 528 } 529 }