scan.json (29739B)
1 { 2 "paper": { 3 "title": "Neural Program Repair with Program Dependence Analysis and Effective Filter Mechanism", 4 "authors": [ 5 "Yuwei Zhang", 6 "Ge Li", 7 "Zhi Jin", 8 "Ying Xing" 9 ], 10 "year": 2023, 11 "venue": "arXiv.org", 12 "arxiv_id": "2305.09315", 13 "doi": "10.48550/arXiv.2305.09315" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "REPEATNPR combines program dependence graph slicing with an ensemble filter mechanism for neural program repair of single-line Java bugs. On five benchmarks, it outperforms 10 baselines using exact match: 2540 correct patches on BFP (vs 2191 for CodeT5, a 15.9% gain) and similar improvements on Bugs.jar, Defects4J, Bears, and QuixBugs. Ablation shows PDG-based context extraction contributes more than the filter mechanism, and both components are independently beneficial across all pre-trained models tested.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper. The implementation uses PyTorch and CodeT5-small but no code release is mentioned." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The paper uses publicly available datasets: BFP [11], Bugs.jar [53], Defects4J [54], Bears [55], and QuixBugs [56]. These are standard public benchmarks that were not modified in a way that requires separate release." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper mentions PyTorch, CodeT5-small from HuggingFace, PROGEX, and 4 Nvidia GTX 1080Ti GPUs, but provides no requirements.txt, Docker file, or detailed dependency listing with library versions sufficient to recreate the environment." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology is described in prose but there are no concrete commands or execution steps." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "All results in Tables II–VI and Figure 4 are reported as point estimates (raw counts or percentages) with no confidence intervals or error bars." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper claims REPEATNPR 'substantially outperforms' and 'significantly outperforms' baselines (Section V-A) based solely on comparing raw numbers. No statistical significance tests (p-values, t-tests, etc.) are reported." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Percentage improvements are reported with baseline context: 'REPEATNPR produces more correct patches than the best baseline model CodeT5 by 15.9% in the BFP benchmark, 12.0% in the Bugs.jar benchmark, 22.2% in the Defects4J benchmark, 62.5% in the Bears benchmark, 7.1% in the QuixBugs benchmark' (Section V-A1)." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification for dataset sizes or power analysis. The BFP dataset size (141k training) and benchmark sizes are stated but not justified. Some benchmarks are very small (QuixBugs: 32 bugs, Bears: 119 bugs) without discussion of statistical power." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. All results appear to be from single runs." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Ten baselines are compared: CODIT, Edits, Tufano, Recoder, CoCoNut, SEQUENCER, RoBERTa, CodeBERT, GraphCodeBERT, and CodeT5 (Section IV-B1, Table II)." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Baselines span 2019–2022 (CodeT5 2021, Recoder 2021, GraphCodeBERT 2021, SEQUENCER 2021, CoCoNut 2020), which are contemporary for a 2023 paper in the neural program repair space." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Table III presents an ablation study removing program dependence analysis (A) and filter mechanism (F) independently. Table V further investigates model order and quantity in the filter mechanism." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "The paper reports exact match counts (Table II), Fix@k for k=1,5,10 (Tables IV-V, Figure 4), and per-bug-type breakdown (Table VI with Simple Delete, Simple Insert, Simple Replace, Mixed)." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "No human evaluation is included. The paper explicitly uses exact match to 'avoid human bias and reduce manual effort' (Section IV-B3), acknowledging in the threats section that they plan future human evaluations." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": true, 98 "justification": "The BFP dataset is split 8:1:1 into training, validation, and testing sets (Section IV-A). Results are reported on the test set. Model selection uses validation loss. Four additional benchmarks serve as fully external test sets." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table VI provides per-bug-type breakdowns (Simple Delete, Simple Insert, Simple Replace, Mixed) on BFP. Table II provides per-benchmark breakdowns. Figure 7 shows overlapping patching rates." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "The unaltered patching phenomenon is identified and analyzed as a systematic failure mode (Section I, Fig. 1). Figure 5 shows a specific example where ablated models fail but REPEATNPR succeeds. Section V-C2 discusses the overlapping phenomenon among model outputs." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Table V shows that changing model order in the filter mechanism decreases performance (FSRA+CodeT5A underperforms FCodeT5A+SRA). Adding a third model (GCBA) yields diminishing returns. Section V-C2 discusses overlap limitations." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims that REPEATNPR 'demonstrates effectiveness on five benchmarks when compared with state-of-the-art baselines' are supported by Table II showing improvements across all five benchmarks." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "Causal claims about component contributions ('component A improves performance') are supported by controlled ablation experiments in Table III, where single components are added/removed while other factors remain constant." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper clearly states scope: 'REPEATNPR framework focuses on single-line bugs written in the Java programming language' (Section III). The abstract specifies 'fixing single-line Java bugs.' Threats section acknowledges Java-only evaluation." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "Section VI discusses threats to validity (hyper-parameter sensitivity, exact match metric, generalizability) but does not substantively discuss alternative explanations for the observed improvements. For example, the possibility that improvements come from additional input tokens rather than PDG structure is not considered." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper explicitly distinguishes its proxy (exact match to human-written patch) from true correctness in Section IV-B3: 'Although such a metric does not represent human judgment, it is a strict and objective metric.' The construct threat in Section VI further acknowledges this gap." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper specifies 'CodeT5-small' with the exact HuggingFace checkpoint URL (https://huggingface.co/Salesforce/codet5-small) in Section IV-B2. For baselines, publicly released checkpoints or source codes are used." 148 }, 149 "prompts_provided": { 150 "applies": false, 151 "answer": false, 152 "justification": "The paper fine-tunes CodeT5 rather than using prompting. The input is tokenized code sequences with special tokens, not natural language prompts." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section IV-B2 reports: max sequence length 512, batch size 32, beam size 10, candidate number 10, max 20 epochs, early stopping patience 5, 8-headed attention, 6 layers in encoder and decoder. Uses same hyperparameters as Raffel et al. [59]." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. REPEATNPR is a fine-tuned encoder-decoder model with a post-processing filter step, not an agentic system." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section III-A and IV-A describe preprocessing: PROGEX parses Java methods into PDGs, instances failing parsing are filtered out, sequences longer than 512 after subword tokenization are truncated, dataset split 8:1:1 ensuring no cross-repository contamination. Table I provides final statistics." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section VI 'Threats to Validity' provides a dedicated section with substantive discussion of external, internal, and construct threats." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "Specific threats are discussed: hyper-parameter sensitivity with acknowledgment of computational resource constraints preventing thorough exploration, exact match not representing human judgment, evaluation limited to Java, and quality of the BFP dataset as the primary training source." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": true, 184 "justification": "Scope is clearly bounded: single-line Java bugs only, evaluated with exact match metric, fine-tuned on BFP dataset. Section VII explicitly notes plans to 'extend REPEATNPR with small modifications to support new target languages' indicating current Java-only scope." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": true, 191 "justification": "All datasets used are publicly available: BFP [11], Bugs.jar [53], Defects4J [54], Bears [55], QuixBugs [56]. However, the authors' specific adapted dataset (after PDG processing and filtering) is not released." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "BFP dataset origin is described: '787k bug-fixing commits extracted from the GitHub repositories' (Section IV-A). The adaptation process (PDG parsing, filtering, truncation, splitting) is documented with final counts in Table I." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. Data sources are standard public benchmarks with documented collection procedures in their original papers." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline is documented: BFP 787k commits → PROGEX PDG parsing (filtering failures) → truncation at 512 tokens → 8:1:1 split with repository-level dedup → final counts in Table I (141195 train, 13523 validation, 13635 test)." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding, acknowledgments, or grant information is mentioned anywhere in the paper." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly stated: Peking University (Key Laboratory of High Confidence Software Technologies) and Beijing University of Posts and Telecommunications." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding is disclosed, so independence cannot be assessed. The paper evaluates the authors' own framework, not a commercial product." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "The paper does not state CodeT5's pre-training data cutoff date. CodeT5 was pre-trained on a 'colossal clean crawled corpus' but no temporal boundary is given for what code it may have seen." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Section IV-A explicitly addresses overlap: 'we remove all duplicate instances between the training and testing sets' and 'any two instances belonging to the same GitHub repository cannot be put in two different sets.'" 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "The paper does not discuss whether CodeT5's pre-training data contained code from BFP, Bugs.jar, Defects4J, Bears, or QuixBugs benchmarks. These are all publicly available and could be in the pre-training corpus." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study. Evaluation is entirely automated using exact match on benchmark datasets." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in the study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No inference cost, latency, or per-example timing is reported. The paper does not mention how long inference takes or how many patches can be generated per unit time." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Hardware is stated (4 Nvidia GTX 1080Ti GPUs, 12GB each) in Section IV-B2, but total training time, GPU hours, or compute budget is not reported." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from a single run." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of experimental runs is not stated. Results are presented as single values without any indication of how many runs produced them." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search budget is reported. The paper acknowledges: 'Due to the limitation of computational resources, we cannot thoroughly explore optimal hyper-parameters' (Section VI) and uses settings from Raffel et al. [59], but does not state what, if any, search was performed." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "Model selection is done on the validation set: 'After each epoch, we compute the loss on the validation set and save the model with the minimum validation loss' (Section IV-B2)." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "The paper makes numerous comparisons across 10 baselines and 5 benchmarks but uses no statistical tests at all, let alone correction for multiple comparisons." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors retrain all baselines 'via the same training strategy and hyper-parameter settings' (Section V-A) which may disadvantage methods designed for different configurations. This potential bias is not acknowledged." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": false, 331 "justification": "REPEATNPR uses an ensemble of two models (CodeT5 + SEQUENCER) but the additional compute cost compared to single-model baselines is not discussed or quantified." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": true, 336 "justification": "Section IV-B3 and Section VI (construct threat) discuss that exact match 'does not represent human judgment' and is 'a strict and objective metric.' The paper acknowledges the limitation and plans future human evaluations." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No agentic scaffolding is used. Models are fine-tuned encoder-decoder models evaluated directly." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of temporal aspects of the BFP dataset or whether training data commits predate test data commits. No temporal split is mentioned." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the input context (PDG slices, method signatures) could leak information about the correct fix in ways that would not be available in a real debugging scenario." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": true, 358 "justification": "Section IV-A explicitly states: 'any two instances belonging to the same GitHub repository cannot be put in two different sets (e.g., one in training and the other in testing).' This addresses non-independence from shared repository origins." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": true, 363 "justification": "Repository-level splitting is applied as a concrete prevention method: instances from the same GitHub repository are kept in the same split. Duplicate removal between train and test sets is also performed." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "REPEATNPR outperforms 10 state-of-the-art baselines on all five benchmarks in exact match metric.", 370 "evidence": "Table II shows REPEATNPR generates 2540 correct patches on BFP (vs 2191 for CodeT5), 168 on Bugs.jar (vs 150), 44 on Defects4J (vs 36/38), 26 on Bears (vs 16), and 15 on QuixBugs (vs 14/15). Section V-A1.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Program dependence analysis improves bug-fixing performance of all pre-trained models tested.", 375 "evidence": "Table IV shows Fix@10 improvements: RoBERTa 11.23%→11.82%, CodeBERT 13.79%→14.24%, GraphCodeBERT 13.38%→13.92%, CodeT5 17.92%→19.89%. Section V-B2.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "The filter mechanism increases correct patches by eliminating unaltered patches and passing them to a second model.", 380 "evidence": "Table III shows CodeT5A generates 2431 patches on BFP while REPEATNPR (CodeT5A + filter) generates 2540, a 4.5% improvement. Gains are consistent across all five benchmarks. Section V-B1.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Pre-trained models are more promising than models trained from scratch for neural program repair.", 385 "evidence": "Table II shows CodeT5 (pre-trained) generates 2191 patches vs SEQUENCER (trained from scratch) at 1917 on BFP. Section V-A1 notes 'CodeT5 improves SEQUENCER by 14.3% in the BFP benchmark and 51.5% in the Bugs.jar benchmark.'", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "The order of models in the filter mechanism affects performance.", 390 "evidence": "Table V shows FCodeT5A+SRA achieves Fix@10 of 20.78% while FSRA+CodeT5A achieves only 17.23% on BFP. Section V-B3.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "REPEATNPR generates 390 unique correct patches that no other model can produce.", 395 "evidence": "Figure 7 diagonal shows REPEATNPR has 390 unique patches on BFP, second only to Recoder. Section V-C2.", 396 "supported": "moderate" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "No error bars or variance across runs", 402 "detail": "All experimental results are reported as single-point estimates. No standard deviation, confidence intervals, or multiple-run results are provided, making it impossible to assess result stability. This is particularly concerning for DL models known to be sensitive to initialization." 403 }, 404 { 405 "flag": "No statistical significance tests", 406 "detail": "Claims of 'significantly outperforms' and 'substantially outperforms' are made based solely on comparing raw numbers. With some benchmarks having very small sizes (QuixBugs: 32, Bears: 119), differences may not be statistically meaningful." 407 }, 408 { 409 "flag": "Baselines retrained with unified settings", 410 "detail": "All baselines are retrained 'via the same training strategy and hyper-parameter settings' (Section V-A). Methods like CoCoNut and Recoder were designed with specific architectures and training procedures; forcing them into uniform settings may systematically disadvantage them." 411 }, 412 { 413 "flag": "No code released", 414 "detail": "Despite proposing a novel framework with multiple components (PDG analysis, filter mechanism), no source code is provided, preventing independent verification or replication." 415 }, 416 { 417 "flag": "Small benchmark sizes for some evaluation sets", 418 "detail": "QuixBugs has only 32 bugs and Bears 119 bugs. Differences of 1-2 patches on QuixBugs (e.g., 14 vs 15) are presented as improvements but are not statistically meaningful at this scale." 419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "An empirical study on learning bug-fixing patches in the wild via neural machine translation", 424 "authors": ["M. Tufano", "C. Watson", "G. Bavota", "M. D. Penta", "M. White", "D. Poshyvanyk"], 425 "year": 2019, 426 "relevance": "Foundational neural program repair dataset (BFP) and NMT-based approach used as the primary training data source and baseline." 427 }, 428 { 429 "title": "CodeT5: Identifier-aware unified pre-trained encoder-decoder models for code understanding and generation", 430 "authors": ["Y. Wang", "W. Wang", "S. R. Joty", "S. C. H. Hoi"], 431 "year": 2021, 432 "relevance": "Pre-trained code model used as the backbone architecture for REPEATNPR; demonstrates transfer learning effectiveness for code tasks." 433 }, 434 { 435 "title": "CodeBERT: A pre-trained model for programming and natural languages", 436 "authors": ["Z. Feng", "D. Guo", "D. Tang", "N. Duan", "X. Feng", "M. Gong"], 437 "year": 2020, 438 "relevance": "Bimodal pre-trained code model used as a baseline; early example of pre-training for code understanding." 439 }, 440 { 441 "title": "GraphCodeBERT: Pre-training code representations with data flow", 442 "authors": ["D. Guo", "S. Ren", "S. Lu", "Z. Feng", "D. Tang", "S. Liu"], 443 "year": 2021, 444 "relevance": "Graph-based pre-trained code model incorporating data flow; used as baseline and shows relevance of structural code features." 445 }, 446 { 447 "title": "CoCoNut: Combining context-aware neural translation models using ensemble for program repair", 448 "authors": ["T. Lutellier", "H. V. Pham", "L. Pang", "Y. Li", "M. Wei", "L. Tan"], 449 "year": 2020, 450 "relevance": "Ensemble-based neural program repair approach; directly relevant as it combines multiple models for bug fixing, similar to REPEATNPR's filter mechanism." 451 }, 452 { 453 "title": "CURE: code-aware neural machine translation for automatic program repair", 454 "authors": ["N. Jiang", "T. Lutellier", "L. Tan"], 455 "year": 2021, 456 "relevance": "Code-aware NMT for program repair using pre-trained GPT module for contextual embeddings." 457 }, 458 { 459 "title": "Recoder: A syntax-guided edit decoder for neural program repair", 460 "authors": ["Q. Zhu", "Z. Sun", "Y. Xiao", "W. Zhang", "K. Yuan", "Y. Xiong", "L. Zhang"], 461 "year": 2021, 462 "relevance": "Syntax-guided program repair using AST-based edit operations; contrasting approach to NMT-based methods." 463 }, 464 { 465 "title": "Sequencer: Sequence-to-sequence learning for end-to-end program repair", 466 "authors": ["Z. Chen", "S. Kommrusch", "M. Tufano", "L. Pouchet", "D. Poshyvanyk", "M. Monperrus"], 467 "year": 2021, 468 "relevance": "End-to-end Seq2Seq program repair with copy mechanism; used as the secondary model in REPEATNPR's filter mechanism." 469 }, 470 { 471 "title": "Less training, more repairing please: Revisiting automated program repair via zero-shot learning", 472 "authors": ["C. S. Xia", "L. Zhang"], 473 "year": 2022, 474 "relevance": "Explores zero-shot LLM-based program repair, contrasting with the fine-tuning approach taken by REPEATNPR." 475 }, 476 { 477 "title": "Standup4NPR: Standardizing setup for empirically comparing neural program repair systems", 478 "authors": ["W. Zhong", "H. Ge", "H. Ai", "C. Li", "K. Liu", "J. Ge", "B. Luo"], 479 "year": 2022, 480 "relevance": "Standardized evaluation framework for comparing NPR systems; addresses the reproducibility crisis in NPR research." 481 }, 482 { 483 "title": "Patching as translation: the data and the metaphor", 484 "authors": ["Y. Ding", "B. Ray", "P. T. Devanbu", "V. J. Hellendoorn"], 485 "year": 2020, 486 "relevance": "Edit-based neural program repair model performing token-level operations; alternative to sequence generation approaches." 487 }, 488 { 489 "title": "A survey of learning-based automated program repair", 490 "authors": ["Q. Zhang", "C. Fang", "Y. Ma", "W. Sun", "Z. Chen"], 491 "year": 2023, 492 "arxiv_id": "2301.03270", 493 "relevance": "Contemporary survey of learning-based program repair covering the landscape that REPEATNPR contributes to." 494 } 495 ], 496 "engagement_factors": { 497 "practical_relevance": { 498 "score": 1, 499 "justification": "Automated bug fixing is practically useful but no code is released, and the approach requires fine-tuning infrastructure with GPUs." 500 }, 501 "surprise_contrarian": { 502 "score": 0, 503 "justification": "Confirms expected trends: pre-training helps, more context helps, ensembles help. No surprising or contrarian findings." 504 }, 505 "fear_safety": { 506 "score": 0, 507 "justification": "No safety or security concerns raised; focuses on improving automated program repair." 508 }, 509 "drama_conflict": { 510 "score": 0, 511 "justification": "Standard incremental improvement paper with no controversial claims or conflict." 512 }, 513 "demo_ability": { 514 "score": 0, 515 "justification": "No code, demo, or tool released. Cannot be tried by practitioners." 516 }, 517 "brand_recognition": { 518 "score": 0, 519 "justification": "From Peking University, a respected institution but not a high-profile AI lab brand for general tech audiences." 520 } 521 } 522 }