scan.json (26294B)
1 { 2 "paper": { 3 "title": "Automated Test Case Repair Using Language Models", 4 "authors": [ 5 "Ahmadreza Saboor Yaraghi", 6 "Darren Holden", 7 "Nafiseh Kahani", 8 "Lionel Briand" 9 ], 10 "year": 2024, 11 "venue": "IEEE Transactions on Software Engineering", 12 "arxiv_id": "2401.06765", 13 "doi": "10.1109/TSE.2025.3541166" 14 }, 15 "checklist": { 16 "artifacts": { 17 "code_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The paper provides a GitHub link for TARGET (https://github.com/Ahmadreza-SY/TaRGet) in footnote 1 and states 'we provide the implementation of TARGET, TARBENCH, and the script for reproducing all experiments' (Section 5.1)." 21 }, 22 "data_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "TARBENCH is publicly available via Figshare at https://doi.org/10.6084/m9.figshare.25008893 (footnote 2). The benchmark comprising 45,373 broken test repairs is explicitly released." 26 }, 27 "environment_specified": { 28 "applies": true, 29 "answer": true, 30 "justification": "Section 4.9 specifies the implementation uses Hugging Face Transformers and Accelerate libraries, with detailed hyperparameters. Hardware is specified: two Nvidia Quadro RTX 6000 GPUs (24GB each), Intel Xeon Gold 6234 16-Core CPU, 187GB RAM. Maven 3.6.3 and JDK versions 1.8.0_192, 11.0.16_8, or 17.0.2 are stated in Section 4.2.2." 31 }, 32 "reproduction_instructions": { 33 "applies": true, 34 "answer": true, 35 "justification": "The paper states 'we provide the implementation of TARGET, TARBENCH, and the script for reproducing all experiments' (Section 5.1, Table 11 'Reproducibility' column). The replication package is referenced via the GitHub repository." 36 } 37 }, 38 "statistical_methodology": { 39 "confidence_intervals_or_error_bars": { 40 "applies": true, 41 "answer": false, 42 "justification": "Main results in Tables 5, 6, and 7 report only point estimates (e.g., 66.1% EM, 80.0% PR) without confidence intervals or error bars. 
No uncertainty quantification is provided for the primary evaluation metrics." 43 }, 44 "significance_tests": { 45 "applies": true, 46 "answer": true, 47 "justification": "In RQ3.2, a paired t-test is used to compare Project-agnostic and Best models (p-value = 0.034), preceded by a Shapiro-Wilk normality test (p-values 0.16 and 0.25). This is appropriate for the comparative claim made." 48 }, 49 "effect_sizes_reported": { 50 "applies": true, 51 "answer": true, 52 "justification": "In RQ3.2, Cohen's d = 0.513 is reported as the effect size, described as 'moderate range, leaning closer to a small effect size.' Percentage point differences are consistently reported throughout (e.g., 37.4 EM pp improvement over NOCONTEXT)." 53 }, 54 "sample_size_justified": { 55 "applies": true, 56 "answer": false, 57 "justification": "No power analysis or explicit justification for the sample sizes is provided. The benchmark size (45,373 instances) is argued to be larger than prior work but there is no formal analysis of whether this is sufficient for the claims made." 58 }, 59 "variance_reported": { 60 "applies": true, 61 "answer": false, 62 "justification": "The main experimental results (Tables 5, 6, 7) are from single fine-tuning runs. No standard deviation or variance across multiple runs with different random seeds is reported. The only standard deviations mentioned are for AST action counts in RQ3.2 analysis, not for model performance." 63 } 64 }, 65 "evaluation_design": { 66 "baselines_included": { 67 "applies": true, 68 "answer": true, 69 "justification": "RQ1.2 compares TARGET against three baselines: CEPROT (a recent state-of-the-art method), SUTCOPY (a simple copy-based baseline), and NOCONTEXT (CLM without repair context). Results are reported in Tables 6 and 7." 70 }, 71 "baselines_contemporary": { 72 "applies": true, 73 "answer": true, 74 "justification": "CEPROT is from ASE 2023, which is contemporary. 
The paper also justifies why other prior works (ReAssert, TRIP, etc.) could not be compared against due to reproducibility issues (Section 5.1)." 75 }, 76 "ablation_study": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper systematically evaluates four IO formats (IO1-IO4) across three models (Table 5), effectively ablating the repair context selection, hunk prioritization, hunk representation, and output format. RQ3.1 ablates training data size. The trivial repair inclusion experiment (Section 4.3.3) also serves as an ablation." 80 }, 81 "multiple_metrics": { 82 "applies": true, 83 "answer": true, 84 "justification": "Four evaluation metrics are used: Exact Match Accuracy (EM), Plausible Repair Accuracy (PR), BLEU, and CodeBLEU (Section 4.1). All four are reported in Tables 5 and 7." 85 }, 86 "human_evaluation": { 87 "applies": true, 88 "answer": true, 89 "justification": "Section 4.5.3 describes a qualitative analysis where the authors 'manually analyzed many examples from different repair categories and projects, and selected eight representative and insightful cases.' Section 4.4.3 includes manual review of five comparisons between TARGET and CEPROT. These constitute human evaluation of system outputs." 90 }, 91 "held_out_test_set": { 92 "applies": true, 93 "answer": true, 94 "justification": "Section 4.2.5 describes temporal splitting: 'for each project, we kept older commits in the training set and included newer commits in both the validation and test sets.' The splits are 80% train, 5% validation, 15% test (36,639 / 1,631 / 7,103 instances)." 95 }, 96 "per_category_breakdown": { 97 "applies": true, 98 "answer": true, 99 "justification": "RQ2.1 provides detailed per-category breakdowns across repair categories (ARG, ORC, INV, etc.) and AST action counts. Figure 9 shows heatmaps of EM and PR across repair categories and complexity levels. Table 8 provides distribution across categories." 
100 }, 101 "failure_cases_discussed": { 102 "applies": true, 103 "answer": true, 104 "justification": "Section 4.5.3 and Appendix A present four detailed failure examples with analysis of why TARGET failed, including missing contextual information and complexity of multi-step repairs." 105 }, 106 "negative_results_reported": { 107 "applies": true, 108 "answer": true, 109 "justification": "IO4 (Edit Sequence Output) is reported as performing worst across all models and metrics. The paper states 'IO4 reaching the lowest performance in all instances' (Section 4.3.3) and recommends against its use. Performance degradation with complex repairs is also reported." 110 } 111 }, 112 "claims_and_evidence": { 113 "abstract_claims_supported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The abstract claims 66.1% EM accuracy (supported by Table 5), effectiveness across different repair scenarios (supported by RQ2.1), practical prediction guide (supported by RQ2.2), and generalizability to new projects (supported by RQ3.2). All claims are substantiated in the results sections." 117 }, 118 "causal_claims_justified": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper's causal claims are primarily through ablation-style comparisons: TARGET vs. NOCONTEXT demonstrates the effect of repair context (controlled single-variable manipulation). IO format comparisons (Table 5) isolate specific design choices. The data size experiment (RQ3.1) systematically varies one factor." 122 }, 123 "generalization_bounded": { 124 "applies": true, 125 "answer": false, 126 "justification": "The paper tests only on Java/JUnit projects but makes broad claims like 'our primary contributions... are not dependent on any specific programming language and can therefore be widely adopted' (Section 4.9). While they acknowledge the Java focus, the title 'Automated Test Case Repair Using Language Models' implies broader applicability than what was tested." 
127 }, 128 "alternative_explanations_discussed": { 129 "applies": true, 130 "answer": true, 131 "justification": "Section 4.10 discusses threats to validity, including the PR metric limitation where removing assertions could count as plausible repairs (with quantitative analysis showing 5.3 pp worst-case impact). The paper considers that PLB vs. CG performance differences may be due to 'pre-training datasets and tasks' not just model size (Section 4.3.3)." 132 } 133 }, 134 "setup_transparency": { 135 "model_versions_specified": { 136 "applies": true, 137 "answer": true, 138 "justification": "Specific model versions with parameter counts are stated: CodeT5+ 770M, PLBART Base 140M, and CodeGen Multi 350M (Section 4.3.1). These are specific, identifiable open-source models with known architectures." 139 }, 140 "prompts_provided": { 141 "applies": false, 142 "answer": false, 143 "justification": "This paper fine-tunes language models rather than using prompting. The input format is defined programmatically (Section 3.2) rather than through natural language prompts." 144 }, 145 "hyperparameters_reported": { 146 "applies": true, 147 "answer": true, 148 "justification": "Section 4.9 reports: 4 epochs with early stopping, beam sizes (200 for PLB, 40 for CT5+ and CG), batch sizes (8 for PLB, 1 for CT5+, 2 for CG), Adam optimizer with weight decay (AdamW), learning rate 1e-5, cosine learning rate scheduler. Max input/output lengths of 512/256 tokens stated in Section 4.2.3." 149 }, 150 "scaffolding_described": { 151 "applies": false, 152 "answer": false, 153 "justification": "No agentic scaffolding is used. TARGET is a fine-tuned model pipeline, not an agentic system with tool use, retry logic, or feedback mechanisms." 
154 }, 155 "data_preprocessing_documented": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 4.2.3 documents preprocessing in detail with counts at each stage: duplicates (1,039 excluded), empty repair context (3,748 excluded), absence of breakage location (1,274 excluded), max length exceeded (5,383 excluded). Section 4.2.4 documents quality checks with trivial repair removal (2,046 from eval, 7,196 kept in training) and test refactoring exclusion (6,302). Figure 7 provides a visual overview." 159 } 160 }, 161 "limitations_and_scope": { 162 "limitations_section_present": { 163 "applies": true, 164 "answer": true, 165 "justification": "Section 4.10 'Threats to Validity' provides a dedicated discussion of external and internal validity threats, spanning approximately two pages of substantive discussion." 166 }, 167 "threats_to_validity_specific": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 4.10 discusses specific threats: (1) possibly simple test repairs and mitigation via trivial repair exclusion, (2) PR metric limitation with quantitative analysis showing 5.3 pp worst-case impact from 377 potentially misclassified assertion cases, (3) Java-only focus. These are specific to this study, not generic disclaimers." 171 }, 172 "scope_boundaries_stated": { 173 "applies": true, 174 "answer": true, 175 "justification": "The paper explicitly states several scope boundaries: single-hunk repairs only (Section 4.2.1), Java/JUnit focus (Section 4.9), 512 token input limit (Section 4.2.3), breakage location assumed known (Section 3), and exclusion of complete file additions/removals from repair context (Section 4.2.2)." 176 } 177 }, 178 "data_integrity": { 179 "raw_data_available": { 180 "applies": true, 181 "answer": true, 182 "justification": "TARBENCH is publicly available via Figshare (https://doi.org/10.6084/m9.figshare.25008893). 
The data is traceable to specific GitHub commits (project + commit hash pairs are included), enabling independent verification." 183 }, 184 "data_collection_described": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 4.2 describes the complete data collection procedure in extensive detail: starting from 4,662 CodeSearchNet Java projects, filtering to 1,257 Maven projects with 50+ stars, analyzing commits for test method changes, three-step validation execution, with counts at each filtering stage (Figure 7)." 188 }, 189 "recruitment_methods_described": { 190 "applies": false, 191 "answer": false, 192 "justification": "No human participants were recruited. The data consists of open-source software project commits, and the project selection process is documented in Section 4.2.1." 193 }, 194 "data_pipeline_documented": { 195 "applies": true, 196 "answer": true, 197 "justification": "Figure 7 provides a visual pipeline from 1,257 projects through subject selection (198,210 tests from 116 projects), test repair collection (65,165 from 59 projects), preprocessing and filtering steps with counts, to final splits (36,639 train / 1,631 validation / 7,103 test). Each filtering criterion and the number of excluded instances are documented." 198 } 199 }, 200 "conflicts_of_interest": { 201 "funding_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "The Acknowledgement section lists funding: 'This work was supported by a research grant from Huawei Technologies Canada, Mitacs Canada, as well as the Canada Research Chair and Discovery Grant programs of NSERC and the Research Ireland grant 13/RC/2094-2.'" 205 }, 206 "affiliations_disclosed": { 207 "applies": true, 208 "answer": true, 209 "justification": "Author affiliations are clearly stated: University of Ottawa and Lero/University of Limerick (Saboor Yaraghi, Briand), Carleton University (Holden, Kahani). 
No authors are affiliated with the companies whose products are evaluated (the CLMs are open-source academic models)." 210 }, 211 "funder_independent_of_outcome": { 212 "applies": true, 213 "answer": true, 214 "justification": "The funders include Huawei Technologies Canada, Mitacs, NSERC, and Research Ireland. The paper evaluates open-source academic models (CodeT5+, PLBART, CodeGen), none of which are Huawei products. Huawei has no direct financial stake in the performance comparison between these specific models." 215 }, 216 "financial_interests_declared": { 217 "applies": true, 218 "answer": false, 219 "justification": "No competing interests or financial interests statement is present in the paper. While the paper has an acknowledgements section listing funding, there is no explicit declaration about patents, equity, or other financial interests." 220 } 221 }, 222 "contamination": { 223 "training_cutoff_stated": { 224 "applies": true, 225 "answer": false, 226 "justification": "The paper uses pre-trained models (CodeT5+, PLBART, CodeGen) but does not state the training data cutoff dates for any of them. Since these models are fine-tuned on TARBENCH data drawn from GitHub repositories, there is a risk that the pre-training data overlapped with the benchmark projects." 227 }, 228 "train_test_overlap_discussed": { 229 "applies": true, 230 "answer": false, 231 "justification": "No discussion of whether the pre-trained models may have seen the test repair code during pre-training. The temporal split (older commits for training, newer for testing) addresses the fine-tuning data split but not the pre-training data overlap. The APR section (5.2) briefly mentions that 'data leakage is a point of concern' for LM-based techniques per Jiang et al. [46], but the authors do not address this for their own work." 
232 }, 233 "benchmark_contamination_addressed": { 234 "applies": true, 235 "answer": false, 236 "justification": "TARBENCH is constructed from GitHub open-source projects. The CLMs (CodeT5+, PLBART, CodeGen) were pre-trained on code from GitHub and similar sources. The paper does not discuss whether TARBENCH projects' code appeared in the pre-training data of these models, despite this being a recognized concern in the related work section." 237 } 238 }, 239 "human_studies": { 240 "pre_registered": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants in the study. The evaluation is entirely automated using benchmark data from open-source repositories." 244 }, 245 "irb_or_ethics_approval": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants. The study uses publicly available open-source code from GitHub." 249 }, 250 "demographics_reported": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in the study." 254 }, 255 "inclusion_exclusion_criteria": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in the study." 259 }, 260 "randomization_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in the study." 264 }, 265 "blinding_described": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in the study." 269 }, 270 "attrition_reported": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in the study." 274 } 275 }, 276 "cost_and_practicality": { 277 "inference_cost_reported": { 278 "applies": true, 279 "answer": false, 280 "justification": "No inference cost, latency, or per-example time is reported. The paper mentions beam sizes and hardware but does not quantify the wall-clock time or cost per repair." 
281 }, 282 "compute_budget_stated": { 283 "applies": true, 284 "answer": false, 285 "justification": "While the hardware is described (two Nvidia Quadro RTX 6000 GPUs, Section 4.9), the total GPU hours, training time, or computational budget is not quantified. Given that 12 model configurations were fine-tuned on 36k instances each, this represents significant compute that is not reported." 286 } 287 } 288 }, 289 "claims": [ 290 { 291 "claim": "TARGET achieves 66.1% exact match accuracy and 80% plausible repair accuracy using CodeT5+ 770M with IO2 format.", 292 "evidence": "Table 5 (Section 4.3.3) shows CT5+ with IO2 achieves 66.1 EM and 80.0 PR, which are the best results across all model-IO combinations tested.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "TARGET outperforms the baseline (NOCONTEXT) by 37.4 EM percentage points.", 297 "evidence": "Table 7 (Section 4.4.3) shows TARGET at 66.1 EM vs. NOCONTEXT at 28.7 EM, a difference of 37.4 pp. All four metrics show consistent improvement.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "TARGET significantly outperforms CEPROT, achieving 40.6% EM vs. CEPROT's 21% EM on CEPROT's benchmark.", 302 "evidence": "Table 6 (Section 4.4.3) reports TARGET at 40.6 EM and 91.1 CodeBLEU vs. CEPROT at 21 EM and 60.4 CodeBLEU on 214 compatible instances from CEPROT's evaluation set.", 303 "supported": "moderate" 304 }, 305 { 306 "claim": "PLBART 140M performs comparably to CodeT5+ 770M despite having far fewer parameters.", 307 "evidence": "Table 5 shows PLB (IO2) at 57.9 EM vs. CT5+ at 66.1 EM, and PLB (IO3) at 59.1 EM. The PR gap is smaller: PLB at 79.2 vs. CT5+ at 80.0. 
The paper states losses of '8.2, 0.8, 1.4, and 0.8 pp' across metrics (Section 4.3.3).", 308 "supported": "moderate" 309 }, 310 { 311 "claim": "TARGET can generalize to unseen projects with an average EM loss of only 4.87 percentage points.", 312 "evidence": "Table 10 (RQ3.2) shows 10-fold results where project-agnostic models average 59.1% EM vs. best model varying per fold. A paired t-test shows that the difference between the project-agnostic and best models is statistically significant (p=0.034), and Cohen's d=0.513 indicates a moderate effect size.", 313 "supported": "strong" 314 }, 315 { 316 "claim": "A Random Forest model can predict whether TARGET will successfully repair a test with 88% F1 for exact match and 92% F1 for plausible repair.", 317 "evidence": "Table 9 (Section 4.6.2) reports precision/recall/F1 of 87/88/88 for exact match and 90/94/92 for plausible, using 5-fold cross validation on the test set.", 318 "supported": "moderate" 319 } 320 ], 321 "methodology_tags": [ 322 "benchmark-eval" 323 ], 324 "key_findings": "TARGET, a language model-based approach for automated test case repair, achieves 66.1% exact match accuracy and 80% plausible repair accuracy on TARBENCH, a new benchmark of 45,373 broken test repairs from 59 Java projects. Fine-tuning CodeT5+ 770M with text-similarity-based hunk ordering outperforms both CEPROT and no-context baselines by large margins. The approach generalizes to unseen projects with only moderate performance loss (4.87 pp EM), and a Random Forest model can predict repair reliability with high accuracy.", 325 "red_flags": [ 326 { 327 "flag": "No variance across runs", 328 "detail": "All main results appear to be from single fine-tuning runs without reporting variance across different random seeds. Deep learning results can vary significantly with initialization, and the absence of multi-run statistics makes it hard to assess result stability." 
329 }, 330 { 331 "flag": "Pre-training contamination not addressed", 332 "detail": "The CLMs (CodeT5+, PLBART, CodeGen) were pre-trained on GitHub code, and TARBENCH is derived from GitHub projects. The paper acknowledges this concern for APR work in Section 5.2 but does not address it for their own evaluation, despite the risk that pre-trained models may have memorized code patterns from the benchmark projects." 333 }, 334 { 335 "flag": "Selective baseline comparison with CEPROT", 336 "detail": "The comparison with CEPROT uses only 214 out of 520 evaluation instances (41%) from CEPROT's test set. Multiple categories of instances were excluded (multi-hunk, no source changes, method name changes, invalid cases), which could introduce selection bias favoring TARGET's design constraints." 337 }, 338 { 339 "flag": "No training or inference time reported", 340 "detail": "Despite fine-tuning 12+ model configurations on 36k instances with beam sizes up to 200, no training time, inference time, or total compute budget is reported. This makes it impossible to assess practical feasibility for adoption." 341 } 342 ], 343 "cited_papers": [ 344 { 345 "title": "Large language models for software engineering: A systematic literature review", 346 "authors": ["X. Hou"], 347 "year": 2023, 348 "relevance": "Comprehensive survey of LLM applications in software engineering, covering 50+ models and their SE task performance." 349 }, 350 { 351 "title": "Evaluating large language models trained on code", 352 "authors": ["M. Chen"], 353 "year": 2021, 354 "arxiv_id": "2107.03374", 355 "relevance": "Introduces HumanEval benchmark and Codex, foundational work for evaluating code generation LLMs." 356 }, 357 { 358 "title": "CodeT5+: Open code large language models for code understanding and generation", 359 "authors": ["Y. Wang", "H. Le", "A. D. Gotmare", "N. D. Q. Bui", "J. Li", "S. C. H. 
Hoi"], 360 "year": 2023, 361 "relevance": "The best-performing model in this study (770M parameters); encoder-decoder architecture for code tasks." 362 }, 363 { 364 "title": "Impact of code language models on automated program repair", 365 "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"], 366 "year": 2023, 367 "arxiv_id": "2302.05020", 368 "relevance": "Comparative study of CLMs vs. DL-based APR techniques showing LMs can surpass traditional approaches, directly relevant to evaluating LLMs for code repair." 369 }, 370 { 371 "title": "Identify and update test cases when production code changes: A transformer-based approach", 372 "authors": ["X. Hu", "Z. Liu", "X. Xia", "Z. Liu", "T. Xu", "X. Yang"], 373 "year": 2023, 374 "relevance": "CEPROT baseline compared against in this paper; addresses automated detection and updating of obsolete tests using CodeT5." 375 }, 376 { 377 "title": "A survey of learning-based automated program repair", 378 "authors": ["Q. Zhang", "C. Fang", "Y. Ma", "W. Sun", "Z. Chen"], 379 "year": 2023, 380 "arxiv_id": "2301.03270", 381 "relevance": "Survey of learning-based APR approaches, closely related to the test repair task addressed in this paper." 382 }, 383 { 384 "title": "Unified pre-training for program understanding and generation", 385 "authors": ["W. Ahmad", "S. Chakraborty", "B. Ray", "K.-W. Chang"], 386 "year": 2021, 387 "relevance": "PLBART model used as one of the three CLMs in this study; pre-trained on Java and Python." 388 }, 389 { 390 "title": "CodeGen: An open large language model for code with multi-turn program synthesis", 391 "authors": ["E. Nijkamp"], 392 "year": 2023, 393 "relevance": "CodeGen model used as one of the three CLMs evaluated in this study; decoder-only architecture." 394 }, 395 { 396 "title": "Intent-preserving test repair", 397 "authors": ["X. Li", "M. d'Amorim", "A. 
Orso"], 398 "year": 2019, 399 "relevance": "TRIP: prior work on test repair using dynamic symbolic execution, evaluated on 91 tests. Key prior approach that could not be empirically compared against due to reproducibility issues." 400 }, 401 { 402 "title": "Iter: Iterative neural repair for multi-location patches", 403 "authors": ["H. Ye", "M. Monperrus"], 404 "year": 2024, 405 "relevance": "Iterative neural program repair technique supporting multi-hunk repairs; cited as promising future direction for extending TARGET." 406 }, 407 { 408 "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning", 409 "authors": ["C. S. Xia", "L. Zhang"], 410 "year": 2022, 411 "relevance": "AlphaRepair: zero-shot APR approach that frames program repair as code generation without fine-tuning on repair datasets." 412 } 413 ] 414 }