scan-v5.json (23545B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "KNOD: Domain Knowledge Distilled Tree Decoder for Automated Program Repair", 6 "authors": [ 7 "Nan Jiang", 8 "Thibaud Lutellier", 9 "Yiling Lou", 10 "Lin Tan", 11 "Dan Goldwasser", 12 "Xiangyu Zhang" 13 ], 14 "year": 2023, 15 "venue": "International Conference on Software Engineering", 16 "arxiv_id": "2302.01857", 17 "doi": "10.1109/ICSE48619.2023.00111" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "The abstract's claims of fixing 72 bugs on Defects4J v1.2, 25 on QuixBugs, and 50 on Defects4J v2.0 are all directly supported by Table III results.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "Ablation study (Table V) isolates the causal contribution of the three-stage tree decoder (+16 bugs over KNOD-decoder) and domain-rule distillation (+10 bugs for training phase), adequately supporting the causal framing.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "Section V explicitly states KNOD is evaluated on Java programs only and that multi-hunk bugs remain a limitation; generalization claims are bounded to the three benchmarks tested.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper does not discuss alternative explanations for KNOD's superiority, such as ensemble size advantages (5 vs fewer models for some baselines) or potential training data size differences.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper uses number of correctly fixed bugs (manually verified as semantically equivalent to developer patches) as the primary metric, which directly measures the claimed objective.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section V is a dedicated 'LIMITATION' section discussing multi-hunk bug failures and fault localization dependence.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "Section III.D names specific threats: implementation correctness mitigated by multi-author review, manual labeling with 92.1% inter-rater agreement, and benchmark coverage limited to three Java benchmarks.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper explicitly states KNOD cannot fix multi-hunk bugs well and results are limited to Java programs; future work on other languages is noted as out of current scope.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": true, 76 "justification": "Acknowledgment section discloses 'partially supported by a J.P. Morgan AI Faculty Research Award.'", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "All author affiliations are listed on the first page (Purdue University, University of Alberta, Fudan University); notes clarify institutions at time of the work.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": true, 88 "justification": "J.P. Morgan funds general academic research on program repair; the paper does not evaluate J.P. Morgan products or systems, so the funder is independent of experimental outcomes.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests or financial interests declaration is present; the acknowledgment only discloses funding source but not patents, equity, or consulting relationships.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "APR, AST, Abstract Syntax Graph, domain knowledge, and the three-stage decoder components are all defined or described with concrete examples (e.g., Figure 1 walkthrough of Closure-123).", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section I lists four explicit contributions: the three-stage tree decoder, domain-rule distillation, the KNOD system, and its evaluation on three benchmarks.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section VI provides substantial related work covering DL-based APR and code generation, explaining how KNOD differs architecturally from specific competing approaches like Recoder, CURE, and RewardRepair.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": true, 125 "justification": "Reference [65] and a 'Data Availability' statement link to a replication package at https://github.com/lin-tan/knod.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "Defects4J and QuixBugs are publicly available benchmarks used unmodified; the training data is sourced from a prior work's public dataset of GitHub Java patches.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "Hardware (RTX 2080 TI, 56-core server) and framework (PyTorch) are mentioned, but no version numbers, requirements file, or Dockerfile are provided.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper references a replication package but includes no step-by-step reproduction instructions in the paper text itself.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "Main results in Tables III-V report only single counts of correctly fixed bugs with no confidence intervals or error bars across runs.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "No statistical significance tests are used for any comparative claims; comparisons are made purely on raw bug counts.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "The paper reports specific numerical improvements ('8 and 19 more bugs than the best DL-based and non-DL-based APR techniques') and patch precision (86.7% vs 58.4-70.3% for competitors).", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "No power analysis or justification for using these specific benchmarks; the choice is justified by convention (widely-used), not statistical reasoning.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "No variance, standard deviation, or spread across multiple runs is reported; single results per configuration appear in all tables.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Eight baselines are included: SequenceR, SimFix, DLFix, CoCoNuT, RewardRepair, TBar, CURE, and Recoder — covering both DL-based and non-DL-based APR.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "Baselines include recent work from 2021-2022 (CURE, RewardRepair, Recoder), which were state-of-the-art at time of writing.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "Table V presents ablation with three variants (KNOD-decoder, KNOD-distTrain, KNOD-distInf) isolating the tree decoder and domain-rule distillation contributions in training vs. inference phases.", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "Evaluation uses number of correctly fixed bugs, patch precision (86.7%), compilation rate, and ranking of correct fixes across top-k candidate patches (Figure 6).", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": true, 206 "answer": true, 207 "justification": "Manual patch correctness labeling by two participants with 92.1% agreement ratio is used to verify plausible patches as semantically equivalent to developer patches.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "Training data explicitly excludes projects in or cloned from Defects4J; bug benchmarks serve as held-out test sets separate from the 576,002-pair training corpus.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Figure 4 provides Venn diagrams of uniquely and jointly fixed bugs per benchmark; Tables III/IV break down results per benchmark across two FL settings.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "Section V discusses multi-hunk bug failures and fault localization dependence; Section IV.A analyzes why KNOD underperforms Recoder on Defects4J v1.2 under spectrum-based FL.", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "Table IV honestly reports that KNOD fixes fewer bugs than Recoder on Defects4J v1.2 under spectrum-based fault localization (38 vs 45).", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": false, 238 "answer": false, 239 "justification": "KNOD is a custom-built model, not an off-the-shelf pre-trained model; hyperparameters are reported directly and version specificity is not applicable.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": false, 244 "answer": false, 245 "justification": "KNOD is a custom deep learning system, not an LLM-based system using prompts; this criterion is not applicable.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": true, 251 "justification": "Section III.C reports encoder layers (6-8), decoder layers (1-2 for parent/edge, 4-8 for node), embedding dimensions (256-384), dropout 0.1, Adam lr 2.5e-4, beam size 1000.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "KNOD is not an agentic system with scaffolding; there is no scaffolding component to describe.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Section II.B describes code normalization using src2abs, AST/ASG construction using javalang and JavaParser, identifier normalization, and buggy location sequence generation in detail.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": true, 271 "justification": "The replication package (reference [65]) is publicly available, and the bug benchmarks (Defects4J, QuixBugs) are well-known public datasets.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "Training data collection is described: mined from prior work's dataset of open-source GitHub Java projects, with Defects4J projects removed; 576,002 pairs, 90/10 split.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participant recruitment; evaluation uses standard public bug benchmarks.", 284 "source": "haiku" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "The full pipeline from raw buggy code through normalization, ASG construction, training/validation split, and patch validation is documented across Sections II and III.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No date is given for when the GitHub training data was mined; there is no stated cutoff for the training corpus relative to the bug benchmarks.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": true, 302 "answer": true, 303 "justification": "The paper explicitly states 'we remove projects that are in or cloned from Defects4J projects from our training set' to prevent training/test overlap.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": true, 308 "answer": false, 309 "justification": "While Defects4J projects are excluded from training, no discussion addresses whether QuixBugs or Defects4J v2.0 bug fixes might appear in the GitHub-mined training corpus.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants in this study; manual patch labeling is conducted by the authors, not external subjects.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants requiring IRB approval.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human experimental design requiring randomization.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": true, 361 "justification": "Section IV.A states 'KNOD spends 12.8s on average generating one thousand candidate patches for a given bug (using one NVIDIA RTX 2080 TI GPU)'.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "Hardware specs for training (8x RTX 2080 TI, 56-core server) are stated but no total training time, GPU-hours, or compute budget is reported.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "KNOD fixes 72 bugs on Defects4J v1.2 with perfect fault localization, outperforming all existing APR tools.", 376 "evidence": "Table III shows KNOD fixing 72 bugs versus next best Recoder at 64 (DL-based) and TBar at 53 (non-DL).", 377 "supported": "strong" 378 }, 379 { 380 "claim": "The three-stage tree decoder improves patch generation by fixing 16 more bugs than a sequential decoder baseline.", 381 "evidence": "Table V ablation: KNOD (72) vs KNOD-decoder (56) on Defects4J v1.2; compilation rate 47.0% vs 33.6%.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Domain-rule distillation during training is more effective than applying it only during inference.", 386 "evidence": "Table V: KNOD-distTrain (inference-only rules) fixes 62 bugs vs KNOD-distInf (training-only rules) at 69, confirming the training phase is more critical.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "KNOD achieves 86.7% patch precision, substantially higher than existing APR tools (DLFix 58.4%, TBar 62.4%, RewardRepair 70.3%).", 391 "evidence": "Reported in Section IV.A; comparison figures cited from [8] under same configuration.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "KNOD generalizes across benchmarks, fixing 50 bugs on Defects4J v2.0 and 25 on QuixBugs.", 396 "evidence": "Table III reports these figures; limited comparison as many baselines have no published results on these benchmarks.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "KNOD uniquely fixes 12 bugs on Defects4J v1.2 that no compared technique fixes, complementing existing tools.", 401 "evidence": "Figure 4(a) Venn diagram shows 12 bugs uniquely fixed by KNOD not fixed by TBar, CURE, or Recoder.", 402 "supported": "strong" 403 } 404 ], 405 "methodology_tags": [ 406 "benchmark-eval" 407 ], 408 "key_findings": "KNOD outperforms all prior automated program repair tools on Defects4J v1.2 by fixing 72 bugs with 86.7% patch precision, substantially higher than competitors. The ablation study demonstrates that both the three-stage tree decoder (generating ASTs directly rather than token sequences) and domain-rule distillation (injecting syntactic/semantic rules during training via teacher-student distributions) independently contribute to improvement, with the training-phase component being more impactful than inference-only domain knowledge application. The system also generalizes to Defects4J v2.0 and QuixBugs, though comparisons there are limited by fewer baselines reporting results.", 409 "red_flags": [ 410 { 411 "flag": "No statistical tests", 412 "detail": "All comparative claims are made on raw bug counts without significance tests, making it impossible to assess whether differences (e.g., 72 vs 64) are statistically meaningful given benchmark variance." 413 }, 414 { 415 "flag": "No variance across runs", 416 "detail": "Results are single-run point estimates; no standard deviation or confidence intervals are reported for any metric including ablation results." 417 }, 418 { 419 "flag": "Ensemble size confound", 420 "detail": "KNOD uses an ensemble of 5 models while some baselines use fewer (Recoder: 1) and others more (CURE: 10); the ranking comparison acknowledges but does not fully control for this confound." 421 }, 422 { 423 "flag": "Training data cutoff unknown", 424 "detail": "No date is given for when GitHub training data was mined; potential overlap between training data and QuixBugs or Defects4J v2.0 fix patterns is not addressed." 425 } 426 ], 427 "cited_papers": [ 428 { 429 "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair", 430 "relevance": "Direct predecessor by overlapping authors; KNOD builds on and outperforms CURE on the same benchmarks" 431 }, 432 { 433 "title": "CoCoNuT: Combining Context-Aware Neural Translation Models Using Ensemble for Program Repair", 434 "relevance": "Co-author's prior APR work sharing training data methodology; key baseline" 435 }, 436 { 437 "title": "A Syntax-Guided Edit Decoder for Neural Program Repair (Recoder)", 438 "relevance": "Main DL-based competitor; achieves competitive results on Defects4J v1.2 under spectrum-based FL" 439 }, 440 { 441 "title": "Neural Program Repair with Execution-Based Backpropagation (RewardRepair)", 442 "relevance": "Key baseline using dynamic domain knowledge (execution feedback), contrasted with KNOD's static rules" 443 }, 444 { 445 "title": "TBar: Revisiting Template-Based Automated Program Repair", 446 "relevance": "Best non-DL baseline representing template-based APR" 447 }, 448 { 449 "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs", 450 "relevance": "Primary evaluation benchmark used throughout; most widely cited APR benchmark" 451 }, 452 { 453 "title": "Harnessing Deep Neural Networks with Logic Rules", 454 "relevance": "Foundational teacher-student distribution technique that KNOD's domain-rule distillation is directly based on" 455 }, 456 { 457 "title": "Graph Transformer Networks", 458 "relevance": "Architecture for the graph-transformer encoder used in KNOD's encoding stage" 459 } 460 ], 461 "engagement_factors": { 462 "practical_relevance": { 463 "score": 2, 464 "justification": "APR tools directly help developers fix bugs and KNOD has an open-source release, but it is Java-only and requires significant GPU compute." 465 }, 466 "surprise_contrarian": { 467 "score": 1, 468 "justification": "The finding that training-phase domain rule injection is more critical than inference-phase filtering is a useful but incremental insight; the overall direction is expected." 469 }, 470 "fear_safety": { 471 "score": 0, 472 "justification": "No safety or AI risk implications; this is a software engineering productivity tool." 473 }, 474 "drama_conflict": { 475 "score": 0, 476 "justification": "Standard benchmark competition in the APR field; no controversy or conflict angle." 477 }, 478 "demo_ability": { 479 "score": 2, 480 "justification": "Replication package available at github.com/lin-tan/knod; practitioners can run KNOD on Java projects with the provided setup." 481 }, 482 "brand_recognition": { 483 "score": 1, 484 "justification": "Purdue University is a respected CS program with J.P. Morgan AI funding, but no top-tier industry lab affiliation." 485 } 486 }, 487 "hn_data": { 488 "threads": [], 489 "top_points": 0, 490 "total_points": 0, 491 "total_comments": 0 492 } 493 }