scan-v5.json (24151B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "DynaFix: Iterative Automated Program Repair Driven by Execution-Level Dynamic Information", 6 "authors": [ 7 "Zhilin Huang", 8 "Ling Xu", 9 "Chao Liu", 10 "Weifeng Sun", 11 "Xu Zhang", 12 "Yan Lei", 13 "Meng Yan", 14 "Hongyu Zhang" 15 ], 16 "year": 2025, 17 "venue": "arXiv.org", 18 "arxiv_id": "2512.24635", 19 "doi": "10.48550/arXiv.2512.24635" 20 }, 21 "checklist": { 22 "claims_and_evidence": { 23 "abstract_claims_supported": { 24 "applies": true, 25 "answer": true, 26 "justification": "All abstract claims — 186 bugs fixed, 10% improvement over SOTA, 38 unique fixes, at most 35 attempts, 70% search reduction — are directly supported by Table 1, Figure 4, and Figure 7.", 27 "source": "haiku" 28 }, 29 "causal_claims_justified": { 30 "applies": true, 31 "answer": true, 32 "justification": "Ablation study (RQ4, Table 3) isolates each component's contribution; RQ2 controls for base model strength by comparing pure GPT-4o (72 bugs) vs DynaFix with same GPT-4o (206 bugs), providing adequate causal support.", 33 "source": "haiku" 34 }, 35 "generalization_bounded": { 36 "applies": true, 37 "answer": true, 38 "justification": "Section 6 explicitly limits claims to Java programs, Defects4J benchmark, and a single LLM; title and conclusion do not overclaim beyond evaluated settings.", 39 "source": "haiku" 40 }, 41 "alternative_explanations_discussed": { 42 "applies": true, 43 "answer": true, 44 "justification": "RQ2 isolates framework contribution by holding model constant; Section 6 addresses LLM memorization of Defects4J as an alternative explanation with a Defects4J v3.0 experiment as mitigation.", 45 "source": "haiku" 46 }, 47 "proxy_outcome_distinction": { 48 "applies": true, 49 "answer": true, 50 "justification": "Paper explicitly distinguishes 'plausible patches' (passes tests) from 'correct patches' (semantically equivalent to developer fix verified by manual inspection); RQ1 uses correct patches only.", 51 "source": "haiku" 52 } 53 }, 54 "limitations_and_scope": { 55 "limitations_section_present": { 56 "applies": true, 57 "answer": true, 58 "justification": "Section 6 'Threats to Validity' is a dedicated section with internal and external validity subsections.", 59 "source": "haiku" 60 }, 61 "threats_to_validity_specific": { 62 "applies": true, 63 "answer": true, 64 "justification": "Specific threats include: manual patch evaluation subjectivity, LLM training overlap with Defects4J (mitigated with v3.0 experiment), reliance on published baseline results without re-running, Java-only scope, and single-LLM dependency.", 65 "source": "haiku" 66 }, 67 "scope_boundaries_stated": { 68 "applies": true, 69 "answer": true, 70 "justification": "Explicitly bounded to Java programs, Defects4J benchmark, and perfect fault localization settings; multi-language extension is named as future work.", 71 "source": "haiku" 72 } 73 }, 74 "conflicts_of_interest": { 75 "funding_disclosed": { 76 "applies": true, 77 "answer": false, 78 "justification": "No funding acknowledgment, grant numbers, or sponsor information appears anywhere in the paper text.", 79 "source": "haiku" 80 }, 81 "affiliations_disclosed": { 82 "applies": true, 83 "answer": true, 84 "justification": "All eight authors list Chongqing University affiliations with full contact email addresses in the author block.", 85 "source": "haiku" 86 }, 87 "funder_independent_of_outcome": { 88 "applies": false, 89 "answer": false, 90 "justification": "No funder is disclosed, so independence cannot be assessed.", 91 "source": "haiku" 92 }, 93 "financial_interests_declared": { 94 "applies": true, 95 "answer": false, 96 "justification": "No competing interests statement, patent disclosures, or financial interest declaration appears in the paper.", 97 "source": "haiku" 98 } 99 }, 100 "scope_and_framing": { 101 "key_terms_defined": { 102 "applies": true, 103 "answer": true, 104 "justification": "'Plausible patch' vs 'correct patch' (Section 3.3), 'execution-level dynamic information' (Section 1), 'maximum patch attempts per bug' (Section 5.3), and DynaFix/ByteTrace components are all explicitly defined.", 105 "source": "haiku" 106 }, 107 "intended_contribution_clear": { 108 "applies": true, 109 "answer": true, 110 "justification": "Three contributions are explicitly enumerated at end of Section 1: the DynaFix framework, the ByteTrace tool, and SOTA results on Defects4J.", 111 "source": "haiku" 112 }, 113 "engagement_with_prior_work": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 7 explicitly positions DynaFix against FitRepair, GIANTRepair, ChatRepair, RepairAgent, SelfAPR, TraceFixer, and Self-Debug, explaining how DynaFix extends or differs from each.", 117 "source": "haiku" 118 } 119 } 120 }, 121 "type_checklist": { 122 "empirical": { 123 "artifacts": { 124 "code_released": { 125 "applies": true, 126 "answer": false, 127 "justification": "Paper states replication package 'will be made publicly available upon acceptance' — a conditional future promise; no link or current release is provided.", 128 "source": "haiku" 129 }, 130 "data_released": { 131 "applies": true, 132 "answer": true, 133 "justification": "Defects4J v1.2 and v2.0 are publicly available standard benchmarks used unmodified; no new dataset requiring release was created.", 134 "source": "haiku" 135 }, 136 "environment_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "Only mentions 'ByteTrace in Java' and 'core repair logic in Python' and 'OpenAI API'; no requirements.txt, Dockerfile, Java/Python version, or dependency specifications are provided.", 140 "source": "haiku" 141 }, 142 "reproduction_instructions": { 143 "applies": true, 144 "answer": false, 145 "justification": "No step-by-step reproduction instructions appear in the paper; replication package is not yet publicly available.", 146 "source": "haiku" 147 } 148 }, 149 "statistical_methodology": { 150 "confidence_intervals_or_error_bars": { 151 "applies": true, 152 "answer": false, 153 "justification": "Tables 1-3 and all figures report raw counts and percentages only; no confidence intervals or error bars appear for any result.", 154 "source": "haiku" 155 }, 156 "significance_tests": { 157 "applies": true, 158 "answer": false, 159 "justification": "All comparisons use raw bug counts and percentage differences; no statistical significance tests (t-test, Wilcoxon, etc.) are performed or mentioned.", 160 "source": "haiku" 161 }, 162 "effect_sizes_reported": { 163 "applies": true, 164 "answer": true, 165 "justification": "Percentage improvements over each baseline are consistently reported (e.g., +26.5% over RepairAgent, +43.1% over FitRepair) with baseline context values.", 166 "source": "haiku" 167 }, 168 "sample_size_justified": { 169 "applies": true, 170 "answer": false, 171 "justification": "The 483 single-function bug subset is adopted from standard prior practice without sample size justification or power analysis.", 172 "source": "haiku" 173 }, 174 "variance_reported": { 175 "applies": true, 176 "answer": false, 177 "justification": "Temperature is set to 1.0 (stochastic) but no variance, standard deviation, or multi-run statistics are reported; experiments appear to be single runs.", 178 "source": "haiku" 179 } 180 }, 181 "evaluation_design": { 182 "baselines_included": { 183 "applies": true, 184 "answer": true, 185 "justification": "11 SOTA baselines across 4 paradigms (LLM-based, deep learning, template-based, agent-based) are compared in RQ1.", 186 "source": "haiku" 187 }, 188 "baselines_contemporary": { 189 "applies": true, 190 "answer": true, 191 "justification": "Baselines span 2019-2025 with most recent being RepairAgent (2025) and GIANTREPAIR (2025); coverage is competitive and current.", 192 "source": "haiku" 193 }, 194 "ablation_study": { 195 "applies": true, 196 "answer": true, 197 "justification": "RQ4 (Table 3) ablates each component: w/o Local Variables, w/o Control Flow, w/o Method Call, w/o LPR, and Pure LLM baseline.", 198 "source": "haiku" 199 }, 200 "multiple_metrics": { 201 "applies": true, 202 "answer": true, 203 "justification": "Metrics include: correct patches, plausible patches, repair rate, unique fixes, and maximum patch attempts per bug (efficiency proxy).", 204 "source": "haiku" 205 }, 206 "human_evaluation": { 207 "applies": true, 208 "answer": true, 209 "justification": "Section 3.3 explicitly states manual inspection of test-passing patches to verify semantic equivalence to developer fix; RQ1 results are based on these manually verified correct patches.", 210 "source": "haiku" 211 }, 212 "held_out_test_set": { 213 "applies": true, 214 "answer": true, 215 "justification": "Defects4J provides bug-specific test suites used to validate patches; these serve as the held-out evaluation mechanism for all experiments.", 216 "source": "haiku" 217 }, 218 "per_category_breakdown": { 219 "applies": true, 220 "answer": true, 221 "justification": "Table 1 breaks results down by project (Chart, Closure, Lang, Math, Time, Mockito) and by dataset version (v1.2 vs v2.0).", 222 "source": "haiku" 223 }, 224 "failure_cases_discussed": { 225 "applies": true, 226 "answer": false, 227 "justification": "297/483 bugs remain unfixed and multi-function difficulty is noted, but no specific failure case examples or root-cause analysis of why DynaFix fails on particular bug types are provided.", 228 "source": "haiku" 229 }, 230 "negative_results_reported": { 231 "applies": true, 232 "answer": true, 233 "justification": "RQ2 shows execution-level information alone underperforms exception messages on multi-function bugs; RQ3 documents diminishing returns beyond breadth=7 or depth=5.", 234 "source": "haiku" 235 } 236 }, 237 "setup_transparency": { 238 "model_versions_specified": { 239 "applies": true, 240 "answer": false, 241 "justification": "Only 'GPT-4o' is specified; no model snapshot date (e.g., gpt-4o-2024-11-20) is provided, making exact replication impossible as OpenAI updates the model.", 242 "source": "haiku" 243 }, 244 "prompts_provided": { 245 "applies": true, 246 "answer": false, 247 "justification": "Figure 3 shows prompt structure schematically but the caption explicitly states 'code details are omitted'; actual prompt text, system instructions, and one-shot examples are not provided.", 248 "source": "haiku" 249 }, 250 "hyperparameters_reported": { 251 "applies": true, 252 "answer": true, 253 "justification": "Temperature=1.0 and LPR configuration (breadth=7, depth=5, max 35 total attempts, 30-minute per-attempt limit) are all explicitly reported.", 254 "source": "haiku" 255 }, 256 "scaffolding_described": { 257 "applies": true, 258 "answer": true, 259 "justification": "Sections 3.1-3.4 and Algorithm 1 describe the full workflow: ByteTrace instrumentation, structured prompt construction, automated patch validation, and LPR breadth-then-depth strategy in sufficient detail.", 260 "source": "haiku" 261 }, 262 "data_preprocessing_documented": { 263 "applies": true, 264 "answer": true, 265 "justification": "Section 4.2 describes bug subset selection (483 single-function from 830 total, 5 removed in latest update), v1.2/v2.0 split rationale, and use of perfect fault localization from Defects4J.", 266 "source": "haiku" 267 } 268 }, 269 "data_integrity": { 270 "raw_data_available": { 271 "applies": true, 272 "answer": false, 273 "justification": "Raw patch outputs and experimental results are not currently accessible; replication package is pending acceptance ('Link will be provided upon publication').", 274 "source": "haiku" 275 }, 276 "data_collection_described": { 277 "applies": true, 278 "answer": true, 279 "justification": "ByteTrace data collection mechanism is described in Section 3.1; bug selection from Defects4J and rationale for the 483-bug subset are described in Section 4.2.", 280 "source": "haiku" 281 }, 282 "recruitment_methods_described": { 283 "applies": false, 284 "answer": false, 285 "justification": "Standard benchmark study with no human participant recruitment.", 286 "source": "haiku" 287 }, 288 "data_pipeline_documented": { 289 "applies": true, 290 "answer": true, 291 "justification": "Full pipeline documented: bug selection → ByteTrace instrumentation → prompt construction → LLM invocation → patch validation → LPR iterative loop → manual correctness verification.", 292 "source": "haiku" 293 } 294 }, 295 "contamination": { 296 "training_cutoff_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "GPT-4o training data cutoff is never stated in the paper; only an API access date in the reference is provided.", 300 "source": "haiku" 301 }, 302 "train_test_overlap_discussed": { 303 "applies": true, 304 "answer": true, 305 "justification": "Section 6 explicitly discusses LLM training overlap with Defects4J open-source repositories, cites prior work [18] showing limited impact, and provides a Defects4J v3.0 experiment (9/24 bugs fixed) as empirical mitigation.", 306 "source": "haiku" 307 }, 308 "benchmark_contamination_addressed": { 309 "applies": true, 310 "answer": true, 311 "justification": "Contamination is addressed by arguing training corpora 'rarely contain complete bug-fix pairs' and by demonstrating generalization on Defects4J v3.0 bugs not present in prior benchmarks.", 312 "source": "haiku" 313 } 314 }, 315 "human_studies": { 316 "pre_registered": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human subjects study.", 320 "source": "haiku" 321 }, 322 "irb_or_ethics_approval": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "haiku" 327 }, 328 "demographics_reported": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "haiku" 333 }, 334 "inclusion_exclusion_criteria": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "haiku" 339 }, 340 "randomization_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants.", 344 "source": "haiku" 345 }, 346 "blinding_described": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "haiku" 351 }, 352 "attrition_reported": { 353 "applies": false, 354 "answer": false, 355 "justification": "No human participants.", 356 "source": "haiku" 357 } 358 }, 359 "cost_and_practicality": { 360 "inference_cost_reported": { 361 "applies": true, 362 "answer": false, 363 "justification": "'Maximum patch attempts per bug' is used as a cost proxy and token-based billing is acknowledged, but actual dollar costs or total API call counts are never reported.", 364 "source": "haiku" 365 }, 366 "compute_budget_stated": { 367 "applies": true, 368 "answer": false, 369 "justification": "No total API cost, token consumption, or wall-clock time for the full experimental evaluation is reported.", 370 "source": "haiku" 371 } 372 } 373 } 374 }, 375 "claims": [ 376 { 377 "claim": "DynaFix repairs 186 single-function bugs on Defects4J, outperforming all 11 SOTA baselines including GIANTREPAIR (169 bugs).", 378 "evidence": "Table 1 shows DynaFix 186 total vs GIANTREPAIR 169, RepairAgent 147, FitRepair 130 across 483 single-function bugs.", 379 "supported": "strong" 380 }, 381 { 382 "claim": "DynaFix achieves 38 unique bug fixes not resolved by any of the 11 baselines.", 383 "evidence": "Figure 4(b) shows 38 uniquely repaired bugs by DynaFix in the complementarity analysis across all 483 bugs.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Iterative use of execution-level information is critical — execution-level info alone achieves only 24.2% repair rate vs DynaFix's 42.6% with iteration.", 388 "evidence": "Table 2 on full Defects4J v2.0: Pure LLM 14.9%, Exception 18.6%, Execution-Level 24.2%, DynaFix (iterative) 42.6%.", 389 "supported": "strong" 390 }, 391 { 392 "claim": "DynaFix reduces maximum patch attempts by over 70% compared to the most efficient baseline (35 vs RepairAgent's 117).", 393 "evidence": "Figure 7 shows DynaFix at 35 max attempts vs RepairAgent at 117; other baselines range from 250 to 5,000.", 394 "supported": "strong" 395 }, 396 { 397 "claim": "The LPR strategy is the most impactful component, contributing 21.9 percentage points to repair rate.", 398 "evidence": "Table 3 ablation on 255 v1.2 bugs: Default 43.5% vs w/o LPR 21.6%, the largest single-component drop.", 399 "supported": "strong" 400 }, 401 { 402 "claim": "DynaFix with a single LLM outperforms GIANTREPAIR which aggregates four LLM models.", 403 "evidence": "Table 1 shows DynaFix (186) > GIANTREPAIR (169); paper notes GIANTREPAIR aggregates four models while DynaFix uses one.", 404 "supported": "moderate" 405 } 406 ], 407 "methodology_tags": [ 408 "benchmark-eval" 409 ], 410 "key_findings": "DynaFix integrates fine-grained execution-level dynamic information (variable states, control-flow paths, call stacks via the ByteTrace tool) into an iterative LLM-based APR workflow, achieving SOTA performance on Defects4J v1.2+v2.0 with 186 single-function bugs repaired including 38 previously unresolved by any baseline. The iterative mechanism is the dominant contributor (21.9pp in ablation), demonstrating that execution-level information alone is insufficient and must be combined with iteration to realize its value. DynaFix requires at most 35 patch attempts per bug — over 70% fewer than the most efficient baseline — showing that precise dynamic guidance dramatically reduces search overhead. Results are limited to Java under perfect fault localization and use an unpinned GPT-4o model without variance reporting.", 411 "red_flags": [ 412 { 413 "flag": "Model version not pinned", 414 "detail": "'GPT-4o' specified without a snapshot date; OpenAI updates this model silently, making exact replication impossible." 415 }, 416 { 417 "flag": "No statistical testing", 418 "detail": "All comparisons use raw counts and percentage differences; no significance tests are run despite stochastic generation at temperature=1.0." 419 }, 420 { 421 "flag": "No variance across runs", 422 "detail": "With temperature=1.0, results will differ across runs, but no standard deviation or multi-run reporting is provided; experiments appear to be single runs." 423 }, 424 { 425 "flag": "Code not yet released", 426 "detail": "Replication package promised 'upon acceptance'; paper cannot currently be reproduced independently." 427 }, 428 { 429 "flag": "Baselines not re-run", 430 "detail": "11 baselines are compared using their published results, which may use different Defects4J subsets, fault localization tools, or LLM configurations." 431 }, 432 { 433 "flag": "Perfect fault localization only", 434 "detail": "All experiments assume oracle bug location; real-world performance with automated fault localization is not evaluated." 435 }, 436 { 437 "flag": "Unresolved internal note in manuscript", 438 "detail": "Section 3.1 contains a stray editorial note ('please say that which experimental result approves the balance') left in the paper text, indicating the manuscript was submitted before completing revisions." 439 } 440 ], 441 "cited_papers": [ 442 { 443 "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair", 444 "relevance": "Key baseline and closest agentic APR approach; uses dynamic prompts and state machine for iterative repair." 445 }, 446 { 447 "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using chatgpt (ChatRepair)", 448 "relevance": "Key baseline: dialogue-driven iterative APR using test failure feedback; most directly comparable iterative method." 449 }, 450 { 451 "title": "The plastic surgery hypothesis in the era of large language models (FitRepair)", 452 "relevance": "Key baseline: LLM APR with patch-knowledge and repair-oriented fine-tuning." 453 }, 454 { 455 "title": "Hybrid Automated Program Repair by Combining Large Language Models and Program Analysis (GIANTREPAIR)", 456 "relevance": "Strongest baseline: aggregates four LLM models with patch skeleton extraction; DynaFix outperforms it with a single model." 457 }, 458 { 459 "title": "Tracefixer: Execution trace-driven program repair", 460 "relevance": "Closest prior work on execution traces for APR; uses traces during fine-tuning rather than iteratively at inference." 461 }, 462 { 463 "title": "Towards Effectively Leveraging Execution Traces for Program Repair with Code LLMs", 464 "relevance": "Closely related concurrent work analyzing execution trace utility for LLM-based APR." 465 }, 466 { 467 "title": "Teaching large language models to self-debug", 468 "relevance": "Related work on using code explanations and chain-of-thought for self-debugging in program repair." 469 }, 470 { 471 "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs", 472 "relevance": "Primary evaluation benchmark used for all experiments in this paper." 473 } 474 ], 475 "engagement_factors": { 476 "practical_relevance": { 477 "score": 3, 478 "justification": "DynaFix and ByteTrace are concrete implementable tools directly applicable to IDE-integrated APR; 186 real bugs fixed is tangible practitioner value." 479 }, 480 "surprise_contrarian": { 481 "score": 1, 482 "justification": "Confirms expected hypothesis that fine-grained iterative feedback improves repair; the negative finding (execution info alone doesn't help multi-function bugs) is mildly surprising." 483 }, 484 "fear_safety": { 485 "score": 0, 486 "justification": "No AI safety or risk concerns; purely a software engineering productivity tool." 487 }, 488 "drama_conflict": { 489 "score": 0, 490 "justification": "Standard empirical benchmark comparison with no controversy." 491 }, 492 "demo_ability": { 493 "score": 2, 494 "justification": "ByteTrace and DynaFix are described in enough detail to prototype; replication package forthcoming, but not yet available to try." 495 }, 496 "brand_recognition": { 497 "score": 0, 498 "justification": "All authors from Chongqing University; no famous lab, product, or industry affiliation." 499 } 500 }, 501 "hn_data": { 502 "threads": [], 503 "top_points": 0, 504 "total_points": 0, 505 "total_comments": 0 506 } 507 }