scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (24151B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DynaFix: Iterative Automated Program Repair Driven by Execution-Level Dynamic Information",
      6     "authors": [
      7       "Zhilin Huang",
      8       "Ling Xu",
      9       "Chao Liu",
     10       "Weifeng Sun",
     11       "Xu Zhang",
     12       "Yan Lei",
     13       "Meng Yan",
     14       "Hongyu Zhang"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2512.24635",
     19     "doi": "10.48550/arXiv.2512.24635"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "All abstract claims — 186 bugs fixed, 10% improvement over SOTA, 38 unique fixes, at most 35 attempts, 70% search reduction — are directly supported by Table 1, Figure 4, and Figure 7.",
     27         "source": "haiku"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The ablation study (RQ4, Table 3) isolates each component's contribution, and RQ2 controls for base-model strength by comparing pure GPT-4o (72 bugs) against DynaFix running on the same GPT-4o (206 bugs); together these provide adequate causal support.",
     33         "source": "haiku"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "Section 6 explicitly limits claims to Java programs, Defects4J benchmark, and a single LLM; title and conclusion do not overclaim beyond evaluated settings.",
     39         "source": "haiku"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "RQ2 isolates framework contribution by holding model constant; Section 6 addresses LLM memorization of Defects4J as an alternative explanation with a Defects4J v3.0 experiment as mitigation.",
     45         "source": "haiku"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Paper explicitly distinguishes 'plausible patches' (passes tests) from 'correct patches' (semantically equivalent to developer fix verified by manual inspection); RQ1 uses correct patches only.",
     51         "source": "haiku"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 6 'Threats to Validity' is a dedicated section with internal and external validity subsections.",
     59         "source": "haiku"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "Specific threats include: manual patch evaluation subjectivity, LLM training overlap with Defects4J (mitigated with v3.0 experiment), reliance on published baseline results without re-running, Java-only scope, and single-LLM dependency.",
     65         "source": "haiku"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "Explicitly bounded to Java programs, Defects4J benchmark, and perfect fault localization settings; multi-language extension is named as future work.",
     71         "source": "haiku"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No funding acknowledgment, grant numbers, or sponsor information appears anywhere in the paper text.",
     79         "source": "haiku"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "All eight authors list Chongqing University affiliations with full contact email addresses in the author block.",
     85         "source": "haiku"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "No funder is disclosed, so independence cannot be assessed.",
     91         "source": "haiku"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests statement, patent disclosures, or financial interest declaration appears in the paper.",
     97         "source": "haiku"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "'Plausible patch' vs 'correct patch' (Section 3.3), 'execution-level dynamic information' (Section 1), 'maximum patch attempts per bug' (Section 5.3), and DynaFix/ByteTrace components are all explicitly defined.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Three contributions are explicitly enumerated at end of Section 1: the DynaFix framework, the ByteTrace tool, and SOTA results on Defects4J.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 7 explicitly positions DynaFix against FitRepair, GIANTREPAIR, ChatRepair, RepairAgent, SelfAPR, TraceFixer, and Self-Debug, explaining how DynaFix extends or differs from each.",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "Paper states replication package 'will be made publicly available upon acceptance' — a conditional future promise; no link or current release is provided.",
    128           "source": "haiku"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "Defects4J v1.2 and v2.0 are publicly available standard benchmarks used unmodified; no new dataset requiring release was created.",
    134           "source": "haiku"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "The paper mentions only 'ByteTrace in Java', 'core repair logic in Python', and 'OpenAI API'; no requirements.txt, Dockerfile, Java/Python version, or dependency specifications are provided.",
    140           "source": "haiku"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No step-by-step reproduction instructions appear in the paper; replication package is not yet publicly available.",
    146           "source": "haiku"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "Tables 1-3 and all figures report raw counts and percentages only; no confidence intervals or error bars appear for any result.",
    154           "source": "haiku"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "All comparisons use raw bug counts and percentage differences; no statistical significance tests (t-test, Wilcoxon, etc.) are performed or mentioned.",
    160           "source": "haiku"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Percentage improvements over each baseline are consistently reported (e.g., +26.5% over RepairAgent, +43.1% over FitRepair) with baseline context values.",
    166           "source": "haiku"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "The 483 single-function bug subset is adopted from standard prior practice without sample size justification or power analysis.",
    172           "source": "haiku"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "Temperature is set to 1.0 (stochastic) but no variance, standard deviation, or multi-run statistics are reported; experiments appear to be single runs.",
    178           "source": "haiku"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "11 SOTA baselines across 4 paradigms (LLM-based, deep learning, template-based, agent-based) are compared in RQ1.",
    186           "source": "haiku"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Baselines span 2019-2025 with most recent being RepairAgent (2025) and GIANTREPAIR (2025); coverage is competitive and current.",
    192           "source": "haiku"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "RQ4 (Table 3) ablates each component: w/o Local Variables, w/o Control Flow, w/o Method Call, w/o LPR, and Pure LLM baseline.",
    198           "source": "haiku"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "Metrics include: correct patches, plausible patches, repair rate, unique fixes, and maximum patch attempts per bug (efficiency proxy).",
    204           "source": "haiku"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Section 3.3 explicitly states manual inspection of test-passing patches to verify semantic equivalence to developer fix; RQ1 results are based on these manually verified correct patches.",
    210           "source": "haiku"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": true,
    215           "justification": "Defects4J provides bug-specific test suites used to validate patches; these serve as the held-out evaluation mechanism for all experiments.",
    216           "source": "haiku"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Table 1 breaks results down by project (Chart, Closure, Lang, Math, Time, Mockito) and by dataset version (v1.2 vs v2.0).",
    222           "source": "haiku"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": false,
    227           "justification": "297/483 bugs remain unfixed and multi-function difficulty is noted, but no specific failure case examples or root-cause analysis of why DynaFix fails on particular bug types are provided.",
    228           "source": "haiku"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "RQ2 shows execution-level information alone underperforms exception messages on multi-function bugs; RQ3 documents diminishing returns beyond breadth=7 or depth=5.",
    234           "source": "haiku"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "Only 'GPT-4o' is specified; no model snapshot date (e.g., gpt-4o-2024-11-20) is provided, making exact replication impossible as OpenAI updates the model.",
    242           "source": "haiku"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "Figure 3 shows prompt structure schematically but the caption explicitly states 'code details are omitted'; actual prompt text, system instructions, and one-shot examples are not provided.",
    248           "source": "haiku"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Temperature=1.0 and LPR configuration (breadth=7, depth=5, max 35 total attempts, 30-minute per-attempt limit) are all explicitly reported.",
    254           "source": "haiku"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "Sections 3.1-3.4 and Algorithm 1 describe the full workflow: ByteTrace instrumentation, structured prompt construction, automated patch validation, and LPR breadth-then-depth strategy in sufficient detail.",
    260           "source": "haiku"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "Section 4.2 describes bug subset selection (483 single-function from 830 total, 5 removed in latest update), v1.2/v2.0 split rationale, and use of perfect fault localization from Defects4J.",
    266           "source": "haiku"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "Raw patch outputs and experimental results are not currently accessible; replication package is pending acceptance ('Link will be provided upon publication').",
    274           "source": "haiku"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "ByteTrace data collection mechanism is described in Section 3.1; bug selection from Defects4J and rationale for the 483-bug subset are described in Section 4.2.",
    280           "source": "haiku"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "Standard benchmark study with no human participant recruitment.",
    286           "source": "haiku"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "Full pipeline documented: bug selection → ByteTrace instrumentation → prompt construction → LLM invocation → patch validation → LPR iterative loop → manual correctness verification.",
    292           "source": "haiku"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "GPT-4o training data cutoff is never stated in the paper; only an API access date in the reference is provided.",
    300           "source": "haiku"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "Section 6 explicitly discusses LLM training overlap with Defects4J open-source repositories, cites prior work [18] showing limited impact, and provides a Defects4J v3.0 experiment (9/24 bugs fixed) as empirical mitigation.",
    306           "source": "haiku"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": true,
    311           "justification": "Contamination is addressed by arguing training corpora 'rarely contain complete bug-fix pairs' and by demonstrating generalization on Defects4J v3.0 bugs not present in prior benchmarks.",
    312           "source": "haiku"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human subjects study.",
    320           "source": "haiku"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants.",
    326           "source": "haiku"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "haiku"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "haiku"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "haiku"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "haiku"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants.",
    356           "source": "haiku"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "'Maximum patch attempts per bug' is used as a cost proxy and token-based billing is acknowledged, but actual dollar costs or total API call counts are never reported.",
    364           "source": "haiku"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "No total API cost, token consumption, or wall-clock time for the full experimental evaluation is reported.",
    370           "source": "haiku"
    371         }
    372       }
    373     }
    374   },
    375   "claims": [
    376     {
    377       "claim": "DynaFix repairs 186 single-function bugs on Defects4J, outperforming all 11 SOTA baselines including GIANTREPAIR (169 bugs).",
    378       "evidence": "Table 1 shows DynaFix 186 total vs GIANTREPAIR 169, RepairAgent 147, FitRepair 130 across 483 single-function bugs.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "DynaFix achieves 38 unique bug fixes not resolved by any of the 11 baselines.",
    383       "evidence": "Figure 4(b) shows 38 uniquely repaired bugs by DynaFix in the complementarity analysis across all 483 bugs.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Iterative use of execution-level information is critical — execution-level info alone achieves only 24.2% repair rate vs DynaFix's 42.6% with iteration.",
    388       "evidence": "Table 2 on full Defects4J v2.0: Pure LLM 14.9%, Exception 18.6%, Execution-Level 24.2%, DynaFix (iterative) 42.6%.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "DynaFix reduces maximum patch attempts by over 70% compared to the most efficient baseline (35 vs RepairAgent's 117).",
    393       "evidence": "Figure 7 shows DynaFix at 35 max attempts vs RepairAgent at 117; other baselines range from 250 to 5,000.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "The LPR strategy is the most impactful component, contributing 21.9 percentage points to repair rate.",
    398       "evidence": "Table 3 ablation on 255 v1.2 bugs: Default 43.5% vs w/o LPR 21.6%, the largest single-component drop.",
    399       "supported": "strong"
    400     },
    401     {
    402       "claim": "DynaFix with a single LLM outperforms GIANTREPAIR which aggregates four LLM models.",
    403       "evidence": "Table 1 shows DynaFix (186) > GIANTREPAIR (169); paper notes GIANTREPAIR aggregates four models while DynaFix uses one.",
    404       "supported": "moderate"
    405     }
    406   ],
    407   "methodology_tags": [
    408     "benchmark-eval"
    409   ],
    410   "key_findings": "DynaFix integrates fine-grained execution-level dynamic information (variable states, control-flow paths, call stacks via the ByteTrace tool) into an iterative LLM-based APR workflow, achieving SOTA performance on Defects4J v1.2+v2.0 with 186 single-function bugs repaired including 38 previously unresolved by any baseline. The iterative mechanism is the dominant contributor (21.9pp in ablation), demonstrating that execution-level information alone is insufficient and must be combined with iteration to realize its value. DynaFix requires at most 35 patch attempts per bug — over 70% fewer than the most efficient baseline — showing that precise dynamic guidance dramatically reduces search overhead. Results are limited to Java under perfect fault localization and use an unpinned GPT-4o model without variance reporting.",
    411   "red_flags": [
    412     {
    413       "flag": "Model version not pinned",
    414       "detail": "'GPT-4o' specified without a snapshot date; OpenAI updates this model silently, making exact replication impossible."
    415     },
    416     {
    417       "flag": "No statistical testing",
    418       "detail": "All comparisons use raw counts and percentage differences; no significance tests are run despite stochastic generation at temperature=1.0."
    419     },
    420     {
    421       "flag": "No variance across runs",
    422       "detail": "With temperature=1.0, results will differ across runs, but no standard deviation or multi-run reporting is provided; experiments appear to be single runs."
    423     },
    424     {
    425       "flag": "Code not yet released",
    426       "detail": "Replication package promised 'upon acceptance'; paper cannot currently be reproduced independently."
    427     },
    428     {
    429       "flag": "Baselines not re-run",
    430       "detail": "11 baselines are compared using their published results, which may use different Defects4J subsets, fault localization tools, or LLM configurations."
    431     },
    432     {
    433       "flag": "Perfect fault localization only",
    434       "detail": "All experiments assume oracle bug location; real-world performance with automated fault localization is not evaluated."
    435     },
    436     {
    437       "flag": "Unresolved internal note in manuscript",
    438       "detail": "Section 3.1 contains a stray editorial note ('please say that which experimental result approves the balance') left in the paper text, indicating the manuscript was submitted before completing revisions."
    439     }
    440   ],
    441   "cited_papers": [
    442     {
    443       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    444       "relevance": "Key baseline and closest agentic APR approach; uses dynamic prompts and state machine for iterative repair."
    445     },
    446     {
    447       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using chatgpt (ChatRepair)",
    448       "relevance": "Key baseline: dialogue-driven iterative APR using test failure feedback; most directly comparable iterative method."
    449     },
    450     {
    451       "title": "The plastic surgery hypothesis in the era of large language models (FitRepair)",
    452       "relevance": "Key baseline: LLM APR with patch-knowledge and repair-oriented fine-tuning."
    453     },
    454     {
    455       "title": "Hybrid Automated Program Repair by Combining Large Language Models and Program Analysis (GIANTREPAIR)",
    456       "relevance": "Strongest baseline: aggregates four LLM models with patch skeleton extraction; DynaFix outperforms it with a single model."
    457     },
    458     {
    459       "title": "Tracefixer: Execution trace-driven program repair",
    460       "relevance": "Closest prior work on execution traces for APR; uses traces during fine-tuning rather than iteratively at inference."
    461     },
    462     {
    463       "title": "Towards Effectively Leveraging Execution Traces for Program Repair with Code LLMs",
    464       "relevance": "Closely related concurrent work analyzing execution trace utility for LLM-based APR."
    465     },
    466     {
    467       "title": "Teaching large language models to self-debug",
    468       "relevance": "Related work on using code explanations and chain-of-thought for self-debugging in program repair."
    469     },
    470     {
    471       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    472       "relevance": "Primary evaluation benchmark used for all experiments in this paper."
    473     }
    474   ],
    475   "engagement_factors": {
    476     "practical_relevance": {
    477       "score": 3,
    478       "justification": "DynaFix and ByteTrace are concrete implementable tools directly applicable to IDE-integrated APR; 186 real bugs fixed is tangible practitioner value."
    479     },
    480     "surprise_contrarian": {
    481       "score": 1,
    482       "justification": "Confirms expected hypothesis that fine-grained iterative feedback improves repair; the negative finding (execution info alone doesn't help multi-function bugs) is mildly surprising."
    483     },
    484     "fear_safety": {
    485       "score": 0,
    486       "justification": "No AI safety or risk concerns; purely a software engineering productivity tool."
    487     },
    488     "drama_conflict": {
    489       "score": 0,
    490       "justification": "Standard empirical benchmark comparison with no controversy."
    491     },
    492     "demo_ability": {
    493       "score": 2,
    494       "justification": "ByteTrace and DynaFix are described in enough detail to prototype; replication package forthcoming, but not yet available to try."
    495     },
    496     "brand_recognition": {
    497       "score": 0,
    498       "justification": "All authors from Chongqing University; no famous lab, product, or industry affiliation."
    499     }
    500   },
    501   "hn_data": {
    502     "threads": [],
    503     "top_points": 0,
    504     "total_points": 0,
    505     "total_comments": 0
    506   }
    507 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs