scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32623B)
      1 {
      2   "paper": {
      3     "title": "DynaFix: Iterative Automated Program Repair Driven by Execution-Level Dynamic Information",
      4     "authors": [
      5       "Zhili Huang",
      6       "Ling Xu",
      7       "Chao Liu",
      8       "Weifeng Sun",
      9       "Xu Zhang",
     10       "Yan Lei",
     11       "Meng Yan",
     12       "Hongyu Zhang"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2512.24635",
     17     "doi": "10.48550/arXiv.2512.24635"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "DynaFix integrates execution-level dynamic information (variable states, control-flow paths, call stacks) into an iterative LLM-based program repair workflow, repairing 186 single-function bugs on Defects4J — 17 more than the next-best baseline (GIANTREPAIR) — including 38 bugs that no baseline fixes. The Layered Progressive Repair (LPR) strategy reduces maximum patch attempts to 35 per bug (70% fewer than the most efficient baseline), while ablation shows LPR contributes the most (−21.9% fix rate without it) and execution-level info adds complementary value over coarse exception messages.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper states 'we will release a replication package including the DynaFix framework, the ByteTrace tool, and all experimental datasets upon acceptance' and 'will be made publicly available upon acceptance of the paper.' This is a promise of future release, not an actual release."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The evaluation uses the publicly available Defects4J benchmark (v1.2, v2.0, and v3.0), which is a standard public dataset that was not modified by the authors."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper mentions ByteTrace is implemented in Java and the core repair logic in Python, but provides no requirements.txt, Dockerfile, library versions, or environment setup details."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are provided. The replication package is promised upon acceptance but not available."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "All results are reported as point estimates (e.g., '186 bugs', '42.6%') with no confidence intervals, error bars, or uncertainty quantification."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims DynaFix 'outperforms' 11 baselines based solely on comparing raw counts of fixed bugs. No statistical significance tests (t-tests, Mann-Whitney U, etc.) are performed for any comparison."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "The paper reports improvements with baseline context throughout: '14 more than GIANTREPAIR', '39 more bugs than RepairAgent (an improvement of 26.5%)', '+27.7% improvement over Pure LLM', and similar contextual comparisons in Tables 1-3."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification for the sample size (483 single-function bugs). The paper adopts the Defects4J benchmark as given without discussing whether the bug count is sufficient for the comparative claims made."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be from a single run despite using temperature=1.0, which produces non-deterministic outputs."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper compares against 11 SOTA APR systems covering 4 paradigms: 5 LLM-based (FitRepair, Repilot, GAMMA, AlphaRepair, GIANTREPAIR), 4 deep learning-based (ITER, SelfAPR, Tare, KNOD), 1 template-based (TBar), and 1 agent-based (RepairAgent)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include recent work: RepairAgent (ICSE 2025), GIANTREPAIR (TOSEM 2025), FitRepair (ASE 2023), GAMMA (ASE 2023), Tare (ICSE 2023), and KNOD (ICSE 2023). These represent the current state of the art in APR."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "RQ4 (Section 5.4, Table 3) presents a systematic ablation study removing individual components: local variables, control flow, method calls, and the LPR strategy, showing each component's contribution."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper uses multiple metrics: number of correct patches (manually verified), number of plausible patches (test-passing), fix rate, unique fixes, and maximum patch attempts per bug (efficiency metric)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 3.3 states 'we further conduct manual inspection of test-passing patches to assess whether they are semantically equivalent to the developer's fix.' RQ1 results are based on manually verified correct patches."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "RQ3 tunes hyperparameters (breadth=7, depth=5) on 255 single-function bugs from Defects4J v1.2, but the final RQ1 results include these same v1.2 bugs. The hyperparameter selection data is not held out from the evaluation."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Table 1 provides per-project breakdowns (Chart, Closure, Lang, Math, Time, Mockito) and separate results for v1.2 and v2.0. Table 2 breaks down by single-function vs multi-function bugs."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "The paper shows examples where DynaFix succeeds (Listings 1-3) and discusses motivation cases, but provides no systematic analysis of where DynaFix fails or what bug types it cannot handle."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Table 2 reports that for multi-function bugs, exception messages (18 fixes) outperform execution-level information alone (15 fixes). RQ3 reports diminishing returns beyond breadth 7 and depth 5. These are honestly reported negative findings."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "Abstract claims are supported: '186 single-function bugs' (Table 1), '10% improvement' (186 vs 169 GIANTREPAIR = 10.1%), '38 bugs previously unrepaired' (Figure 4b), 'at most 35 attempts' (Figure 7), '70% reduction' (35 vs 117 = 70.1%). All numbers check out."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper makes causal claims ('DynaFix enables more effective bug localization', 'removing LPR reduces fix rate'). The ablation study (RQ4) uses controlled single-variable manipulation, and RQ2 isolates the effect of dynamic information from iteration by comparing four conditions with the same LLM."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title 'Iterative Automated Program Repair' and conclusion claim 'integrating execution-level feedback into automated repair can better align with real-world debugging practices' frame results broadly, but evaluation is limited to Java bugs in Defects4J only. The threats section acknowledges 'our evaluation is limited to Java programs' but this caveat doesn't appear in the title, abstract, or conclusion."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "RQ2 controls for the LLM (same GPT-4o across all conditions) to separate the effect of dynamic info from iteration. The threats section discusses data leakage as an alternative explanation, and the paper tests robustness with DeepSeek to address model-specific bias. They also note GIANTREPAIR uses 4 models while DynaFix uses 1."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures plausible patches (test-passing) and correct patches (manually verified semantic equivalence to developer fix). It clearly distinguishes between these two levels and explains why both are needed (Section 3.3, Section 4.3). Claims match measurement granularity."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The paper states 'We used GPT-4o as the underlying LLM, accessed via the OpenAI API' without specifying a snapshot date or API version (e.g., 'gpt-4o-2024-05-13'). For the robustness check, only 'DeepSeek model' is named without version."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Figure 3 shows the 'Structure of the hierarchical prompt template' but states 'code details are omitted.' The actual prompt text is described in natural language (Section 3.2) without providing the full prompts used in experiments."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 4.4 reports: temperature=1.0, max 35 candidate patches per bug, breadth=7, depth=5, 30-minute timeout per repair attempt. Key hyperparameters are documented."
    162       },
    163       "scaffolding_described": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "The iterative repair scaffolding is described in detail: ByteTrace instrumentation (Section 3.1), structured prompt construction (Section 3.2), automated patch validation (Section 3.3), and the LPR strategy with Algorithm 1 (Section 3.4). Figure 2 provides an architectural overview."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 4.2 documents the bug selection: 835 bugs in Defects4J v2.0, 5 removed leaving 830, classified into 483 single-function and 347 multi-function bugs following prior work. Section 5.1.1 describes normalization: 'retaining only the 483 single-function bugs selected in this study.'"
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 6 'Threats to Validity' is a dedicated section discussing internal and external validity threats across multiple paragraphs."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The threats section discusses study-specific issues: manual patch evaluation subjectivity, potential data leakage from GPT-4o training overlap with Defects4J, using reported baselines instead of re-running, reliance on a single LLM, and Java-only evaluation."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "The paper states specific scope boundaries: 'our evaluation is limited to Java programs' and 'Extending evaluation to multiple programming languages is a promising direction.' It also acknowledges reliance on a single LLM and provides a DeepSeek robustness check."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "All experimental data (patches, execution traces, validation results) are promised 'upon acceptance' but not currently available. Only aggregated results in tables and figures are provided."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4.2 describes the Defects4J benchmark in detail: 835 real-world bugs from 17 open-source repositories, latest update removes 5 bugs, classification into single-function (483) and multi-function (347) bugs."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants. The data source is Defects4J, a standard benchmark."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The pipeline from bug selection to results is documented: Defects4J bugs → perfect fault localization → ByteTrace instrumentation → LLM patch generation → test validation → manual inspection. Each stage is described in Sections 3.1-3.4 and 4.2-4.4."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding sources, grants, or acknowledgments section appears in the paper. University researchers typically receive some form of funding, but none is disclosed."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All authors list their affiliation as Chongqing University, China. They are not evaluating a product from their own institution, so no conflict arises from the affiliations."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The paper evaluates GPT-4o (OpenAI product) but the authors have no disclosed relationship with OpenAI."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests statement or financial disclosure appears in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The paper does not state GPT-4o's training data cutoff date. They discuss data leakage in the threats section but never specify when the model's training data was collected."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section 6 discusses overlap: 'the LLM may have been trained on open-source repositories partially overlapping with Defects4J.' They cite prior work [18] suggesting limited impact and evaluate on 24 Defects4J v3.0 bugs as mitigation."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": true,
    249         "justification": "The paper discusses contamination risk: 'Prior work [18] suggests such overlap has limited impact on APR, since training corpora rarely contain complete bug–fix pairs.' They evaluate on Defects4J v3.0 bugs 'which were not included in prior benchmarks' as concrete mitigation, finding DynaFix fixes 9/24 vs 2/24 for pure LLM."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. This is a benchmark evaluation of an automated program repair tool."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants. The study evaluates software on public benchmarks."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "The paper reports maximum patch attempts (35 per bug) and 30-minute timeouts, and discusses 'token-based billing model' as a concern, but never reports actual API costs, tokens consumed, or dollar amounts per bug repair."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No total computational budget is stated. The paper mentions 30-minute per-attempt limits but does not report total GPU hours, total API spend, or hardware used for the experiments."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of multiple random seeds or seed sensitivity analysis. The temperature is set to 1.0 (non-deterministic), but all results appear to be from a single run."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The number of experimental runs is never stated. It appears results are from a single run, but this is not explicitly confirmed."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "RQ3 explores different breadth (1-10) and depth (1-10) configurations in Figure 6, but does not report the total compute spent on this hyperparameter search. Only the resulting performance curves are shown."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "RQ3 (Section 5.3, Figure 6) shows performance and cost trade-offs across configurations. The paper transparently selects breadth=7 and depth=5 based on diminishing returns analysis: 'Beyond breadth 7 or depth 5, the search has already covered most high-quality patches.'"
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The paper compares against 11 baselines but performs no statistical tests at all, let alone corrections for multiple comparisons. All claims of superiority are based on raw count comparisons."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper uses reported baseline results rather than re-implementing them, which the authors acknowledge in the threats section. However, they do not discuss the bias inherent in evaluating their own system or the systematic advantage that authors have in tuning their own approach."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Figure 7 compares maximum patch attempts across methods but does not report performance as a function of matched compute budgets. DynaFix's 35 attempts involve richer per-attempt computation (ByteTrace instrumentation + longer prompts) than baselines' simpler attempts, making the comparison non-equivalent."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper uses Defects4J as a standard benchmark without discussing whether it measures real-world bug repair capability. No discussion of construct validity, benchmark limitations, or comparison with alternative benchmarks."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": true,
    344         "answer": false,
    345         "justification": "Cross-system comparisons pit DynaFix (GPT-4o + ByteTrace + LPR) against baselines using different models and scaffolds (e.g., GIANTREPAIR uses 4 LLMs). While the ablation (RQ4) separates internal components, the main comparison conflates scaffold and model differences across systems."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "The paper discusses that GPT-4o may have been trained on Defects4J code and evaluates on 24 Defects4J v3.0 bugs 'newly introduced' and 'not included in prior benchmarks' as a temporal mitigation. DynaFix repaired 9/24 vs 2/24 for pure LLM."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "The paper uses perfect fault localization (Section 4.4) but does not discuss whether this or other evaluation features (e.g., test suite information provided to the model) constitute feature leakage relative to real-world usage."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No discussion of whether bugs from the same Defects4J project share structural similarities that could violate independence assumptions. Bugs from the same repository may have correlated patterns."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The v3.0 evaluation provides some temporal mitigation but is not a systematic detection method and covers only 24 bugs."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "DynaFix repairs 186 single-function bugs on Defects4J, exceeding all 11 SOTA baselines, and 38 of those bugs are fixed by no baseline.",
    374       "evidence": "Table 1 shows 186 total (100 on v1.2, 86 on v2.0). Figure 4(b) shows 38 bugs fixed by DynaFix that none of the baselines fix. GIANTREPAIR is next at 169.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Execution-level dynamic information provides greater benefit than exception messages for single-function bugs when used without iteration.",
    379       "evidence": "Table 2 shows 24.2% fix rate with execution-level info vs 18.6% with exception info for single-function bugs. However, for multi-function bugs, exception info (5.2%) outperforms execution-level (4.3%).",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "The iterative mechanism is crucial for exploiting the value of dynamic information, more than doubling performance.",
    384       "evidence": "Table 2: DynaFix (iterative + dynamic info) achieves 42.6% vs execution-level info alone at 24.2% for single-function bugs (about 1.8×). For multi-function: 8.9% vs 4.3% (about 2.1×), so 'more than doubling' holds only for multi-function bugs. Single-run results without statistical testing.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "DynaFix requires only 35 patch attempts per bug, reducing the search space by over 70% compared to the most efficient baseline.",
    389       "evidence": "Figure 7 shows DynaFix at 35 attempts vs RepairAgent at 117 (70.1% reduction). Most baselines require 500-5,000 attempts. The comparison is straightforward as the metric is clearly defined.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "The LPR strategy contributes the most to DynaFix's effectiveness, with the largest ablation impact.",
    394       "evidence": "Table 3 ablation study: removing LPR drops fix rate by 21.9% (111→55 plausible patches), far more than removing any single dynamic info type (3.5-5.5% drop each).",
    395       "supported": "moderate"
    396     },
    397     {
    398       "claim": "DynaFix generalizes to previously unseen bugs on Defects4J v3.0.",
    399       "evidence": "Section 6 reports DynaFix repairs 9/24 multi-function v3.0 bugs vs 2/24 for pure LLM. Very small sample size (24 bugs), no baselines compared on this set, and 9/24 is a modest rate.",
    400       "supported": "weak"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No statistical testing across 11 baseline comparisons",
    406       "detail": "All claims of superiority are based on comparing raw counts without significance tests, error bars, or multiple runs. With temperature=1.0 and non-deterministic outputs, single-run results may not be stable. A difference of 17 bugs (186 vs 169) over 483 could be within random variation."
    407     },
    408     {
    409       "flag": "Baseline results adopted from publications rather than re-run",
    410       "detail": "Section 4.4 states 'For all baseline APR systems, we directly adopt the official experimental results reported in their respective publications.' Different baselines may have been run under different conditions, model versions, and infrastructure, making direct comparison questionable."
    411     },
    412     {
    413       "flag": "Hyperparameters tuned on test data",
    414       "detail": "RQ3 selects optimal LPR parameters (breadth=7, depth=5) on Defects4J v1.2 (255 bugs), which are then included in the main RQ1 evaluation. The tuning set and the evaluation set overlap."
    415     },
    416     {
    417       "flag": "No model version specified for GPT-4o",
    418       "detail": "Only 'GPT-4o' is stated without a snapshot date. GPT-4o behavior changes across versions, and the specific version used could significantly affect results."
    419     },
    420     {
    421       "flag": "Single-run non-deterministic results",
    422       "detail": "Temperature is set to 1.0 for diversity, but no evidence of multiple runs. Non-deterministic LLM outputs at high temperature could produce substantially different results across runs."
    423     },
    424     {
    425       "flag": "Editing artifact in paper text",
    426       "detail": "Section 3.1 contains the text 'please say that which experimental result approves the balance' — an apparent instruction left from the drafting process, suggesting incomplete review before submission."
    427     },
    428     {
    429       "flag": "Non-equivalent patch attempt comparison",
    430       "detail": "Figure 7 compares DynaFix's 35 attempts against baselines' 117-5000 attempts, but DynaFix's per-attempt cost is higher (ByteTrace instrumentation + longer prompts with execution traces). The comparison of attempt counts does not equate to a fair compute-cost comparison."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    436       "authors": ["Islem Bouzenia", "Premkumar Devanbu", "Michael Pradel"],
    437       "year": 2025,
    438       "relevance": "Autonomous LLM agent for iterative program repair using dynamic prompts and a state machine, a direct baseline."
    439     },
    440     {
    441       "title": "Tracefixer: Execution trace-driven program repair",
    442       "authors": ["Islem Bouzenia", "Yangruibo Ding", "Kexin Pei", "Baishakhi Ray", "Michael Pradel"],
    443       "year": 2023,
    444       "arxiv_id": "2304.12743",
    445       "relevance": "Incorporates execution traces (local variable values, expected states) into APR during CodeT5 fine-tuning, a key related approach for dynamic information in repair."
    446     },
    447     {
    448       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using chatgpt",
    449       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    450       "year": 2024,
    451       "relevance": "Dialogue-driven iterative LLM repair (ChatRepair) using test failure feedback, a prominent baseline for conversational APR."
    452     },
    453     {
    454       "title": "The plastic surgery hypothesis in the era of large language models",
    455       "authors": ["Chunqiu Steven Xia", "Yifeng Ding", "Lingming Zhang"],
    456       "year": 2023,
    457       "relevance": "FitRepair: LLM-based APR with repair-oriented fine-tuning, one of the 11 baselines compared."
    458     },
    459     {
    460       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    461       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    462       "year": 2022,
    463       "relevance": "AlphaRepair: zero-shot LLM-based program repair baseline demonstrating LLMs' repair capabilities without fine-tuning."
    464     },
    465     {
    466       "title": "Teaching large language models to self-debug",
    467       "authors": ["Xinyun Chen", "Maxwell Lin", "Nathanael Schärli", "Denny Zhou"],
    468       "year": 2023,
    469       "arxiv_id": "2304.05128",
    470       "relevance": "Self-Debug: LLM self-debugging via chain-of-thought code explanations, relevant to iterative LLM-based repair approaches."
    471     },
    472     {
    473       "title": "Selfapr: Self-supervised program repair with test execution diagnostics",
    474       "authors": ["He Ye", "Matias Martinez", "Xiapu Luo", "Tao Zhang", "Martin Monperrus"],
    475       "year": 2022,
    476       "relevance": "Uses compiler and test diagnostics during self-supervised LLM training for APR, a baseline approach using dynamic signals in training."
    477     },
    478     {
    479       "title": "Hybrid Automated Program Repair by Combining Large Language Models and Program Analysis",
    480       "authors": ["Fengjie Li", "Jiajun Jiang", "Jiajun Sun", "Hongyu Zhang"],
    481       "year": 2025,
    482       "doi": "10.1145/3715004",
    483       "relevance": "GIANTREPAIR: strongest baseline, aggregates 4 LLM outputs with program analysis to reduce search space."
    484     },
    485     {
    486       "title": "Thinkrepair: Self-directed automated program repair",
    487       "authors": ["Xin Yin", "Chao Ni", "Shaohua Wang", "Zhenhao Li", "Limin Zeng", "Xiaohu Yang"],
    488       "year": 2024,
    489       "relevance": "Self-directed iterative APR approach using LLMs, representative of recent iterative repair strategies."
    490     },
    491     {
    492       "title": "Towards Effectively Leveraging Execution Traces for Program Repair with Code LLMs",
    493       "authors": ["Mirazul Haque", "Petr Babkin", "Farima Farmahinifarahani", "Manuela Veloso"],
    494       "year": 2025,
    495       "arxiv_id": "2505.04441",
    496       "relevance": "Systematically analyzes execution traces' potential in APR, demonstrating value for explaining failing test behaviors."
    497     },
    498     {
    499       "title": "Traced: Execution-aware pre-training for source code",
    500       "authors": ["Yangruibo Ding", "Benjamin Steenhoek", "Kexin Pei", "Gail Kaiser", "Wei Le", "Baishakhi Ray"],
    501       "year": 2024,
    502       "relevance": "Execution-aware pre-training that incorporates runtime traces into model training, related to dynamic information in code understanding."
    503     },
    504     {
    505       "title": "Impact of code language models on automated program repair",
    506       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    507       "year": 2023,
    508       "relevance": "Analyzes data leakage and overlap between LLM training data and APR benchmarks, cited for contamination discussion."
    509     },
    510     {
    511       "title": "Iter: Iterative neural repair for multi-location patches",
    512       "authors": ["He Ye", "Martin Monperrus"],
    513       "year": 2024,
    514       "relevance": "Iterative neural repair for multi-location bugs, a baseline approach for iterative program repair."
    515     },
    516     {
    517       "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair",
    518       "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"],
    519       "year": 2023,
    520       "relevance": "Repilot: combines LLMs with completion engines for APR, one of the 11 baselines compared."
    521     }
    522   ],
    523   "engagement_factors": {
    524     "practical_relevance": {
    525       "score": 2,
    526       "justification": "DynaFix proposes a usable APR approach for Java bugs, but it requires ByteTrace setup and GPT-4o API access, and it is not yet released."
    527     },
    528     "surprise_contrarian": {
    529       "score": 1,
    530       "justification": "Incremental improvement over existing APR methods; the insight that execution traces help is intuitive rather than surprising."
    531     },
    532     "fear_safety": {
    533       "score": 0,
    534       "justification": "No safety or security concerns — this is about fixing software bugs, not creating them."
    535     },
    536     "drama_conflict": {
    537       "score": 0,
    538       "justification": "No controversy or provocative claims; standard benchmark evaluation."
    539     },
    540     "demo_ability": {
    541       "score": 0,
    542       "justification": "No code, demo, or tool released; all artifacts are promised 'upon acceptance.'"
    543     },
    544     "brand_recognition": {
    545       "score": 1,
    546       "justification": "Uses GPT-4o (OpenAI) but the research lab (Chongqing University) is not widely known in the APR community."
    547     }
    548   }
    549 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs