scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32559B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DynaFix: Iterative Automated Program Repair Driven by Execution-Level Dynamic Information",
      6     "authors": [
      7       "Zhilin Huang",
      8       "Ling Xu",
      9       "Chao Liu",
     10       "Weifeng Sun",
     11       "Xu Zhang",
     12       "Yan Lei",
     13       "Meng Yan",
     14       "Hongyu Zhang"
     15     ],
     16     "year": 2025,
     17     "venue": "arXiv.org",
     18     "arxiv_id": "2512.24635",
     19     "doi": "10.48550/arXiv.2512.24635"
     20   },
     21   "checklist": {
     22     "claims_and_evidence": {
     23       "abstract_claims_supported": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Abstract claims are supported: '186 single-function bugs' (Table 1), '10% improvement' (186 vs 169 GIANTREPAIR = 10.1%), '38 bugs previously unrepaired' (Figure 4b), 'at most 35 attempts' (Figure 7), '70% reduction' (35 vs 117 = 70.1%). All numbers check out.",
     27         "source": "opus"
     28       },
     29       "causal_claims_justified": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The paper makes causal claims ('DynaFix enables more effective bug localization', 'removing LPR reduces fix rate'). The ablation study (RQ4) uses controlled single-variable manipulation, and RQ2 isolates the effect of dynamic information from iteration by comparing four conditions with the same LLM.",
     33         "source": "opus"
     34       },
     35       "generalization_bounded": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The title ('Iterative Automated Program Repair') and the conclusion's claim that 'integrating execution-level feedback into automated repair can better align with real-world debugging practices' frame the results broadly, but the evaluation is limited to Java bugs in Defects4J. The threats section acknowledges that 'our evaluation is limited to Java programs', but this caveat does not appear in the title, abstract, or conclusion.",
     39         "source": "opus"
     40       },
     41       "alternative_explanations_discussed": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "RQ2 controls for the LLM (same GPT-4o across all conditions) to separate the effect of dynamic info from iteration. The threats section discusses data leakage as an alternative explanation, and the paper tests robustness with DeepSeek to address model-specific bias. They also note GIANTREPAIR uses 4 models while DynaFix uses 1.",
     45         "source": "opus"
     46       },
     47       "proxy_outcome_distinction": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "The paper measures plausible patches (test-passing) and correct patches (manually verified semantic equivalence to developer fix). It clearly distinguishes between these two levels and explains why both are needed (Section 3.3, Section 4.3). Claims match measurement granularity.",
     51         "source": "opus"
     52       }
     53     },
     54     "limitations_and_scope": {
     55       "limitations_section_present": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Section 6 'Threats to Validity' is a dedicated section discussing internal and external validity threats across multiple paragraphs.",
     59         "source": "opus"
     60       },
     61       "threats_to_validity_specific": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The threats section discusses study-specific issues: manual patch evaluation subjectivity, potential data leakage from GPT-4o training overlap with Defects4J, using reported baselines instead of re-running, reliance on a single LLM, and Java-only evaluation.",
     65         "source": "opus"
     66       },
     67       "scope_boundaries_stated": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper states specific scope boundaries: 'our evaluation is limited to Java programs' and 'Extending evaluation to multiple programming languages is a promising direction.' It also acknowledges reliance on a single LLM and provides a DeepSeek robustness check.",
     71         "source": "opus"
     72       }
     73     },
     74     "conflicts_of_interest": {
     75       "funding_disclosed": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No funding sources, grants, or acknowledgments section appears in the paper. University researchers typically receive some form of funding, but none is disclosed.",
     79         "source": "opus"
     80       },
     81       "affiliations_disclosed": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "All authors list their affiliation as Chongqing University, China. They are not evaluating a product from their own institution, so no conflict arises from the affiliations.",
     85         "source": "opus"
     86       },
     87       "funder_independent_of_outcome": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The paper evaluates GPT-4o (OpenAI product) but the authors have no disclosed relationship with OpenAI.",
     91         "source": "opus"
     92       },
     93       "financial_interests_declared": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No competing interests statement or financial disclosure appears in the paper.",
     97         "source": "opus"
     98       }
     99     },
    100     "scope_and_framing": {
    101       "key_terms_defined": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Key terms are defined: 'execution-level dynamic information' (variable states, control-flow paths, call stacks), 'plausible patch' vs 'correct patch', 'single-function' vs 'multi-function' bugs.",
    105         "source": "haiku"
    106       },
    107       "intended_contribution_clear": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Contributions are explicitly enumerated in bullet form: the DynaFix framework, the ByteTrace instrumentation tool, and evaluation on Defects4J demonstrating superiority over 11 SOTA methods.",
    111         "source": "haiku"
    112       },
    113       "engagement_with_prior_work": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Section 7 explicitly contrasts DynaFix with TraceFixer and other execution-trace methods (which use dynamic info only in training/once), and with iterative methods like ChatRepair (which use only coarse-grained feedback).",
    117         "source": "haiku"
    118       }
    119     }
    120   },
    121   "type_checklist": {
    122     "empirical": {
    123       "artifacts": {
    124         "code_released": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "The paper states 'we will release a replication package including the DynaFix framework, the ByteTrace tool, and all experimental datasets upon acceptance' and 'will be made publicly available upon acceptance of the paper.' This is a promise of future release, not an actual release.",
    128           "source": "opus"
    129         },
    130         "data_released": {
    131           "applies": true,
    132           "answer": true,
    133           "justification": "The evaluation uses the publicly available Defects4J benchmark (v1.2, v2.0, and v3.0), which is a standard public dataset that was not modified by the authors.",
    134           "source": "opus"
    135         },
    136         "environment_specified": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "The paper mentions ByteTrace is implemented in Java and the core repair logic in Python, but provides no requirements.txt, Dockerfile, library versions, or environment setup details.",
    140           "source": "opus"
    141         },
    142         "reproduction_instructions": {
    143           "applies": true,
    144           "answer": false,
    145           "justification": "No step-by-step reproduction instructions are provided. The replication package is promised upon acceptance but not available.",
    146           "source": "opus"
    147         }
    148       },
    149       "statistical_methodology": {
    150         "confidence_intervals_or_error_bars": {
    151           "applies": true,
    152           "answer": false,
    153           "justification": "All results are reported as point estimates (e.g., '186 bugs', '42.6%') with no confidence intervals, error bars, or uncertainty quantification.",
    154           "source": "opus"
    155         },
    156         "significance_tests": {
    157           "applies": true,
    158           "answer": false,
    159           "justification": "The paper claims DynaFix 'outperforms' 11 baselines based solely on comparing raw counts of fixed bugs. No statistical significance tests (t-tests, Mann-Whitney U, etc.) are performed for any comparison.",
    160           "source": "opus"
    161         },
    162         "effect_sizes_reported": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "The paper reports improvements with baseline context throughout: '14 more than GIANTREPAIR' (consistent with the v1.2 totals, 100 vs 86), '39 more bugs than RepairAgent (an improvement of 26.5%)', '+27.7% improvement over Pure LLM', and similar contextual comparisons in Tables 1-3.",
    166           "source": "opus"
    167         },
    168         "sample_size_justified": {
    169           "applies": true,
    170           "answer": false,
    171           "justification": "No justification for the sample size (483 single-function bugs). The paper adopts the Defects4J benchmark as given without discussing whether the bug count is sufficient for the comparative claims made.",
    172           "source": "opus"
    173         },
    174         "variance_reported": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. Results appear to be from a single run despite using temperature=1.0, which produces non-deterministic outputs.",
    178           "source": "opus"
    179         }
    180       },
    181       "evaluation_design": {
    182         "baselines_included": {
    183           "applies": true,
    184           "answer": true,
    185           "justification": "The paper compares against 11 SOTA APR systems covering 4 paradigms: 5 LLM-based (FitRepair, Repilot, GAMMA, AlphaRepair, GIANTREPAIR), 4 deep learning-based (ITER, SelfAPR, Tare, KNOD), 1 template-based (TBar), and 1 agent-based (RepairAgent).",
    186           "source": "opus"
    187         },
    188         "baselines_contemporary": {
    189           "applies": true,
    190           "answer": true,
    191           "justification": "Baselines include recent work: RepairAgent (ICSE 2025), GIANTREPAIR (TOSEM 2025), FitRepair (ASE 2023), GAMMA (ASE 2023), Tare (ICSE 2023), and KNOD (ICSE 2023). These represent the current state of the art in APR.",
    192           "source": "opus"
    193         },
    194         "ablation_study": {
    195           "applies": true,
    196           "answer": true,
    197           "justification": "RQ4 (Section 5.4, Table 3) presents a systematic ablation study removing individual components: local variables, control flow, method calls, and the LPR strategy, showing each component's contribution.",
    198           "source": "opus"
    199         },
    200         "multiple_metrics": {
    201           "applies": true,
    202           "answer": true,
    203           "justification": "The paper uses multiple metrics: number of correct patches (manually verified), number of plausible patches (test-passing), fix rate, unique fixes, and maximum patch attempts per bug (efficiency metric).",
    204           "source": "opus"
    205         },
    206         "human_evaluation": {
    207           "applies": true,
    208           "answer": true,
    209           "justification": "Section 3.3 states 'we further conduct manual inspection of test-passing patches to assess whether they are semantically equivalent to the developer's fix.' RQ1 results are based on manually verified correct patches.",
    210           "source": "opus"
    211         },
    212         "held_out_test_set": {
    213           "applies": true,
    214           "answer": false,
    215           "justification": "RQ3 tunes hyperparameters (breadth=7, depth=5) on 255 single-function bugs from Defects4J v1.2, but the final RQ1 results include these same v1.2 bugs. The hyperparameter selection data is not held out from the evaluation.",
    216           "source": "opus"
    217         },
    218         "per_category_breakdown": {
    219           "applies": true,
    220           "answer": true,
    221           "justification": "Table 1 provides per-project breakdowns (Chart, Closure, Lang, Math, Time, Mockito) and separate results for v1.2 and v2.0. Table 2 breaks down by single-function vs multi-function bugs.",
    222           "source": "opus"
    223         },
    224         "failure_cases_discussed": {
    225           "applies": true,
    226           "answer": false,
    227           "justification": "The paper shows examples where DynaFix succeeds (Listings 1-3) and discusses motivation cases, but provides no systematic analysis of where DynaFix fails or what bug types it cannot handle.",
    228           "source": "opus"
    229         },
    230         "negative_results_reported": {
    231           "applies": true,
    232           "answer": true,
    233           "justification": "Table 2 reports that for multi-function bugs, exception messages (18 fixes) outperform execution-level information alone (15 fixes). RQ3 reports diminishing returns beyond breadth 7 and depth 5. These are honestly reported negative findings.",
    234           "source": "opus"
    235         }
    236       },
    237       "setup_transparency": {
    238         "model_versions_specified": {
    239           "applies": true,
    240           "answer": false,
    241           "justification": "The paper states 'We used GPT-4o as the underlying LLM, accessed via the OpenAI API' without specifying a snapshot date or API version (e.g., 'gpt-4o-2024-05-13'). For the robustness check, only 'DeepSeek model' is named without version.",
    242           "source": "opus"
    243         },
    244         "prompts_provided": {
    245           "applies": true,
    246           "answer": false,
    247           "justification": "Figure 3 shows the 'Structure of the hierarchical prompt template' but states 'code details are omitted.' The actual prompt text is described in natural language (Section 3.2) without providing the full prompts used in experiments.",
    248           "source": "opus"
    249         },
    250         "hyperparameters_reported": {
    251           "applies": true,
    252           "answer": true,
    253           "justification": "Section 4.4 reports: temperature=1.0, max 35 candidate patches per bug, breadth=7, depth=5, 30-minute timeout per repair attempt. Key hyperparameters are documented.",
    254           "source": "opus"
    255         },
    256         "scaffolding_described": {
    257           "applies": true,
    258           "answer": true,
    259           "justification": "The iterative repair scaffolding is described in detail: ByteTrace instrumentation (Section 3.1), structured prompt construction (Section 3.2), automated patch validation (Section 3.3), and the LPR strategy with Algorithm 1 (Section 3.4). Figure 2 provides an architectural overview.",
    260           "source": "opus"
    261         },
    262         "data_preprocessing_documented": {
    263           "applies": true,
    264           "answer": true,
    265           "justification": "Section 4.2 documents the bug selection: 835 bugs in Defects4J v2.0, 5 removed leaving 830, classified into 483 single-function and 347 multi-function bugs following prior work. Section 5.1.1 describes normalization: 'retaining only the 483 single-function bugs selected in this study.'",
    266           "source": "opus"
    267         }
    268       },
    269       "data_integrity": {
    270         "raw_data_available": {
    271           "applies": true,
    272           "answer": false,
    273           "justification": "All experimental data (patches, execution traces, validation results) are promised 'upon acceptance' but not currently available. Only aggregated results in tables and figures are provided.",
    274           "source": "opus"
    275         },
    276         "data_collection_described": {
    277           "applies": true,
    278           "answer": true,
    279           "justification": "Section 4.2 describes the Defects4J benchmark in detail: 835 real-world bugs from 17 open-source repositories, latest update removes 5 bugs, classification into single-function (483) and multi-function (347) bugs.",
    280           "source": "opus"
    281         },
    282         "recruitment_methods_described": {
    283           "applies": false,
    284           "answer": false,
    285           "justification": "No human participants. The data source is Defects4J, a standard benchmark.",
    286           "source": "opus"
    287         },
    288         "data_pipeline_documented": {
    289           "applies": true,
    290           "answer": true,
    291           "justification": "The pipeline from bug selection to results is documented: Defects4J bugs → perfect fault localization → ByteTrace instrumentation → LLM patch generation → test validation → manual inspection. Each stage is described in Sections 3.1-3.4 and 4.2-4.4.",
    292           "source": "opus"
    293         }
    294       },
    295       "contamination": {
    296         "training_cutoff_stated": {
    297           "applies": true,
    298           "answer": false,
    299           "justification": "The paper does not state GPT-4o's training data cutoff date. They discuss data leakage in the threats section but never specify when the model's training data was collected.",
    300           "source": "opus"
    301         },
    302         "train_test_overlap_discussed": {
    303           "applies": true,
    304           "answer": true,
    305           "justification": "Section 6 discusses overlap: 'the LLM may have been trained on open-source repositories partially overlapping with Defects4J.' They cite prior work [18] suggesting limited impact and evaluate on 24 Defects4J v3.0 bugs as mitigation.",
    306           "source": "opus"
    307         },
    308         "benchmark_contamination_addressed": {
    309           "applies": true,
    310           "answer": true,
    311           "justification": "The paper discusses contamination risk: 'Prior work [18] suggests such overlap has limited impact on APR, since training corpora rarely contain complete bug–fix pairs.' They evaluate on Defects4J v3.0 bugs 'which were not included in prior benchmarks' as concrete mitigation, finding DynaFix fixes 9/24 vs 2/24 for pure LLM.",
    312           "source": "opus"
    313         }
    314       },
    315       "human_studies": {
    316         "pre_registered": {
    317           "applies": false,
    318           "answer": false,
    319           "justification": "No human participants. This is a benchmark evaluation of an automated program repair tool.",
    320           "source": "opus"
    321         },
    322         "irb_or_ethics_approval": {
    323           "applies": false,
    324           "answer": false,
    325           "justification": "No human participants. The study evaluates software on public benchmarks.",
    326           "source": "opus"
    327         },
    328         "demographics_reported": {
    329           "applies": false,
    330           "answer": false,
    331           "justification": "No human participants.",
    332           "source": "opus"
    333         },
    334         "inclusion_exclusion_criteria": {
    335           "applies": false,
    336           "answer": false,
    337           "justification": "No human participants.",
    338           "source": "opus"
    339         },
    340         "randomization_described": {
    341           "applies": false,
    342           "answer": false,
    343           "justification": "No human participants.",
    344           "source": "opus"
    345         },
    346         "blinding_described": {
    347           "applies": false,
    348           "answer": false,
    349           "justification": "No human participants.",
    350           "source": "opus"
    351         },
    352         "attrition_reported": {
    353           "applies": false,
    354           "answer": false,
    355           "justification": "No human participants.",
    356           "source": "opus"
    357         }
    358       },
    359       "cost_and_practicality": {
    360         "inference_cost_reported": {
    361           "applies": true,
    362           "answer": false,
    363           "justification": "The paper reports maximum patch attempts (35 per bug) and 30-minute timeouts, and discusses 'token-based billing model' as a concern, but never reports actual API costs, tokens consumed, or dollar amounts per bug repair.",
    364           "source": "opus"
    365         },
    366         "compute_budget_stated": {
    367           "applies": true,
    368           "answer": false,
    369           "justification": "No total computational budget is stated. The paper mentions 30-minute per-attempt limits but does not report total GPU hours, total API spend, or hardware used for the experiments.",
    370           "source": "opus"
    371         }
    372       },
    373       "experimental_rigor": {
    374         "seed_sensitivity_reported": {
    375           "applies": true,
    376           "answer": false,
    377           "justification": "No mention of multiple random seeds or seed sensitivity analysis. The temperature is set to 1.0 (non-deterministic), but all results appear to be from a single run.",
    378           "source": "opus"
    379         },
    380         "number_of_runs_stated": {
    381           "applies": true,
    382           "answer": false,
    383           "justification": "The number of experimental runs is never stated. It appears results are from a single run, but this is not explicitly confirmed.",
    384           "source": "opus"
    385         },
    386         "hyperparameter_search_budget": {
    387           "applies": true,
    388           "answer": false,
    389           "justification": "RQ3 explores different breadth (1-10) and depth (1-10) configurations in Figure 6, but does not report the total compute spent on this hyperparameter search. Only the resulting performance curves are shown.",
    390           "source": "opus"
    391         },
    392         "best_config_selection_justified": {
    393           "applies": true,
    394           "answer": true,
    395           "justification": "RQ3 (Section 5.3, Figure 6) shows performance and cost trade-offs across configurations. The paper transparently selects breadth=7 and depth=5 based on diminishing returns analysis: 'Beyond breadth 7 or depth 5, the search has already covered most high-quality patches.'",
    396           "source": "opus"
    397         },
    398         "multiple_comparison_correction": {
    399           "applies": true,
    400           "answer": false,
    401           "justification": "The paper compares against 11 baselines but performs no statistical tests at all, let alone corrections for multiple comparisons. All claims of superiority are based on raw count comparisons.",
    402           "source": "opus"
    403         },
    404         "self_comparison_bias_addressed": {
    405           "applies": true,
    406           "answer": false,
    407           "justification": "The paper uses baseline results as reported in the original publications rather than re-running the baselines, which the authors acknowledge in the threats section. However, the authors do not discuss the bias of evaluating their own system or the systematic advantage that authors have in tuning their own approach.",
    408           "source": "opus"
    409         },
    410         "compute_budget_vs_performance": {
    411           "applies": true,
    412           "answer": false,
    413           "justification": "Figure 7 compares maximum patch attempts across methods but does not report performance as a function of matched compute budgets. DynaFix's 35 attempts involve richer per-attempt computation (ByteTrace instrumentation + longer prompts) than baselines' simpler attempts, making the comparison non-equivalent.",
    414           "source": "opus"
    415         },
    416         "benchmark_construct_validity": {
    417           "applies": true,
    418           "answer": false,
    419           "justification": "The paper uses Defects4J as a standard benchmark without discussing whether it measures real-world bug repair capability. No discussion of construct validity, benchmark limitations, or comparison with alternative benchmarks.",
    420           "source": "opus"
    421         },
    422         "scaffold_confound_addressed": {
    423           "applies": true,
    424           "answer": false,
    425           "justification": "Cross-system comparisons pit DynaFix (GPT-4o + ByteTrace + LPR) against baselines using different models and scaffolds (e.g., GIANTREPAIR uses 4 LLMs). While the ablation (RQ4) separates internal components, the main comparison conflates scaffold and model differences across systems.",
    426           "source": "opus"
    427         }
    428       },
    429       "data_leakage": {
    430         "temporal_leakage_addressed": {
    431           "applies": true,
    432           "answer": true,
    433           "justification": "The paper discusses that GPT-4o may have been trained on Defects4J code and evaluates on 24 Defects4J v3.0 bugs 'newly introduced' and 'not included in prior benchmarks' as a temporal mitigation. DynaFix repaired 9/24 vs 2/24 for pure LLM.",
    434           "source": "opus"
    435         },
    436         "feature_leakage_addressed": {
    437           "applies": true,
    438           "answer": false,
    439           "justification": "The paper uses perfect fault localization (Section 4.4) but does not discuss whether this or other evaluation features (e.g., test suite information provided to the model) constitute feature leakage relative to real-world usage.",
    440           "source": "opus"
    441         },
    442         "non_independence_addressed": {
    443           "applies": true,
    444           "answer": false,
    445           "justification": "No discussion of whether bugs from the same Defects4J project share structural similarities that could violate independence assumptions. Bugs from the same repository may have correlated patterns.",
    446           "source": "opus"
    447         },
    448         "leakage_detection_method": {
    449           "applies": true,
    450           "answer": false,
    451           "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The v3.0 evaluation provides some temporal mitigation but is not a systematic detection method and covers only 24 bugs.",
    452           "source": "opus"
    453         }
    454       }
    455     }
    456   },
    457   "claims": [
    458     {
    459       "claim": "DynaFix repairs 186 single-function bugs on Defects4J, a 10% improvement over the strongest baseline GIANTREPAIR (169 bugs).",
    460       "evidence": "Table 1 shows per-project breakdown: DynaFix 186 total vs GIANTREPAIR 169, outperforming on both v1.2 (100 vs 86) and v2.0 (86 vs 83).",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "DynaFix uniquely repairs 38 bugs that none of the 11 compared baselines can fix.",
    465       "evidence": "Figure 4(b) shows unique fixes across all 11 baselines; DynaFix achieves 38 unique fixes vs GIANTREPAIR's next-best 24.",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "The LPR iterative strategy is the dominant component, with its removal causing a 21.9pp fix rate drop — larger than any individual dynamic information component.",
    470       "evidence": "Table 3 ablation on 255 Defects4J v1.2 bugs: Default 43.5%, w/o LPR 21.6% (-21.9pp); all individual feature removals cause ≤5.5pp drops.",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "Execution-level dynamic information combined with iteration more than doubles repair rate over pure LLM, while dynamic info alone adds only 9.3pp.",
    475       "evidence": "Table 2: Pure LLM 14.9%, execution-level info alone 24.2% (+9.3pp), DynaFix 42.6% (+27.7pp) for single-function bugs.",
    476       "supported": "strong"
    477     },
    478     {
    479       "claim": "DynaFix requires only 35 maximum patch attempts per bug, over 70% fewer than the most efficient baseline (RepairAgent at 117).",
    480       "evidence": "Figure 7 compares maximum patch attempts; DynaFix: 35, RepairAgent: 117, most others 500–5,000.",
    481       "supported": "strong"
    482     },
    483     {
    484       "claim": "Local variable state is the most informative execution-level feature, contributing 5.5pp to fix rate when present.",
    485       "evidence": "Table 3: w/o Local Variables 38.0% vs Default 43.5%; larger drop than control flow (3.5pp) or method calls (3.9pp). Evaluated only on v1.2 subset.",
    486       "supported": "moderate"
    487     }
    488   ],
    489   "methodology_tags": [
    490     "benchmark-eval"
    491   ],
    492   "key_findings": "DynaFix integrates fine-grained execution-level dynamic information (variable states, control-flow paths, call stacks) into an iterative LLM-based APR loop via the ByteTrace instrumentation tool and a Layered Progressive Repair strategy. It outperforms 11 SOTA methods on Defects4J, repairing 186 single-function bugs including 38 uniquely, while requiring only 35 maximum patch attempts per bug — over 70% fewer than the most efficient baseline. The ablation study reveals that the iterative LPR strategy is the dominant factor (21.9pp fix rate drop when removed), while execution-level dynamic features together add another ~13pp; dynamic information alone, without iteration, yields a much smaller gain (+9.3pp on single-function bugs). The one notable negative result is that fine-grained dynamic info is actually less useful than coarse exception messages for multi-function bugs without iteration, underscoring that the iterative mechanism is what unlocks the value of richer execution context.",
    493   "red_flags": [
    494     {
    495       "flag": "No statistical testing",
    496       "detail": "All comparative claims across 11 baselines are made with no confidence intervals, p-values, or significance tests. Temperature=1.0 generates stochastic outputs, yet neither repeated runs nor variance measures are reported."
    497     },
    498     {
    499       "flag": "GPT-4o version unpinned",
    500       "detail": "Only 'GPT-4o' is specified without a snapshot date or model version ID, making precise reproducibility impossible as OpenAI updates the model."
    501     },
    502     {
    503       "flag": "Perfect fault localization assumption",
    504       "detail": "All experiments use perfect fault localization from the Defects4J oracle. Real-world APR requires automatic localization, making the evaluated setting substantially more favorable than deployment."
    505     },
    506     {
    507       "flag": "Code not yet released",
    508       "detail": "The replication package (DynaFix framework, ByteTrace, experimental data) is promised 'upon acceptance' but currently unavailable for independent verification."
    509     },
    510     {
    511       "flag": "Baselines not re-run",
    512       "detail": "Baseline results are taken directly from original publications rather than re-executed in a unified environment, raising fairness concerns given configuration and environment differences."
    513     },
    514     {
    515       "flag": "No funding disclosure",
    516       "detail": "No acknowledgment of funding sources or competing interests is present in the paper."
    517     }
    518   ],
    519   "cited_papers": [
    520     {
    521       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    522       "relevance": "Key agent-based APR baseline; DynaFix outperforms it by 26.5% while using 70% fewer patch attempts"
    523     },
    524     {
    525       "title": "Automated program repair via conversation: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT (ChatRepair)",
    526       "relevance": "Representative iterative APR baseline using coarse-grained dialogue feedback; DynaFix improves upon this paradigm with execution-level feedback"
    527     },
    528     {
    529       "title": "Tracefixer: Execution trace-driven program repair",
    530       "relevance": "Most closely related prior work using execution traces — but only during fine-tuning, not iteratively; DynaFix extends this approach to iterative inference-time use"
    531     },
    532     {
    533       "title": "Towards Effectively Leveraging Execution Traces for Program Repair with Code LLMs",
    534       "relevance": "Direct predecessor that injects execution traces once into the prompt; DynaFix differentiates itself by re-collecting traces iteratively"
    535     },
    536     {
    537       "title": "Hybrid Automated Program Repair by Combining Large Language Models and Program Analysis (GIANTRepair)",
    538       "relevance": "Strongest baseline, aggregating four LLMs; DynaFix outperforms it with a single model plus execution-level feedback"
    539     },
    540     {
    541       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    542       "relevance": "The benchmark providing all 483 evaluated bugs"
    543     },
    544     {
    545       "title": "Are large language models memorizing bug benchmarks?",
    546       "relevance": "Cited to contextualize the data leakage/contamination threat to validity"
    547     },
    548     {
    549       "title": "Impact of code language models on automated program repair",
    550       "relevance": "Cited to support the claim that training data overlap with Defects4J has limited practical impact on APR results"
    551     }
    552   ],
    553   "engagement_factors": {
    554     "practical_relevance": {
    555       "score": 2,
    556       "justification": "DynaFix proposes a usable APR approach for Java bugs, but requires ByteTrace setup, GPT-4o API access, and is not yet released."
    557     },
    558     "surprise_contrarian": {
    559       "score": 1,
    560       "justification": "Incremental improvement over existing APR methods; the insight that execution traces help is intuitive rather than surprising."
    561     },
    562     "fear_safety": {
    563       "score": 0,
    564       "justification": "No safety or security concerns — this is about fixing software bugs, not creating them."
    565     },
    566     "drama_conflict": {
    567       "score": 0,
    568       "justification": "No controversy or provocative claims; standard benchmark evaluation."
    569     },
    570     "demo_ability": {
    571       "score": 0,
    572       "justification": "No code, demo, or tool released; all artifacts are promised 'upon acceptance.'"
    573     },
    574     "brand_recognition": {
    575       "score": 1,
    576       "justification": "Uses GPT-4o (OpenAI) but the research lab (Chongqing University) is not widely known in the APR community."
    577     }
    578   },
    579   "hn_data": {
    580     "threads": [],
    581     "top_points": 0,
    582     "total_points": 0,
    583     "total_comments": 0
    584   }
    585 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs