scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (34263B)
      1 {
      2   "paper": {
      3     "title": "HAFixAgent: History-Aware Automated Program Repair Agent",
      4     "authors": ["Yu Shi", "Hao Li", "Bram Adams", "Ahmed E. Hassan"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2511.01047",
      8     "doi": "10.48550/arXiv.2511.01047"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "HAFixAgent integrates blame-derived repository history into an agentic APR loop and evaluates on all 854 Defects4J bugs. 71.1% of bugs are blameable and 70.7% map to exactly one unique blame commit. History-aware configurations add 194 unique fixes over the non-history baseline, outperforming RepairAgent by 212.3% and BIRCH-feedback by 29.9%. The three history heuristics (fn_all, fn_pair, fl_diff) are complementary, and combining 2-3 yields the best cost-effectiveness with no significant step or cost overhead.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper states: 'We will release the complete replication package of code and the evaluation script after the revision is completed. In the interim, please contact us by email to request access.' A promise of future release and email-on-request do not count as released."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses Defects4J v3.0.1, a publicly available benchmark with 854 bugs. The dataset is fully public and cited with a reference (Just et al., 2014)."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions 'Ubuntu 20.04' and 'isolated Docker container' built from the 'released Defects4J image' (Section 5.4), but provides no requirements.txt, Dockerfile, or detailed library version listing sufficient to recreate the environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The code is not yet released, and no README or 'Reproducing Results' section exists."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Main results in Tables 3 and 4 report point estimates (#Pass, Plausible@1) with no confidence intervals or error bars. Box plots in Figures 4 and 5 show distributions across bugs but not uncertainty intervals on aggregate metrics."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "The paper uses Friedman tests followed by pairwise Wilcoxon signed-rank tests with Bonferroni correction (α = 0.0167) to compare cost and step distributions across configurations (Section 6.2, Table 5)."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context: 212.3% over RepairAgent (164 → 512.25 avg), 29.9% over BIRCH-feedback (133 → 172.75 avg), with specific absolute counts for each configuration (Tables 3-4)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No power analysis or explicit justification for sample sizes. The paper uses all 854 Defects4J bugs but does not discuss whether subgroup sizes (e.g., N=22 matched MFMH bugs in Table 5) are adequate for the statistical tests applied."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper uses temperature 0.0 with a single run per bug per configuration (Section 5.3). No variance across experimental runs is reported. Box plots show distributions across different bugs, not repeated runs. The authors acknowledge 'API level determinism does not guarantee identical runs in practice' (Section 8.2) but do not measure this variation."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Two baselines are included: RepairAgent (Bouzenia et al., 2025) on 829 shared bugs and BIRCH-feedback (Nashid et al., 2025) on 371 multi-hunk bugs (Section 5.2). Additionally, an internal non-history ablation serves as a controlled baseline."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "RepairAgent (ICSE 2025) and BIRCH-feedback (2025) are both recent, state-of-the-art APR systems. The paper explicitly characterizes them as SOTA (Section 5.2)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper compares four context configurations: non-history, fn_all, fn_pair, and fl_diff (Section 4.1.3, Table 4). This isolates the contribution of each history heuristic. Venn diagrams (Figure 3) show complementarity."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports Plausible@1, #Pass, #Unique Pass for effectiveness (Section 5.3), plus agent steps and inference cost (USD) for efficiency (Section 6.2). Cost-performance trade-off analysis is provided in Figure 6."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "Evaluation is entirely automated via test suite pass/fail. No human evaluation of patch quality, correctness, or maintainability is performed. The paper acknowledges this limitation and suggests patch quality assessment as future work (Section 7.3)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The preliminary study (RQ0, Section 3) analyzing blame availability was conducted on the same Defects4J dataset used for final evaluation. Design decisions (single-commit heuristic, fallback strategy) were motivated by RQ0 findings on this data, meaning the evaluation set informed the system design."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by four bug categories: SL, SH, SFMH, and MFMH throughout (Tables 3, 4, Figures 3-6). Both effectiveness and cost/efficiency metrics are stratified by category."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 7.1 discusses 32 performance regression cases where history configurations underperform non-history, with analysis attributing this to context distraction effects (citing Shi et al., 2023 and Levy et al., 2024)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that in MFMH, the non-history configuration (50 bugs) outperforms all three history configurations (47, 43, 48). Section 7.1 explicitly discusses these regressions. RQ2 shows history increases cost for SL bugs (Table 5, p=0.0022 for fn_all, p=0.001 for fl_diff)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of 212.3% improvement over agent baseline and 29.9% over multi-hunk baseline are supported by Tables 3a and 3b in Section 6.1. Efficiency claims (no significant step increase, comparable costs) are supported by Table 5 and Figures 4-5."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper claims history 'improves' repair performance. This is supported by a controlled ablation using the same LLM, same setup, same bugs, varying only the history context. The internal comparison (non-history vs history configurations) is a well-controlled single-variable manipulation. The paper also acknowledges confounds in the SOTA comparison (Section 6.1.3)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract offers 'a practical recipe for history-aware agentic APR' without bounding to Java or Defects4J. The title names a general 'Automated Program Repair Agent.' Results are on a single benchmark (Defects4J) in a single language (Java) with one LLM. While Section 8.2 discusses generalizability limitations, the main claims in the abstract and conclusion are not bounded to the tested setting."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper acknowledges the LLM confound: 'these SOTA comparisons are confounded by the use of different LLMs' (Section 6.1.3). Section 7.1 discusses how added context can hurt via distraction effects, citing Shi et al. (2023) and Levy et al. (2024). Section 8.1 discusses data leakage as an alternative explanation."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper defines Plausible@1 precisely (Eq. 1, Section 5.3) as test-passing patches and uses it consistently. It distinguishes between 'plausible' (test-passing) and 'correct' (semantically verified) patches when comparing against RepairAgent. Section 7.3 explicitly acknowledges the gap between test-pass-based evaluation and patch quality."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper specifies 'DeepSeek-V3.2-Exp' via the official API (Section 5.4), but no snapshot date or API version ID is provided. The technical note is cited with an access date of 2025-10-20, but the exact model checkpoint used is not pinned."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The full system prompt is provided in Appendix A, including all tool instructions, methodology guidelines, and the conditional historical context injection template. Placeholders (e.g., '{{ repo_path }}') are clearly marked with runtime rendering described."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.4 reports: temperature 0.0, 50-step loop cap, $1 USD cost guard, 1-hour timeout. Prompts are rendered with jinja2. These are the key parameters governing agent behavior."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The agent architecture is described in detail in Section 4 with Figure 2 showing the full workflow. The execution loop (observe-act-feedback), context builder with history extractor, tools (Table 2), termination guards, and Docker sandboxing are all documented. The system builds on mini-swe-agent (Section 5.4)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 5.1 documents metadata collection (bug reports from issue trackers, failing tests from Defects4J, fault localization from developer patches), historical data collection (blame/blameless classification, fallback strategy), and the three heuristic extraction procedures. Manual verification of 18 Chart commit messages for data leakage is also mentioned."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 8 'Threats to Validity' contains substantive discussion divided into Internal Validity (Section 8.1: data leakage, fault localization) and External Validity (Section 8.2: dataset/language generalizability, LLM/agent generalizability)."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The threats are specific to this study: DeepSeek-V3.2-Exp's undisclosed training corpus may include Defects4J (Section 8.1), perfect fault localization inflates results vs real-world noisy FL (Section 8.1), results on Java/Defects4J may not transfer to Python or repositories with history rewriting (Section 8.2)."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 8.2 states specific boundaries: single language (Java), single benchmark (Defects4J v3.0.1), one LLM, one agent loop design. Section 10 explicitly lists: 'We assume perfect fault localization, focus on one benchmark and language, and use a single strong agent model and loop.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The replication package is not yet released. The paper states: 'We will release the complete replication package... In the interim, please contact us by email to request access.' Patches and logs are not publicly available for verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.1 describes data collection in detail: bug reports mined from issue tracker links, failing tests from Defects4J, fault locations derived from developer patches, historical data from git blame. The 18 Chart cases with commit-message-as-bug-report are manually verified for leakage."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The data source is Defects4J, a standard benchmark with well-documented bug collection methodology."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Section 3.2 documents the preliminary study pipeline (git blame on buggy lines → deduplicate commit hashes → count per category). Section 5.1 documents the full pipeline from metadata collection through historical data extraction to prompt construction. Bug categorization into SL/SH/SFMH/MFMH is defined in Section 3.2.2."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding sources are mentioned anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All four authors are listed as affiliated with Queen's University, Canada. Contact emails are provided. The authors are not affiliated with DeepSeek or any APR product company."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Since funding is not disclosed, independence of funder from outcome cannot be verified. Academic researchers at Queen's University are likely grant-funded but no funding statement is provided."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "Section 8.1 explicitly states: 'the pretraining corpus for this model is not disclosed to the best of our knowledge. As a result, we cannot verify if the specific projects or bugs in Defects4J are included in the pretraining data.' No training cutoff date is stated."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 8.1 discusses potential overlap: 'we cannot verify if the specific projects or bugs in Defects4J are included in the pretraining data.' The paper argues the internal comparison (history vs non-history) naturally mitigates this threat since improvement is attributed to historical context regardless of prior exposure."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Section 8.1 explicitly addresses contamination risk. Defects4J (2014) predates DeepSeek's training. The paper argues their evaluation design 'naturally isolates and mitigates the possible influence of potential data leakage' through internal ablation, though they acknowledge full resolution would require model retraining."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. It is a benchmark evaluation of an automated tool."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The study evaluates an automated system on a software benchmark."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Detailed cost analysis in RQ2 (Section 6.2): median costs per bug by category and configuration (e.g., SL success 0.005-0.008 USD, MFMH success 0.023-0.029 USD). Figure 5 shows full cost distributions. Figure 6 shows cost-performance trade-offs across configuration combinations."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Per-bug cost guard ($1 USD) and median per-bug costs are reported, but total API spend across all 854 bugs × 4 configurations is not stated. Hardware specifications for the Docker host are not provided beyond 'Ubuntu 20.04.'"
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The paper uses temperature 0.0 for determinism with a single run per bug (Section 5.3). No multi-seed analysis is performed. The threats section acknowledges 'API level determinism does not guarantee identical runs in practice' (Section 8.2) but does not measure this variation."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 5.3 explicitly states: 'We generate one patch per bug using standard decoding settings, with the temperature set to 0.0 for enhancing reproducibility.' One run per bug per configuration."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search is described. Temperature 0.0 is justified by DeepSeek's official recommendation (reference [10]). Step limit (50), cost guard ($1), and timeout (1 hour) appear chosen without systematic tuning."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "All four configurations (non-history, fn_all, fn_pair, fl_diff) are reported transparently without cherry-picking. Results for all configurations are shown in Tables 3-4, and the paper discusses trade-offs rather than declaring a single winner."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": true,
    316         "justification": "Bonferroni correction is applied with α = 0.0167 for three pairwise Wilcoxon signed-rank tests comparing each history configuration against non-history (Table 5, Section 6.2)."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The paper compares HAFixAgent against RepairAgent and BIRCH-feedback using published results from those papers, not independent re-implementations. The authors acknowledge the LLM confound (Section 6.1.3) but do not discuss the general bias of evaluating their own system."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure 6 plots success rate vs average cost per bug across 1-4 configuration combinations, showing diminishing returns. The paper explicitly analyzes the cost-performance frontier by category."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "Defects4J is used as a standard benchmark without questioning whether it adequately represents real-world bug fixing. No discussion of whether Defects4J's bug distribution, test suite adequacy, or project selection reflects actual development scenarios."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": true,
    336         "justification": "All configurations use the same mini-swe-agent-based execution loop, identical toolset (Table 2), same Docker sandbox, and same LLM. The only variable is the context input (non-history vs three history heuristics), so the scaffold is held constant and cannot confound the internal comparison."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Section 8.1 discusses that Defects4J projects and bugs may exist in DeepSeek's training data since the benchmark predates the model. The paper argues the internal ablation design mitigates this concern."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "Section 8.1 explicitly discusses the perfect fault localization assumption: exact buggy file(s) and line(s) are provided to the agent, which would not be available in practice. This is acknowledged as feature leakage for real-world deployment."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Beyond noting that it cannot verify whether Defects4J projects or bugs are included in DeepSeek's pretraining data (Section 8.1), the paper does not specifically address non-independence between training examples and test bugs."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection method is applied. Section 8.1 states: 'To fully resolve this threat, we would need to retrain DeepSeek-V3.2-Exp from scratch, which would be infeasible for an academic project.' No canary strings, membership inference, or decontamination methods are used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "71.1% of Defects4J bugs are blameable (have at least one blame commit across buggy lines)",
    365       "evidence": "Table 1 and Section 3.3 show 607 of 854 bugs are blameable, with breakdown by category: SL 83.8%, SH 52.2%, SFMH 69.2%, MFMH 80.5%.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "70.7% of all Defects4J bugs map to exactly one unique blame commit",
    370       "evidence": "Section 3.3 and Figure 1 show 604 of 854 bugs have exactly one unique blame commit, with only 3 bugs having two or more. Manual inspection of multi-hunk cases confirmed this.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "HAFixAgent outperforms RepairAgent by an average of 212.3% across context configurations",
    375       "evidence": "Table 3a shows HAFixAgent configurations fix 502-523 bugs vs RepairAgent's 164 correct fixes on 829 shared bugs. However, this comparison is confounded by different LLMs (DeepSeek-V3.2-Exp vs GPT-3.5).",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "HAFixAgent outperforms BIRCH-feedback by an average of 29.9%",
    380       "evidence": "Table 3b shows HAFixAgent fixes 171-175 bugs vs BIRCH-feedback's 133 on 371 shared multi-hunk bugs. Confounded by different LLMs (DeepSeek-V3.2-Exp vs o4-mini).",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "History configurations collectively add 194 unique fixes that non-history never achieves, vs only 32 unique non-history fixes",
    385       "evidence": "Section 6.1.3 and Figure 3 Venn diagrams show the union of history configurations uniquely solves 194 bugs, while non-history uniquely solves 32. This is an internally controlled comparison with the same LLM.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Historical context does not significantly increase agent steps or costs (except SL cost)",
    390       "evidence": "Table 5: Friedman tests show no significant step differences in any category (all p ≥ 0.05). For cost, only SL shows significant increases for fn_all (p=0.0022) and fl_diff (p=0.001). SH, SFMH, MFMH costs are not significantly different.",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Combining two or three context configurations yields the best cost-effectiveness, with diminishing returns from adding the fourth",
    395       "evidence": "Figure 6 shows cost-performance trade-off curves by category. Adding the fourth configuration raises cost substantially for small gains (e.g., SL: 90.4% at $0.041 for 2 configs vs 95.2% at $0.086 for 4 configs).",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "Confounded baseline comparisons",
    402       "detail": "The SOTA comparisons use different LLMs: HAFixAgent uses DeepSeek-V3.2-Exp (2025), RepairAgent used GPT-3.5, and BIRCH-feedback used o4-mini. The 212.3% and 29.9% improvements therefore cannot be cleanly attributed to the history-aware approach rather than to the LLM upgrade. The paper acknowledges this but still headlines these numbers in the abstract."
    403     },
    404     {
    405       "flag": "Perfect fault localization assumption",
    406       "detail": "The evaluation assumes perfect fault localization (exact buggy files and lines given), which is unavailable in practice. This inflates results and makes the approach appear more effective than it would be in real-world deployment. While common in APR research, this is a significant gap between claimed and actual utility."
    407     },
    408     {
    409       "flag": "Design-evaluation circularity",
    410       "detail": "The preliminary study (RQ0) that motivated HAFixAgent's design was conducted on the same Defects4J dataset used for final evaluation. The finding that most bugs have one blame commit directly informed the single-commit heuristic, creating a circularity between design choices and evaluation outcomes."
    411     },
    412     {
    413       "flag": "Single-run evaluation without seed sensitivity",
    414       "detail": "Despite acknowledging that 'API level determinism does not guarantee identical runs in practice,' the paper reports only single-run results at temperature 0.0 without measuring run-to-run variation. Small API-side non-determinism could affect the 3-bug difference between some configurations."
    415     },
    416     {
    417       "flag": "Code not released",
    418       "detail": "The replication package is promised for future release and currently available only by email request. Independent verification of results is not possible at the time of publication."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "RepairAgent: An Autonomous, LLM-Based Agent for Program Repair",
    424       "authors": ["Islem Bouzenia", "Premkumar T. Devanbu", "Michael Pradel"],
    425       "year": 2025,
    426       "doi": "10.1109/ICSE55347.2025.00157",
    427       "relevance": "Pioneering autonomous LLM-based agent for APR with pre-defined API tooling, evaluated on Defects4J — direct baseline for HAFixAgent."
    428     },
    429     {
    430       "title": "SWE-agent: Agent-Computer Interfaces Enable Automated Software Engineering",
    431       "authors": ["John Yang", "Carlos E. Jimenez", "Alexander Wettig", "Kilian Lieret", "Shunyu Yao", "Karthik Narasimhan", "Ofir Press"],
    432       "year": 2024,
    433       "relevance": "Introduced Agent-Computer Interface paradigm for LLM agents solving real-world GitHub issues; foundational for agentic APR."
    434     },
    435     {
    436       "title": "OpenHands: An Open Platform for AI Software Developers as Generalist Agents",
    437       "authors": ["Xingyao Wang", "Boxuan Li", "Yufan Song"],
    438       "year": 2025,
    439       "relevance": "Open platform for generalist AI software development agents, relevant to the agentic SE paradigm."
    440     },
    441     {
    442       "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?",
    443       "authors": ["Carlos E. Jimenez", "John Yang", "Alexander Wettig", "Shunyu Yao", "Kexin Pei", "Ofir Press", "Karthik R. Narasimhan"],
    444       "year": 2024,
    445       "relevance": "Standard benchmark for evaluating LLM agents on real-world software engineering tasks."
    446     },
    447     {
    448       "title": "HAFix: History-Augmented Large Language Models for Bug Fixing",
    449       "authors": ["Yu Shi", "Abdul Ali Bangash", "Emad Fallahzadeh", "Bram Adams", "Ahmed E. Hassan"],
    450       "year": 2025,
    451       "arxiv_id": "2501.09135",
    452       "relevance": "Prior work by same authors showing blame-derived historical context improves LLM repair for single-line bugs; direct predecessor to HAFixAgent."
    453     },
    454     {
    455       "title": "Characterizing Multi-Hunk Patches: Divergence, Proximity, and LLM Repair Challenges",
    456       "authors": ["Noor Nashid", "Daniel Ding", "Keheliya Gallaba", "Ahmed E. Hassan", "Ali Mesbah"],
    457       "year": 2025,
    458       "arxiv_id": "2506.04418",
    459       "relevance": "Characterizes multi-hunk bugs and evaluates LLM repair challenges; provides BIRCH-feedback baseline for HAFixAgent."
    460     },
    461     {
    462       "title": "AutoCodeRover: Autonomous Program Improvement",
    463       "authors": ["Yuntong Zhang", "Haifeng Ruan", "Zhiyu Fan", "Abhik Roychoudhury"],
    464       "year": 2024,
    465       "doi": "10.1145/3650212.3680384",
    466       "relevance": "Agentic APR system integrating program analysis context; representative of SOTA agent-based repair approaches."
    467     },
    468     {
    469       "title": "Automated Program Repair via Conversation: Fixing 162 out of 337 Bugs for $0.42 Each using ChatGPT",
    470       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    471       "year": 2024,
    472       "doi": "10.1145/3650212.3680323",
    473       "relevance": "Conversation-driven LLM repair approach with cost analysis; demonstrates the cost-effectiveness paradigm for APR evaluation."
    474     },
    475     {
    476       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    477       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R. Narasimhan", "Yuan Cao"],
    478       "year": 2023,
    479       "relevance": "Foundational reasoning-acting framework that underpins the observe-act-feedback agent loop used in HAFixAgent."
    480     },
    481     {
    482       "title": "EXPEREPAIR: Dual-Memory Enhanced LLM-based Repository-Level Program Repair",
    483       "authors": ["Fangwen Mu", "Junjie Wang", "Lin Shi", "Song Wang", "Shoubin Li", "Qing Wang"],
    484       "year": 2025,
    485       "arxiv_id": "2506.10484",
    486       "relevance": "Dual-memory system leveraging historical repair experiences for LLM-based repair; related history-aware approach."
    487     },
    488     {
    489       "title": "Impact of Code Language Models on Automated Program Repair",
    490       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    491       "year": 2023,
    492       "doi": "10.1109/ICSE48619.2023.00125",
    493       "relevance": "Evaluates how code LLMs advance APR beyond prior learning-based methods on Defects4J."
    494     },
    495     {
    496       "title": "DeepSeek-V3 Technical Report",
    497       "authors": ["DeepSeek-AI"],
    498       "year": 2024,
    499       "arxiv_id": "2412.19437",
    500       "relevance": "Technical report for the LLM family used as HAFixAgent's backbone model."
    501     },
    502     {
    503       "title": "SWE-Effi: Re-Evaluating Software AI Agent System Effectiveness Under Resource Constraints",
    504       "authors": ["Zhiyu Fan", "Kirill Vasilevski", "Dayi Lin"],
    505       "year": 2025,
    506       "arxiv_id": "2509.09853",
    507       "relevance": "Evaluates agent effectiveness under cost constraints; highlights token snowball risk in agent-based systems."
    508     },
    509     {
    510       "title": "The Fact Selection Problem in LLM-Based Program Repair",
    511       "authors": ["Nikhil Parasaram", "Huijie Yan", "Boyu Yang"],
    512       "year": 2025,
    513       "doi": "10.1109/ICSE55347.2025.00162",
    514       "relevance": "Studies which contextual facts improve LLM repair performance — directly relevant to history as context."
    515     }
    516   ],
    517   "engagement_factors": {
    518     "practical_relevance": {
    519       "score": 2,
    520       "justification": "The 'recipe' (ground agents in git blame history) is actionable for APR practitioners, though limited to the specific Defects4J workflow and not yet released as usable software."
    521     },
    522     "surprise_contrarian": {
    523       "score": 1,
    524       "justification": "The finding that 70.7% of Defects4J bugs trace to a single blame commit is mildly surprising, but the general idea that history helps repair is intuitive."
    525     },
    526     "fear_safety": {
    527       "score": 0,
    528       "justification": "No safety or security concerns raised; this is a software debugging tool."
    529     },
    530     "drama_conflict": {
    531       "score": 0,
    532       "justification": "No controversy or provocative claims; straightforward empirical evaluation."
    533     },
    534     "demo_ability": {
    535       "score": 0,
    536       "justification": "Code is not released; available only by email request with a promise of future release."
    537     },
    538     "brand_recognition": {
    539       "score": 0,
    540       "justification": "Queen's University is respected in SE research but not a high-profile AI lab; DeepSeek is used but is not the focus."
    541     }
    542   }
    543 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs