scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30026B)
      1 {
      2   "paper": {
      3     "title": "DEAR: A Novel Deep Learning-based Approach for Automated Program Repair",
      4     "authors": ["Yi Li", "Shaohua Wang", "Tien N. Nguyen"],
      5     "year": 2022,
      6     "venue": "ICSE '22 (44th International Conference on Software Engineering)",
      7     "arxiv_id": "2205.01859",
      8     "doi": "10.1145/3510003.3510177"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "DEAR advances DL-based automated program repair by supporting multi-hunk, multi-statement fixes through a novel fault localization technique combining spectrum-based FL with deep learning and data-flow analysis, plus a two-tier tree-based LSTM with cycle training. On Defects4J, DEAR fixes 47 bugs (42%–683% improvement over DL-based baselines) including 18 multi-hunk/multi-statement bugs that no other DL-based tool could fix. On CPatMiner, 25.3% of its 667 fixes are multi-hunk/multi-statement, while requiring 7x fewer training parameters than the best baseline CURE.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper states 'Our data and tool are publicly available [2]' and reference [2] points to https://github.com/AutomatedProgramRepair-2021/dear-auto-fix."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All three evaluation datasets are publicly available: Defects4J v1.2.0 [1], BigFix [18], and CPatMiner [26]. The paper also claims data availability at the GitHub repository [2]."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions hardware ('a workstation with a 8-core Intel CPU and a single GTX Titan GPU') but provides no software dependencies, library versions, requirements.txt, or Dockerfile."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The paper describes the approach and settings but does not include a README or reproduction guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (e.g., '47 bugs fixed', '15.1%') with no confidence intervals or error bars in any table or figure."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims DEAR 'outperforms' baselines based solely on comparing raw numbers (e.g., 47 vs 36 bugs). No statistical significance tests (p-values, t-tests, etc.) are reported."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper provides both absolute differences and relative improvements: 'DEAR can auto-fix 32, 41, 33, 17, 14, and 11 more bugs... (i.e., 213%, 683%, 236%, 57%, 42%, and 31% relative improvements)' with baseline values clearly stated."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for dataset sizes. The paper uses existing benchmarks (395 bugs in Defects4J, 26k in BigFix, 44k in CPatMiner) without discussing whether these are adequate for the claims being made."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No standard deviations, variance, or multi-run results are reported. All results appear to be from single experimental runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Extensive baselines: 6 DL-based APR models (DLFix, CoCoNuT, SequenceR, Tufano19, CODIT, CURE) and 8 pattern-based APR tools (Elixir, ssFix, CapGen, FixMiner, Avatar, Hercules, SimFix, TBar)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include CURE (2021), CoCoNuT (2020), DLFix (2020), Hercules (2019), and TBar (2019) — all published within 1–3 years of the submission. These represent the state-of-the-art at the time."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "RQ4 (Section 6.4) presents a sensitivity analysis removing hunk detection, multi-statement expansion, and attention+cycle training individually, showing each component's contribution (Table 9)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports number of correct patches, number of plausible patches, top-K metrics (top-1, top-3, top-5), and per-bug-type breakdowns."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "All evaluation is automated: test-case validation for Defects4J, exact match with developer fixes for BigFix and CPatMiner. No human evaluation of patch quality."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "For Defects4J: trained on CPatMiner and tested on Defects4J with 'no overlap between the two datasets.' For BigFix/CPatMiner: 80%/10%/10% split for training/tuning/testing. Cross-dataset evaluation also performed."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by Defects4J project (Chart, Closure, Lang, Math, Mockito, Time) in Tables 1, 7; by bug type (Types 1-5) in Tables 2, 3, 6, 8; and across multiple datasets."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The Limitations section discusses where DEAR fails: rare/OOV names, non-failing-test bugs, large dependent fixes. The sensitivity analysis (RQ4) discusses incorrect hunk detection and incorrect expansion. Specific multi-hunk bugs that Hercules handles but DEAR misses are analyzed."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Table 9 shows that adding hunk detection causes DEAR to fix 2 fewer Type-1 bugs, and expansion causes 3 fewer Type-1/Type-3 bugs. The paper acknowledges 'multi-statement expansion may expand the buggy hunk incorrectly.'"
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims are supported: '42%–683%' improvement matches Table 1 calculations (33→47 is 42%, 6→47 is 683%). '31–145 more bugs' on BigFix matches Table 4. '169 (25.3%) multi-hunk/multi-statement bugs' matches Table 6."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims ('DEAR outperforms') and justifies them through controlled ablation studies (RQ4) where individual components are removed one at a time from the system. The single-variable manipulation design is adequate for the ablation claims."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims 'Automated Program Repair' generally, and the abstract says 'fixing for the general bugs.' All evaluation is on Java only. The Limitations section acknowledges 'we currently focus on Java' but the title, abstract, and contribution claims make unbounded generalization."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for DEAR's improvements. The Threats to Validity section is one short paragraph about Java focus and baseline reimplementation, with no substantive discussion of confounding factors (e.g., dataset-specific properties, architectural advantages unrelated to multi-hunk capability)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures number of correctly fixed bugs (matching developer fixes or passing test suites) and claims to evaluate automated program repair capability. The measurement directly matches the claim with no proxy gap."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper references 'Google's pre-trained BERT model [8]' without specifying which variant (base/large) or version. GloVe and TreeCaps are referenced by citation only. No version identifiers for any pre-trained component."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "DEAR does not use prompting. It uses custom neural network models (tree-based LSTM, fine-tuned BERT, RNN) trained on data."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.3.1 reports detailed hyperparameter settings: BERT (epoch=4, batch=32, lr=1e-4), LSTM (epoch=200, lr=0.003, batch=128), GloVe (v-size=200, lr=0.001, batch=64, epoch=200), plus the search ranges tried."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "DEAR does not use agentic scaffolding. It is a direct neural model pipeline (fault localization → hunk detection → expansion → tree-based repair)."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.1 documents subtree pairing rules. Section 3.2 describes context building including alpha-renaming, GloVe encoding, TreeCaps summarization. Section 5.3.1-5.3.2 describe dataset splitting (80/10/10), cross-dataset setup, and that CPatMiner was used for training while Defects4J for testing."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A 'Limitations' paragraph appears at the end of Section 6 with five specific limitations, and a 'Threats to Validity' paragraph discusses Java-only evaluation and baseline reimplementation."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The Limitations section identifies specific issues: (1) rare/OOV names are challenging, (2) only bugs with failing tests, (3) cannot generate fixes with many new statements, (4) expansion may produce incorrect hunks, (5) Java-only. These are specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly states it cannot handle: security/vulnerability bugs, non-failing-test bugs, fixes with 'several new statements added or arbitrarily large sizes,' and currently focuses on Java only."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "All three datasets are publicly available: Defects4J [1] with source code and test cases, BigFix [18], and CPatMiner [26]. The authors also claim to release their data at [2]."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 5.2 describes the three datasets with their properties: 'Defects4J v1.2.0 [1] with 395 bugs with test cases; BigFix [18] with +26k bugs in +1.8 million buggy methods; CPatMiner [26] with +44k bugs from 5,832 Java projects.'"
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data comes from standard public benchmarks (Defects4J, BigFix, CPatMiner)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The paper documents: CPatMiner used for AST differencing, subtree pairing rules (Section 3.1), context building pipeline (Section 3.2), dataset splits (80/10/10), and test set sizes (4,415 CPatMiner, 2,594 BigFix tested bugs). The pipeline from raw data through training and evaluation is traceable."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The Acknowledgments section discloses NSF funding: grants CNS-2120386, CCF-1723215, CCF-1723432, TWC-1723198, CCF-1518897, and CNS-1513263."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: Yi Li and Shaohua Wang at New Jersey Institute of Technology, Tien N. Nguyen at University of Texas at Dallas. No commercial product is being evaluated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The US National Science Foundation (NSF) is the funder and has no financial stake in whether DEAR outperforms specific APR tools."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "DEAR trains its own models (LSTM, fine-tuned BERT, RNN) on specific datasets. It does not evaluate a pre-trained model's inherent capability on a benchmark. The concern of pre-training contamination does not apply."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Same rationale: DEAR is not evaluating a pre-trained model on a benchmark. The authors control train/test splits directly. They do use separate datasets for training and testing on Defects4J."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "DEAR is a custom-trained system, not a pre-trained model being evaluated on benchmarks. Pre-training contamination is not applicable."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. All evaluation is automated."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Section 6.5 reports: 'predicting on CPatMiner took 2.4-3.1 seconds for each candidate patch,' 'predicting on BigFix took 3.6-4.2 seconds,' 'Predicting on Defects4J took only 2.1 seconds.' Test validation times also reported."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Section 6.5 reports training time: '+22 hours' on CPatMiner, '18-19 hours' on BigFix. Hardware stated: '8-core Intel CPU and a single GTX Titan GPU.' Training parameters: DEAR 0.39M vs CURE 3.1M."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or results across multiple seeds. All results appear to be from a single run."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper does not state how many experimental runs produced the reported results."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper lists hyperparameter ranges searched (Section 5.3.1) but does not report the total number of configurations tried or compute spent on the search."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": true,
    311         "justification": "The paper states 'We tuned all approaches with the aforementioned parameters on the same CPatMiner dataset to obtain the best performance' and reports the best settings found. Tuning on CPatMiner and testing on Defects4J avoids selection on the test set."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper makes numerous comparisons across 6 DL-based baselines and 8 pattern-based baselines without any statistical tests, let alone corrections for multiple comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors re-implemented CURE ('which is unavailable') and replicated other baselines. They note 'We tried our best to re-implement the pattern-based APR baselines and CURE for a fair comparison' but do not discuss the systematic bias of authors implementing their own baselines."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Section 6.5 compares DEAR's training parameters (0.39M) against CURE (3.1M), showing DEAR achieves better results with 7x fewer parameters. Training times are also reported."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether Defects4J, BigFix, or CPatMiner actually measure real-world bug-fixing capability. The paper uses these benchmarks without questioning their construct validity."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No agentic scaffolding is involved. All tools are compared as direct model pipelines. The FL tool (Ochiai) is controlled to be the same across approaches."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether training data temporally overlaps with test bugs, or whether BERT pre-training data includes code from the evaluation datasets."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether any features leak information about the fix. In RQ2, actual bug locations are provided to all tools, which is acknowledged but is the evaluation design rather than leakage."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "For BigFix and CPatMiner, random 80/10/10 splits are used without discussing whether train and test bugs come from the same projects or share structural similarities that could inflate performance."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method is applied. For Defects4J, the paper states 'no overlap between the two datasets' (CPatMiner and Defects4J) but provides no verification."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "DEAR outperforms DL-based APR baselines by 42%–683% on Defects4J in terms of number of auto-fixed bugs with top-1 patches.",
    365       "evidence": "Table 1 shows DEAR fixes 47 bugs vs. CURE 36, CoCoNuT 33, DLFix 30, Tufano19 14, CODIT 6, and SequenceR 15. Relative improvements calculated from these numbers (Section 6.1.1).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "DEAR fixes 18 multi-hunk/multi-statement bugs on Defects4J that no existing DL-based APR tool can fix.",
    370       "evidence": "Table 2 shows DEAR fixes 4+11+1+2=18 Type 2-5 bugs while all baselines fix 0 bugs of Types 2-5. These require dependent changes to multiple statements at once.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "On BigFix, DEAR fixes 31–145 more bugs than existing DL-based APR models with top-1 patches.",
    375       "evidence": "Table 4 shows DEAR fixes 14.1% of 2,594 bugs (366 bugs) vs. baselines ranging from 3.9% to 12.9%. The difference of 31 (vs. CURE) to 145 (vs. CODIT adjusted for test set size) is verifiable.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "On CPatMiner, 25.3% of DEAR's 667 fixed bugs are multi-hunk/multi-statement, and DEAR fixes 40–61 more such bugs than baselines.",
    380       "evidence": "Table 6: DEAR fixes 22+120+22+5=169 multi-hunk/multi-statement bugs. CoCoNuT fixes 117, DLFix 108, CURE 129 of Types 2-5 combined. Differences: 52, 61, 40.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "DEAR performs at a comparable level to state-of-the-art pattern-based APR tools.",
    385       "evidence": "Table 7 shows DEAR fixes 47 bugs on Defects4J vs. Hercules 49 and TBar 43. DEAR fixes 12 bugs that Hercules misses, including 7 multi-hunk/multi-statement ones.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "Each component (hunk detection, expansion, attention-cycle) positively contributes to DEAR's performance.",
    390       "evidence": "Table 9: Without hunk detection: 35 bugs (vs. 47), without expansion: 43 bugs, without attention-cycle: 40 bugs. Each component adds unique bug-fixing capability.",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "DEAR requires 7x fewer training parameters than CURE while achieving better results.",
    395       "evidence": "Section 6.5: 'DEAR and CURE require 0.39M and 3.1M training parameters on CPatMiner, and 0.42M and 3.5M parameters on BigFix.'",
    396       "supported": "strong"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No statistical significance testing",
    402       "detail": "All comparative claims ('DEAR outperforms X by Y%') are based on raw number comparisons across 14 baselines with no significance tests, p-values, or confidence intervals. It is impossible to distinguish signal from noise."
    403     },
    404     {
    405       "flag": "Single-run results with no variance",
    406       "detail": "Results appear to be from single experimental runs with no standard deviations, multi-seed analysis, or any measure of result stability. Neural model training is stochastic, so results may vary substantially across runs."
    407     },
    408     {
    409       "flag": "Re-implemented baselines may underperform",
    410       "detail": "CURE was re-implemented by the authors since its code was 'unavailable.' Other baselines were replicated. The authors acknowledge 'We tried our best to re-implement' but do not address the systematic bias documented by Lucic et al. (2018) where authors' re-implementations of baselines underperform originals."
    411     },
    412     {
    413       "flag": "Random data splits without independence verification",
    414       "detail": "BigFix and CPatMiner use random 80/10/10 splits without verifying that train and test bugs are independent (e.g., from different projects). Bugs from the same project may share patterns that inflate performance."
    415     },
    416     {
    417       "flag": "Broad title and claims vs. Java-only evaluation",
    418       "detail": "The paper claims to address 'Automated Program Repair' and 'general software defects' but evaluates exclusively on Java. The Java-only scope is acknowledged only in the Limitations section, not in the title or abstract."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "DLFix: Context-Based Code Transformation Learning for Automated Program Repair",
    424       "authors": ["Yi Li", "Shaohua Wang", "Tien N. Nguyen"],
    425       "year": 2020,
    426       "doi": "10.1145/3377811.3380345",
    427       "relevance": "DL-based APR baseline using tree-based translation model for context-aware bug fixing; direct predecessor to DEAR."
    428     },
    429     {
    430       "title": "CoCoNuT: Combining Context-Aware Neural Translation Models Using Ensemble for Program Repair",
    431       "authors": ["Thibaud Lutellier", "Hung Viet Pham", "Lawrence Pang", "Yitong Li", "Moshi Wei", "Lin Tan"],
    432       "year": 2020,
    433       "doi": "10.1145/3395363.3397369",
    434       "relevance": "Context-aware NMT ensemble approach for program repair; key DL-based APR baseline."
    435     },
    436     {
    437       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    438       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    439       "year": 2021,
    440       "doi": "10.1109/ICSE43902.2021.00107",
    441       "relevance": "Code-aware NMT using GPT model for APR; strongest DL-based baseline with 7x more parameters than DEAR."
    442     },
    443     {
    444       "title": "SEQUENCER: Sequence-to-Sequence Learning for End-to-End Program Repair",
    445       "authors": ["Zimin Chen", "Steve James Kommrusch", "Michele Tufano", "Louis-Noël Pouchet", "Denys Poshyvanyk", "Martin Monperrus"],
    446       "year": 2019,
    447       "doi": "10.1109/TSE.2019.2940179",
    448       "relevance": "Seq2seq NMT approach for end-to-end program repair; foundational DL-based APR baseline."
    449     },
    450     {
    451       "title": "CODIT: Code Editing with Tree-Based Neural Models",
    452       "authors": ["Saikat Chakraborty", "Yangruibo Ding", "Miltiadis Allamanis", "Baishakhi Ray"],
    453       "year": 2020,
    454       "doi": "10.1109/TSE.2020.3020502",
    455       "relevance": "Tree-based neural model for code editing that encodes code structures and learns code edits."
    456     },
    457     {
    458       "title": "TBar: Revisiting Template-Based Automated Program Repair",
    459       "authors": ["Kui Liu", "Anil Koyuncu", "Dongsun Kim", "Tegawendé F. Bissyandé"],
    460       "year": 2019,
    461       "doi": "10.1145/3293882.3330577",
    462       "relevance": "Template-based APR tool collecting fix patterns; top pattern-based baseline for comparison."
    463     },
    464     {
    465       "title": "Harnessing Evolution for Multi-Hunk Program Repair",
    466       "authors": ["Seemanta Saha", "Ripon K. Saha", "Mukul R. Prasad"],
    467       "year": 2019,
    468       "doi": "10.1109/ICSE.2019.00020",
    469       "relevance": "Pattern-based APR tool specifically designed for multi-hunk fixes; key comparison for multi-hunk repair capability."
    470     },
    471     {
    472       "title": "On Learning Meaningful Code Changes Via Neural Machine Translation",
    473       "authors": ["Michele Tufano", "Jevgenija Pantiuchina", "Cody Watson", "Gabriele Bavota", "Denys Poshyvanyk"],
    474       "year": 2019,
    475       "doi": "10.1109/ICSE.2019.00021",
    476       "relevance": "NMT approach learning code changes with code abstractions and keyword replacing for program repair."
    477     },
    478     {
    479       "title": "Shaping Program Repair Space with Existing Patches and Similar Code",
    480       "authors": ["Jiajun Jiang", "Yingfei Xiong", "Hongyu Zhang", "Qing Gao", "Xiangqun Chen"],
    481       "year": 2018,
    482       "doi": "10.1145/3213846.3213871",
    483       "relevance": "SimFix: pattern-based APR using existing patches and similar code to shape the repair search space."
    484     },
    485     {
    486       "title": "GenProg: A Generic Method for Automatic Software Repair",
    487       "authors": ["Claire Le Goues", "ThanhVu Nguyen", "Stephanie Forrest", "Westley Weimer"],
    488       "year": 2012,
    489       "doi": "10.1109/TSE.2011.104",
    490       "relevance": "Foundational search-based APR approach using genetic programming for automatic repair."
    491     },
    492     {
    493       "title": "DeepFix: Fixing Common C Language Errors by Deep Learning",
    494       "authors": ["Rahul Gupta", "Soham Pal", "Aditya Kanade", "Shirish Shevade"],
    495       "year": 2017,
    496       "relevance": "Early DL-based approach for fixing syntax errors; demonstrates deep learning for automated repair."
    497     },
    498     {
    499       "title": "ARJA: Automated Repair of Java Programs via Multi-Objective Genetic Programming",
    500       "authors": ["Yuan Yuan", "Wolfgang Banzhaf"],
    501       "year": 2020,
    502       "doi": "10.1109/TSE.2018.2874648",
    503       "relevance": "Multi-objective genetic programming approach for Java program repair; search-based APR baseline."
    504     }
    505   ],
    506   "engagement_factors": {
    507     "practical_relevance": {
    508       "score": 2,
    509       "justification": "DEAR is a working APR tool with released code that practitioners could apply to Java bug fixing, though it requires significant ML pipeline setup."
    510     },
    511     "surprise_contrarian": {
    512       "score": 1,
    513       "justification": "The multi-hunk capability is novel for DL-based APR but the general finding that a better approach fixes more bugs is expected."
    514     },
    515     "fear_safety": {
    516       "score": 0,
    517       "justification": "No AI safety or security concerns raised; purely a software engineering tool."
    518     },
    519     "drama_conflict": {
    520       "score": 0,
    521       "justification": "No controversy; straightforward academic contribution with incremental improvements."
    522     },
    523     "demo_ability": {
    524       "score": 1,
    525       "justification": "Code is released on GitHub but requires training data, GPU setup, and ML pipeline expertise to run."
    526     },
    527     "brand_recognition": {
    528       "score": 0,
    529       "justification": "Academic authors from NJIT and UT Dallas; no major industry lab or famous product involved."
    530     }
    531   }
    532 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs