scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29862B)
      1 {
      2   "paper": {
      3     "title": "KNOD: Domain Knowledge Distilled Tree Decoder for Automated Program Repair",
      4     "authors": [
      5       "Nan Jiang",
      6       "Thibaud Lutellier",
      7       "Yiling Lou",
      8       "Lin Tan",
      9       "Dan Goldwasser",
     10       "Xiangyu Zhang"
     11     ],
     12     "year": 2023,
     13     "venue": "IEEE/ACM 45th International Conference on Software Engineering (ICSE)",
     14     "arxiv_id": "2302.01857",
     15     "doi": "10.1109/ICSE48619.2023.00111"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "KNOD proposes a three-stage tree decoder that directly generates ASTs of patched code, combined with domain-rule distillation that injects syntactic and semantic knowledge via teacher-student distributions during both training and inference. With perfect fault localization, KNOD correctly fixes 72 bugs on Defects4J v1.2, 50 on Defects4J v2.0, and 25 on QuixBugs, outperforming all compared APR tools. Ablation shows the tree decoder contributes 16 additional fixes and training-time distillation contributes 10, while patch precision reaches 86.7% on Defects4J v1.2.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper states 'Data Availability: our replication package is available at [65]' with reference [65] pointing to https://github.com/lin-tan/knod."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Training data is constructed from a dataset shared in previous work [7], [10]. The evaluation benchmarks Defects4J v1.2, v2.0, and QuixBugs are all standard public benchmarks."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions 'PyTorch' and specific GPU hardware (8× RTX 2080 TI) but provides no library versions, requirements.txt, Dockerfile, or detailed environment specification sufficient to recreate the setup."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper provides a replication package URL but does not include step-by-step reproduction instructions, commands to run, or a 'Reproducing Results' section within the paper itself."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables III and IV report only point estimates (number of bugs fixed). No confidence intervals or error bars are provided for any results."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims KNOD 'outperforms all existing APR techniques' based solely on comparing raw bug-fix counts. No statistical significance tests (t-test, Mann-Whitney, etc.) are used for any comparison."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports absolute differences with baseline context: '8 and 19 more bugs than the most effective DL-based and non-DL-based APR approach Recoder and TBar respectively.' Patch precision is compared: 86.7% vs DLFix 58.4%, TBar 62.4%, RewardRepair 70.3%."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is given for the benchmark sizes (393, 444, and 40 bugs). QuixBugs has only 40 bugs, which is quite small, and this is not discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. The ensemble uses 5 models but results are reported as single aggregate numbers without variance."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Extensive baselines are included: SequenceR, SimFix, DLFix, CoCoNuT, RewardRepair, TBar, CURE, and Recoder (Section III-B)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include Recoder (2021), RewardRepair (2022), and CURE (2021), which were the most recent state-of-the-art DL-based APR tools at the time of submission."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Table V presents a thorough ablation study with three variants: KNOD-decoder (removes tree decoder), KNOD-distTrain (removes distillation from training), and KNOD-distInf (removes distillation from inference)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Multiple metrics are reported: number of correctly fixed bugs, compilation rate (Table V), patch precision (86.7%), and ranking of correct fixes at various top-k cutoffs (Figure 6)."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Patch correctness is manually verified: 'we manually check the correctness of plausible patches... The labeling procedure involves two participants. The agreement ratio is 92.1% and inconsistent cases are resolved by further discussion' (Section III-C)."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Test benchmarks (Defects4J, QuixBugs) are completely separate from training. 'We remove projects that are in or cloned from Defects4J projects from our training set.' The validation set (10% of training data) is used for model selection, not evaluation."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by three benchmarks (Defects4J v1.2, v2.0, QuixBugs), by two fault localization settings (perfect, spectrum-based), and Figure 4 shows per-project unique bug fixes. The ablation study breaks down by component."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section V discusses failure modes: KNOD 'cannot fix multi-hunk bugs very well' and 'performance depends on accuracy of fault localization tools.' Section IV-A analyzes bugs Recoder fixes but KNOD does not, attributing failures to fault localization quality."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The ablation study shows that removing components hurts performance. KNOD performs worse than Recoder (38 vs 45) on Defects4J v1.2 with spectrum-based fault localization (Table IV), which is honestly reported."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims 72 bugs on D4J v1.2, 25 on QuixBugs, 50 on D4J v2.0, and 'outperforming all existing APR tools.' Table III directly supports these numbers with perfect fault localization."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims are made through the ablation study (Table V): each component contributes to performance. The ablation design uses controlled single-variable manipulation (removing one component at a time), which is adequate for these causal claims."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title frames the work as general 'Automated Program Repair' and the abstract claims to be 'outperforming all existing APR tools', but evaluation is only on Java benchmarks. Section III-D acknowledges 'Evaluation on more benchmarks of different program languages could be done in the future' but the framing is not bounded to Java."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Section III-D discusses threats to validity (implementation correctness, benchmark representativeness) but does not discuss alternative explanations for why KNOD outperforms baselines. For example, whether the improvement comes from ensemble learning or the specific training data rather than the architectural novelties is not examined."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures 'number of correctly fixed bugs' (manually verified against developer patches) and claims APR effectiveness. The measurement directly matches the claim — no proxy gap exists."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "The architecture is described but hyperparameters are given only as search ranges: 'number of encoder layers (6-8), number of parent and edge decoder layers (1-2), number of node decoder layers (4-8), embedding and hidden states dimension (256-384).' The exact configurations of the top-5 selected models are not reported."
    150       },
    151       "prompts_provided": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "KNOD is a custom encoder-decoder model trained from scratch — it does not use prompting."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Key hyperparameters are reported: dropout 0.1, Adam optimizer with lr 2.5e-4, beam size 1000 (200 for ranking comparison), top-5 ensemble, 5-hour validation time limit. Architectural hyperparameters are given as search ranges (Section III-C)."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "KNOD is a direct encoder-decoder model — no agentic scaffolding is used."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section II-B details code normalization via src2abs (replacing identifiers with normalized tokens), AST parsing with javalang and JavaParser, and ASG construction with sibling edges. Section III-A describes training data construction including D4J exclusion and 90/10 split."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section V 'Limitation' is a dedicated section discussing two specific limitations: inability to fix multi-hunk bugs and dependence on fault localization accuracy."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section III-D 'Threats to Validity' discusses study-specific threats: implementation correctness (mitigated by multiple authors checking code), manual patch labeling (two participants with 92.1% agreement), and benchmark representativeness (mitigated by using three benchmarks with 877 bugs)."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section V explicitly states KNOD cannot fix multi-hunk bugs well and depends on fault localization. Section III-D acknowledges 'Evaluation on more benchmarks of different program languages could be done in the future since our approach is not specifically designed for Java.'"
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "A replication package is provided at https://github.com/lin-tan/knod. The evaluation benchmarks (Defects4J, QuixBugs) are publicly available, enabling independent verification of results."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section III-A describes training data: 576,002 pairs from open-source GitHub Java projects mined from previous work [7], [10], with D4J projects removed, randomly split 90/10 for training and validation."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data sources are standard benchmarks (Defects4J, QuixBugs) and a publicly available training dataset from prior work."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The data pipeline is documented: training data from prior work → removal of D4J projects → 576,002 pairs → 90/10 split. Code preprocessing is detailed: src2abs normalization → AST parsing via javalang → ASG construction with edge labeling (Section II-B)."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "The acknowledgments section states: 'This work is partially supported by a J.P. Morgan AI Faculty Research Award.'"
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "All author affiliations are clearly listed: Purdue University, University of Alberta, and Fudan University. No authors are affiliated with companies whose products are being evaluated."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "J.P. Morgan has no direct financial stake in the comparative performance of KNOD versus other academic APR tools on public benchmarks."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is present in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "The paper describes training data as coming from 'open-source GitHub Java projects' via previous work [7], [10] but does not state when this data was collected or any temporal cutoff date."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": true,
    242         "justification": "Section III-A explicitly addresses train/test overlap: 'Following previous work, we remove projects that are in or cloned from Defects4J projects from our training set.'"
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": true,
    247         "justification": "Contamination is addressed by project-level exclusion: D4J projects and their clones are removed from the training set. This prevents the model from learning directly from the test benchmark code."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants — this is a benchmark evaluation of an automated program repair tool."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants — this is a benchmark evaluation of an automated program repair tool."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants — this is a benchmark evaluation of an automated program repair tool."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants — this is a benchmark evaluation of an automated program repair tool."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants — this is a benchmark evaluation of an automated program repair tool."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants — this is a benchmark evaluation of an automated program repair tool."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants — this is a benchmark evaluation of an automated program repair tool."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Section IV-A reports: 'KNOD spends 12.8s on average generating one thousand candidate patches for a given bug (using one NVIDIA RTX 2080 TI GPU).' A 5-hour validation time limit is also stated."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Training hardware is described (56-core server with 8× RTX 2080 TI GPUs) but total training time, GPU hours, or overall compute budget is not quantified."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "The paper trains multiple models with different random hyperparameters for ensembling but does not report seed sensitivity or how results vary across training seeds."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The paper does not explicitly state the number of experimental runs. It uses an ensemble of 5 models but does not clarify whether evaluation is a single pass or averaged across runs."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The hyperparameter search ranges are stated (Section III-C) and top-5 models are selected by validation perplexity, but the total number of configurations tried and the compute spent on search are not reported."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "Model selection is justified: 'we first train different models with random hyperparameters... and then select the Top-k models according to their loss on the validation set' — selection is on the validation set, not the test set."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical tests are performed at all, so the question of multiple comparison correction does not arise."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement their own system and compare against baselines using numbers from those baselines' papers. There is no acknowledgment of potential bias from author-implemented vs externally-reported results."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Compute differences between KNOD and baselines are not discussed. For the ranking comparison, they note 'KNOD uses the ensemble result from five models, while CURE uses an ensemble of ten models' but do not compare at matched compute."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper uses Defects4J and QuixBugs without discussing whether these benchmarks adequately measure real-world APR effectiveness or what aspects of APR capability they may miss."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "No scaffolding is involved — KNOD is a direct encoder-decoder model, not a scaffold-based system."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "The paper does not discuss when the training data was collected relative to the benchmark creation dates, or whether training data could contain patches temporally related to benchmark bugs."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The paper does not discuss whether the input representation (buggy function + bug location) could leak information beyond what would be available in a realistic deployment."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": true,
    360         "justification": "Train/test independence is addressed: 'we remove projects that are in or cloned from Defects4J projects from our training set' (Section III-A), ensuring project-level separation."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": true,
    365         "justification": "Project-level exclusion of Defects4J projects and their clones from the training set is a concrete decontamination method applied to prevent train/test overlap."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "KNOD fixes 72 bugs on Defects4J v1.2 with perfect fault localization, outperforming all existing APR tools including Recoder (64) and TBar (53).",
    372       "evidence": "Table III shows the comparison across 8 baselines. KNOD fixes 8 more bugs than Recoder and 19 more than TBar.",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "KNOD fixes 50 bugs on Defects4J v2.0 and 25 on QuixBugs, demonstrating generalizability across benchmarks.",
    377       "evidence": "Table III reports these numbers. On D4J v2.0, KNOD fixes 5 more than RewardRepair (45) and 31 more than CURE (19).",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "The three-stage tree decoder enables KNOD to fix 16 more bugs than using a sequential decoder.",
    382       "evidence": "Table V ablation: KNOD (72) vs KNOD-decoder (56). Compilation rate also improves from 33.6% to 47.0%.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Domain-rule distillation matters more during training than during inference: removing it from training costs 10 correct fixes, versus 3 when it is removed from inference only.",
    387       "evidence": "Table V: KNOD (72) vs KNOD-distTrain (62) vs KNOD-distInf (69). Removing distillation from training hurts more than removing it from inference.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "KNOD achieves 86.7% patch precision on Defects4J v1.2, substantially higher than existing APR tools.",
    392       "evidence": "Section IV-A: 72 of 83 plausible patches are correct. Compared to DLFix 58.4%, TBar 62.4%, RewardRepair 70.3%.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "KNOD uniquely fixes 12 bugs on D4J v1.2 that none of the three other top-performing tools (TBar, CURE, Recoder) can fix.",
    397       "evidence": "Figure 4(a) Venn diagram shows 12 bugs uniquely fixed by KNOD compared to TBar, CURE, and Recoder.",
    398       "supported": "strong"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No statistical significance testing",
    404       "detail": "All comparative claims ('outperforms all existing APR tools') are based on comparing raw counts without any statistical tests. With small absolute differences (e.g., 72 vs 64 bugs), the significance of improvements is unclear."
    405     },
    406     {
    407       "flag": "No variance or error bars reported",
    408       "detail": "Despite using ensemble learning with random hyperparameters, the paper reports only single aggregate numbers. The stability of results across different training seeds or model selections is unknown."
    409     },
    410     {
    411       "flag": "Incomplete baseline comparisons across benchmarks",
    412       "detail": "Many cells in Tables III and IV are marked '-' (tool not evaluated on that benchmark), making cross-benchmark comparisons incomplete. Only RewardRepair and KNOD have results on all three benchmarks in Table III."
    413     },
    414     {
    415       "flag": "Unbounded generalization claims",
    416       "detail": "The paper's title frames the work as general 'Automated Program Repair' and it claims to outperform 'all existing APR tools', but evaluation is limited to Java bugs on three benchmarks. No other languages are tested, even though the paper notes 'our approach is not specifically designed for Java.'"
    417     }
    418   ],
    419   "cited_papers": [
    420     {
    421       "title": "Neural program repair with execution-based backpropagation",
    422       "authors": ["Hao Ye", "Matias Martinez", "Martin Monperrus"],
    423       "year": 2022,
    424       "arxiv_id": "2105.04123",
    425       "relevance": "RewardRepair: DL-based APR using dynamic execution feedback, one of the main baselines compared against KNOD."
    426     },
    427     {
    428       "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair",
    429       "authors": ["Zimin Chen", "Steve Kommrusch", "Michele Tufano", "Louis-Noël Pouchet", "Denys Poshyvanyk", "Martin Monperrus"],
    430       "year": 2019,
    431       "relevance": "Early DL-based APR using sequence-to-sequence NMT, baseline showing the limitations of token-level generation that KNOD's tree decoder addresses."
    432     },
    433     {
    434       "title": "CoCoNuT: Combining context-aware neural translation models using ensemble for program repair",
    435       "authors": ["Thibaud Lutellier", "Hung Viet Pham", "Lawrence Pang", "Yitong Li", "Moshi Wei", "Lin Tan"],
    436       "year": 2020,
    437       "doi": "10.1145/3395363.3397369",
    438       "relevance": "Context-aware DL-based APR with ensemble learning, source of the training dataset used by KNOD."
    439     },
    440     {
    441       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    442       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    443       "year": 2021,
    444       "relevance": "DL-based APR incorporating code awareness into NMT, key baseline from same research group as KNOD."
    445     },
    446     {
    447       "title": "A syntax-guided edit decoder for neural program repair",
    448       "authors": ["Qihao Zhu", "Zeyu Sun", "Yuan-an Xiao", "Wenjie Zhang", "Kang Yuan", "Yingfei Xiong", "Lu Zhang"],
    449       "year": 2021,
    450       "doi": "10.1145/3468264.3468544",
    451       "relevance": "Recoder: edit-generation APR using production rules, the strongest DL-based baseline that KNOD outperforms by 8 bugs."
    452     },
    453     {
    454       "title": "Hoppity: Learning graph transformations to detect and fix bugs in programs",
    455       "authors": ["Elizabeth Dinella", "Hanjun Dai", "Ziyang Li", "Mayur Naik", "Le Song", "Ke Wang"],
    456       "year": 2020,
    457       "relevance": "Graph-based DL approach to bug detection and repair, relevant as an edit-generation APR technique using graph transformations."
    458     },
    459     {
    460       "title": "DLFix: Context-Based Code Transformation Learning for Automated Program Repair",
    461       "authors": ["Yi Li", "Shaohua Wang", "Tien N. Nguyen"],
    462       "year": 2020,
    463       "relevance": "DL-based APR using context-based code transformations, one of the baselines generating ASTs of patches."
    464     },
    465     {
    466       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    467       "authors": ["René Just", "Darioush Jalali", "Michael D. Ernst"],
    468       "year": 2014,
    469       "relevance": "The most widely-used Java bug benchmark in APR research, used for evaluation in KNOD and nearly all compared tools."
    470     },
    471     {
    472       "title": "Evaluating large language models trained on code",
    473       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    474       "year": 2021,
    475       "arxiv_id": "2107.03374",
    476       "relevance": "Codex paper — foundational LLM for code generation, discussed in related work as an alternative paradigm to specialized APR models like KNOD."
    477     },
    478     {
    479       "title": "CodeBERT: A pre-trained model for programming and natural languages",
    480       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    481       "year": 2020,
    482       "arxiv_id": "2002.08155",
    483       "relevance": "Pre-trained code model representing the LLM-based approach to code understanding, discussed as related work to KNOD's specialized approach."
    484     },
    485     {
    486       "title": "TBar: Revisiting Template-Based Automated Program Repair",
    487       "authors": ["Kui Liu", "Anil Koyuncu", "Dongsun Kim", "Tegawendé F. Bissyandé"],
    488       "year": 2019,
    489       "relevance": "Template-based APR tool, the most effective non-DL baseline compared against KNOD."
    490     },
    491     {
    492       "title": "Harnessing deep neural networks with logic rules",
    493       "authors": ["Zhiting Hu", "Xuezhe Ma", "Zhengzhong Liu", "Eduard Hovy", "Eric Xing"],
    494       "year": 2016,
    495       "relevance": "Foundation for KNOD's domain-rule distillation approach — the teacher-student architecture for injecting logic rules into neural networks."
    496     }
    497   ],
    498   "engagement_factors": {
    499     "practical_relevance": {
    500       "score": 2,
    501       "justification": "KNOD is a usable APR tool with released code that could help developers fix Java bugs, though it requires training infrastructure and Java-specific setup."
    502     },
    503     "surprise_contrarian": {
    504       "score": 1,
    505       "justification": "The results confirm that domain knowledge improves DL-based APR, which is expected; the specific mechanism (tree decoder + distillation) is novel but not contrarian."
    506     },
    507     "fear_safety": {
    508       "score": 0,
    509       "justification": "No AI safety or security concerns — this is a program repair tool aimed at improving software reliability."
    510     },
    511     "drama_conflict": {
    512       "score": 0,
    513       "justification": "No controversy — straightforward improvement over existing APR tools with standard benchmark comparisons."
    514     },
    515     "demo_ability": {
    516       "score": 1,
    517       "justification": "Replication package is available on GitHub but requires significant setup (GPU training, Java environment, benchmark infrastructure) to use."
    518     },
    519     "brand_recognition": {
    520       "score": 1,
    521       "justification": "Purdue University is well-known in software engineering research but not a high-profile AI brand. Published at ICSE, a top SE venue."
    522     }
    523   }
    524 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs