scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (23545B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "KNOD: Domain Knowledge Distilled Tree Decoder for Automated Program Repair",
      6     "authors": [
      7       "Nan Jiang",
      8       "Thibaud Lutellier",
      9       "Yiling Lou",
     10       "Lin Tan",
     11       "Dan Goldwasser",
     12       "Xiangyu Zhang"
     13     ],
     14     "year": 2023,
     15     "venue": "International Conference on Software Engineering",
     16     "arxiv_id": "2302.01857",
     17     "doi": "10.1109/ICSE48619.2023.00111"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract's claims of fixing 72 bugs on Defects4J v1.2, 25 on QuixBugs, and 50 on Defects4J v2.0 are all directly supported by Table III results.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Ablation study (Table V) isolates the causal contribution of the three-stage tree decoder (+16 bugs over KNOD-decoder) and domain-rule distillation (+10 bugs for training phase), adequately supporting the causal framing.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Section V explicitly states KNOD is evaluated on Java programs only and that multi-hunk bugs remain a limitation; generalization claims are bounded to the three benchmarks tested.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper does not discuss alternative explanations for KNOD's superiority, such as ensemble size advantages (5 vs fewer models for some baselines) or potential training data size differences.",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper uses number of correctly fixed bugs (manually verified as semantically equivalent to developer patches) as the primary metric, which directly measures the claimed objective.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section V is a dedicated 'LIMITATION' section discussing multi-hunk bug failures and fault localization dependence.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Section III.D names specific threats: implementation correctness mitigated by multi-author review, manual labeling with 92.1% inter-rater agreement, and benchmark coverage limited to three Java benchmarks.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper explicitly states KNOD cannot fix multi-hunk bugs well and results are limited to Java programs; future work on other languages is noted as out of current scope.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgment section discloses 'partially supported by a J.P. Morgan AI Faculty Research Award.'",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All author affiliations are listed on the first page (Purdue University, University of Alberta, Fudan University); notes clarify institutions at time of the work.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "J.P. Morgan funds general academic research on program repair; the paper does not evaluate J.P. Morgan products or systems, so the funder is independent of experimental outcomes.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests or financial interests declaration is present; the acknowledgment only discloses funding source but not patents, equity, or consulting relationships.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "APR, AST, Abstract Syntax Graph, domain knowledge, and the three-stage decoder components are all defined or described with concrete examples (e.g., Figure 1 walkthrough of Closure-123).",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Section I lists four explicit contributions: the three-stage tree decoder, domain-rule distillation, the KNOD system, and its evaluation on three benchmarks.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section VI provides substantial related work covering DL-based APR and code generation, explaining how KNOD differs architecturally from specific competing approaches like Recoder, CURE, and RewardRepair.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "Reference [65] and a 'Data Availability' statement link to a replication package at https://github.com/lin-tan/knod.",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Defects4J and QuixBugs are publicly available benchmarks used unmodified; the training data is sourced from a prior work's public dataset of GitHub Java patches.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Hardware (RTX 2080 TI, 56-core server) and framework (PyTorch) are mentioned, but no version numbers, requirements file, or Dockerfile are provided.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The paper references a replication package but includes no step-by-step reproduction instructions in the paper text itself.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "Main results in Tables III-V report only single counts of correctly fixed bugs with no confidence intervals or error bars across runs.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "No statistical significance tests are used for any comparative claims; comparisons are made purely on raw bug counts.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "The paper reports specific numerical improvements ('8 and 19 more bugs than the best DL-based and non-DL-based APR techniques') and patch precision (86.7% vs 58.4-70.3% for competitors).",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "No power analysis or justification for using these specific benchmarks; the choice is justified by convention (widely-used), not statistical reasoning.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No variance, standard deviation, or spread across multiple runs is reported; single results per configuration appear in all tables.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Eight baselines are included: SequenceR, SimFix, DLFix, CoCoNuT, RewardRepair, TBar, CURE, and Recoder — covering both DL-based and non-DL-based APR.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Baselines include recent work from 2021-2022 (CURE, RewardRepair, Recoder), which were state-of-the-art at time of writing.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Table V presents ablation with three variants (KNOD-decoder, KNOD-distTrain, KNOD-distInf) isolating the tree decoder and domain-rule distillation contributions in training vs. inference phases.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Evaluation uses number of correctly fixed bugs, patch precision (86.7%), compilation rate, and ranking of correct fixes across top-k candidate patches (Figure 6).",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Manual patch correctness labeling by two participants with 92.1% agreement ratio is used to verify plausible patches as semantically equivalent to developer patches.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": true,
    213           "justification": "Training data explicitly excludes projects in or cloned from Defects4J; bug benchmarks serve as held-out test sets separate from the 576,002-pair training corpus.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Figure 4 provides Venn diagrams of uniquely and jointly fixed bugs per benchmark; Tables III/IV break down results per benchmark across two FL settings.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section V discusses multi-hunk bug failures and fault localization dependence; Section IV.A analyzes why KNOD underperforms Recoder on Defects4J v1.2 under spectrum-based FL.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Table IV honestly reports that KNOD fixes fewer bugs than Recoder on Defects4J v1.2 under spectrum-based fault localization (38 vs 45).",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": false,
    238           "answer": false,
    239           "justification": "KNOD is a custom-built model, not an off-the-shelf pre-trained model; hyperparameters are reported directly and version specificity is not applicable.",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": false,
    244           "answer": false,
    245           "justification": "KNOD is a custom deep learning system, not an LLM-based system using prompts; this criterion is not applicable.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Section III.C reports encoder layers (6-8), decoder layers (1-2 for parent/edge, 4-8 for node), embedding dimensions (256-384), dropout 0.1, Adam lr 2.5e-4, beam size 1000.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "KNOD is not an agentic system with scaffolding; there is no scaffolding component to describe.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section II.B describes code normalization using src2abs, AST/ASG construction using javalang and JavaParser, identifier normalization, and buggy location sequence generation in detail.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "The replication package (reference [65]) is publicly available, and the bug benchmarks (Defects4J, QuixBugs) are well-known public datasets.",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Training data collection is described: mined from prior work's dataset of open-source GitHub Java projects, with Defects4J projects removed; 576,002 pairs, 90/10 split.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participant recruitment; evaluation uses standard public bug benchmarks.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "The full pipeline from raw buggy code through normalization, ASG construction, training/validation split, and patch validation is documented across Sections II and III.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "No date is given for when the GitHub training data was mined; there is no stated cutoff for the training corpus relative to the bug benchmarks.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": true,
    303           "justification": "The paper explicitly states 'we remove projects that are in or cloned from Defects4J projects from our training set' to prevent training/test overlap.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "While Defects4J projects are excluded from training, no discussion addresses whether QuixBugs or Defects4J v2.0 bug fixes might appear in the GitHub-mined training corpus.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study; manual patch labeling is conducted by the authors, not external subjects.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants requiring IRB approval.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human experimental design requiring randomization.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": true,
    361           "justification": "Section IV.A states 'KNOD spends 12.8s on average generating one thousand candidate patches for a given bug (using one NVIDIA RTX 2080 TI GPU)'.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "Hardware specs for training (8x RTX 2080 TI, 56-core server) are stated but no total training time, GPU-hours, or compute budget is reported.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "KNOD fixes 72 bugs on Defects4J v1.2 with perfect fault localization, outperforming all existing APR tools.",
    376       "evidence": "Table III shows KNOD fixing 72 bugs versus next best Recoder at 64 (DL-based) and TBar at 53 (non-DL).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "The three-stage tree decoder improves patch generation by fixing 16 more bugs than a sequential decoder baseline.",
    381       "evidence": "Table V ablation: KNOD (72) vs KNOD-decoder (56) on Defects4J v1.2; compilation rate 47.0% vs 33.6%.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Domain-rule distillation during training is more effective than applying it only during inference.",
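    386       "evidence": "Table V: KNOD-distTrain (training-phase distillation removed, leaving inference-only rules) fixes 62 bugs vs KNOD-distInf (inference-phase rules removed, leaving training-only distillation) at 69, against 72 for full KNOD; removing the training-phase component costs 10 bugs versus 3, confirming the training phase is more critical.",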
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "KNOD achieves 86.7% patch precision, substantially higher than existing APR tools (DLFix 58.4%, TBar 62.4%, RewardRepair 70.3%).",
    391       "evidence": "Reported in Section IV.A; comparison figures cited from [8] under same configuration.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "KNOD generalizes across benchmarks, fixing 50 bugs on Defects4J v2.0 and 25 on QuixBugs.",
    396       "evidence": "Table III reports these figures; limited comparison as many baselines have no published results on these benchmarks.",
    397       "supported": "moderate"
    398     },
    399     {
    400       "claim": "KNOD uniquely fixes 12 bugs on Defects4J v1.2 that no compared technique fixes, complementing existing tools.",
    401       "evidence": "Figure 4(a) Venn diagram shows 12 bugs uniquely fixed by KNOD not fixed by TBar, CURE, or Recoder.",
    402       "supported": "strong"
    403     }
    404   ],
    405   "methodology_tags": [
    406     "benchmark-eval"
    407   ],
    408   "key_findings": "KNOD outperforms all prior automated program repair tools on Defects4J v1.2 by fixing 72 bugs with 86.7% patch precision, substantially higher than competitors. The ablation study demonstrates that both the three-stage tree decoder (generating ASTs directly rather than token sequences) and domain-rule distillation (injecting syntactic/semantic rules during training via teacher-student distributions) independently contribute to improvement, with the training-phase component being more impactful than inference-only domain knowledge application. The system also generalizes to Defects4J v2.0 and QuixBugs, though comparisons there are limited by fewer baselines reporting results.",
    409   "red_flags": [
    410     {
    411       "flag": "No statistical tests",
    412       "detail": "All comparative claims are made on raw bug counts without significance tests, making it impossible to assess whether differences (e.g., 72 vs 64) are statistically meaningful given benchmark variance."
    413     },
    414     {
    415       "flag": "No variance across runs",
    416       "detail": "Results are single-run point estimates; no standard deviation or confidence intervals are reported for any metric including ablation results."
    417     },
    418     {
    419       "flag": "Ensemble size confound",
    420       "detail": "KNOD uses an ensemble of 5 models while some baselines use fewer (Recoder: 1) and others more (CURE: 10); the ranking comparison acknowledges but does not fully control for this confound."
    421     },
    422     {
    423       "flag": "Training data cutoff unknown",
    424       "detail": "No date is given for when GitHub training data was mined; potential overlap between training data and QuixBugs or Defects4J v2.0 fix patterns is not addressed."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    430       "relevance": "Direct predecessor by overlapping authors; KNOD builds on and outperforms CURE on the same benchmarks"
    431     },
    432     {
    433       "title": "CoCoNuT: Combining Context-Aware Neural Translation Models Using Ensemble for Program Repair",
    434       "relevance": "Co-author's prior APR work sharing training data methodology; key baseline"
    435     },
    436     {
    437       "title": "A Syntax-Guided Edit Decoder for Neural Program Repair (Recoder)",
    438       "relevance": "Main DL-based competitor; achieves competitive results on Defects4J v1.2 under spectrum-based FL"
    439     },
    440     {
    441       "title": "Neural Program Repair with Execution-Based Backpropagation (RewardRepair)",
    442       "relevance": "Key baseline using dynamic domain knowledge (execution feedback), contrasted with KNOD's static rules"
    443     },
    444     {
    445       "title": "TBar: Revisiting Template-Based Automated Program Repair",
    446       "relevance": "Best non-DL baseline representing template-based APR"
    447     },
    448     {
    449       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    450       "relevance": "Primary evaluation benchmark used throughout; most widely cited APR benchmark"
    451     },
    452     {
    453       "title": "Harnessing Deep Neural Networks with Logic Rules",
    454       "relevance": "Foundational teacher-student distribution technique that KNOD's domain-rule distillation is directly based on"
    455     },
    456     {
    457       "title": "Graph Transformer Networks",
    458       "relevance": "Architecture for the graph-transformer encoder used in KNOD's encoding stage"
    459     }
    460   ],
    461   "engagement_factors": {
    462     "practical_relevance": {
    463       "score": 2,
    464       "justification": "APR tools directly help developers fix bugs and KNOD has an open-source release, but it is Java-only and requires significant GPU compute."
    465     },
    466     "surprise_contrarian": {
    467       "score": 1,
    468       "justification": "The finding that training-phase domain rule injection is more critical than inference-phase filtering is a useful but incremental insight; the overall direction is expected."
    469     },
    470     "fear_safety": {
    471       "score": 0,
    472       "justification": "No safety or AI risk implications; this is a software engineering productivity tool."
    473     },
    474     "drama_conflict": {
    475       "score": 0,
    476       "justification": "Standard benchmark competition in the APR field; no controversy or conflict angle."
    477     },
    478     "demo_ability": {
    479       "score": 2,
    480       "justification": "Replication package available at github.com/lin-tan/knod; practitioners can run KNOD on Java projects with the provided setup."
    481     },
    482     "brand_recognition": {
    483       "score": 1,
    484       "justification": "Purdue University has a respected CS program and the work is backed by J.P. Morgan AI funding, but there is no top-tier industry lab affiliation."
    485     }
    486   },
    487   "hn_data": {
    488     "threads": [],
    489     "top_points": 0,
    490     "total_points": 0,
    491     "total_comments": 0
    492   }
    493 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs