scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (30148B)
      1 {
      2   "paper": {
      3     "title": "GLAD: Neural Predicate Synthesis to Repair Omission Faults",
      4     "authors": ["Sungmin Kang", "Shin Yoo"],
      5     "year": 2022,
      6     "venue": "ISSTA 2022",
      7     "arxiv_id": "2204.06771",
      8     "doi": "10.1109/ICSE-Companion58688.2023.00087"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "GLAD, a grammar-constrained language model approach for synthesizing if-statement predicates, fixes 28 Defects4J v1.2 faults (16 never fixed by any DL-based APR tool, 8 never fixed by any of 40 surveyed tools). An analysis of 48 unfixed 'next level' faults shows 81.3% involve omission, motivating the approach. Ablation confirms all three components (LM, grammar, debugger-based reranking) contribute independently, and dynamic reranking reduces worst-case verification rank from 28,321 to 745. GLAD generalizes to Defects4J v2.0, correctly fixing 20 additional bugs.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper provides an anonymous repository URL (https://anonymous.4open.science/r/neural-pred-synth-4816/README.md) and states 'we make our tool publicly available' (Section 1)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The raw APR analysis data is made publicly available at the anonymous repository (Section 3: 'The raw data used in this section is made publicly available'). The evaluation uses the publicly available Defects4J benchmark."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions hardware (Intel i7-6700, NVIDIA 1080/3090 GPUs in Section 5.1.4) and uses javalang for parsing, but provides no requirements.txt, Dockerfile, or detailed dependency list sufficient to recreate the environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The anonymous repository is referenced but the paper itself contains no 'Reproducing Results' section or runnable commands."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results are reported as point estimates (counts of bugs fixed, median/mean runtimes). No confidence intervals, error bars, or uncertainty measures are provided."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims GLAD is 'orthogonal' to existing techniques and 'makes a significant contribution' based solely on comparing raw counts of bugs fixed without any statistical significance tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Results are reported as raw counts (28 fixed, 16 new vs DL, 8 never fixed). No standardized effect sizes (Cohen's d, odds ratios, etc.) are provided. While baseline counts are shown for context, the improvements are not characterized with effect size measures."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper evaluates on 95 bugs from Defects4J v1.2 and 89 from v2.0 (184 total) but provides no justification for these sample sizes beyond using what the benchmark provides."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "All results appear to be from single experimental runs. No standard deviations, variance across seeds, or spread measures are reported for any experiment."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Extensive baselines are included: TBar (template-based), 7 specific DL-based tools (SequenceR, CODIT, DLFix, CoCoNuT, Ratchet, CURE, Recoder), Restore (condition synthesis), and 40 total APR tools surveyed (Table 1, Table 3)."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include contemporary tools: Recoder (2021), CURE (2021), Restore (2020), CoCoNuT (2020), and TBar (2019), representing the state of the art at time of publication."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 6.3 (RQ3) presents a thorough ablation study removing finetuning (-9 correct), grammar (-27 correct), and the language model (-47 correct), plus analysis of dynamic reranking's contribution (Table 5, Table 6)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are reported: number of plausible fixes, number of correct fixes (Tables 3-5), runtime statistics (Table 7), MRR, acc@1, acc@5, and worst-case rank (Table 6)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Plausible patches are manually inspected for correctness: 'plausible patches are manually inspected to check for correctness' (Section 4.4). Section 6.5 provides qualitative analysis of successful and failed patches."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The LM is pretrained on Java-med with Defects4J data purged (Section 5.1.2: 'data related to the Defects4J projects is purged'). Results are reported on Defects4J v1.2 and generalization tested on the separate Defects4J v2.0 dataset."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 3 provides per-fault results for all 31 bugs attempted in Defects4J v1.2, comparing GLAD, TBar, DL, GLAD-M, and Restore for each individual fault. Results are also broken down by v1.2 vs v2.0."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.5 (RQ5) presents and analyzes specific failure cases: Math-106 (correct predicate but incomplete body) and Time-27 (lexical precedent issue). Also notes GLAD generates correct predicates for 32 bugs it could not fully fix."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The ablation study (Table 5) shows configurations that perform worse. RQ5 discusses failure modes. The paper notes GLAD cannot fix certain bugs due to lexical precedent issues and body synthesis limitations."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'GLAD can correctly fix 16 Defects4J v1.2 faults that previous NMT-based techniques could not' — supported by Table 3. 'Maintaining a reasonable runtime cost' — supported by Table 7 (median 9.38 min). 'Highly orthogonal to existing techniques' — supported by the 8 never-before-fixed bugs."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims are made through ablation studies (Section 6.3) using controlled single-variable manipulation: removing one component at a time (finetuning, grammar, LM) while holding others constant. This is adequate for the causal claim that each component contributes to performance."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Section 7 explicitly bounds claims: 'While the principles of GLAD are not derived from any characteristics of the Java language itself... the performance of GLAD under such conditions is currently unknown.' The paper scopes claims to Java and Defects4J."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "Section 7 discusses internal validity (whether components or randomness explain results) and external validity (generalization to other projects/languages). Section 6.5 discusses specific alternative explanations for failures (lexical precedent, body synthesis limitations)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures 'number of bugs correctly fixed' against a test suite and verifies via manual inspection. The claims match the granularity of measurements — 'fixes N bugs' is directly what was measured, with no proxy gap."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "GLAD uses the authors' own GRU-based language model. The architecture is described (GRU, BPE with ~5000 pairs, pretrained on Java-med dataset from Alon et al. [2], trained for 5 epochs). Section 5.1.2 provides the model specification."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "GLAD does not use prompting in the LLM sense. It feeds code context plus a repair seed ('if' and '(' tokens) to a generative language model. The input construction is described in Section 4.1.2 but this is not prompt engineering."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section 5.1.3 reports key hyperparameters: beam width W=10,000, max length l=15 tokens, finetuning for 1 epoch, BPE with ~5000 pairs, pretraining for 5 epochs, dynamic reranking timeout of 15 minutes, max runtime 3 hours."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "GLAD is a traditional ML pipeline (LM + grammar + debugger), not an agentic scaffolding system with tool use, retry logic, or feedback loops in the modern LLM agent sense."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 5.1.2 documents preprocessing: methods extracted via javalang, subtokenized with BPE (~5000 pairs), less than 1% of unparseable methods/files discarded, Defects4J data purged from pretraining. Legal identifier extraction is documented in Section 4.1.3."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 7 'Threats to Validity' provides substantive discussion of both internal validity (component contributions, randomness effects) and external validity (generalization to other languages and projects)."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Specific threats discussed: GLAD only targets if-statement omission faults, performance unknown on non-Java languages, lexical precedent issues (e.g., tokens never appearing in training data, Section 6.5 Time-27 example), body synthesis is not addressed."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Explicit scope boundaries: limited to Java (Section 7), limited to if-statement omission faults (Section 4), performance on non-Java 'currently unknown' (Section 7), body synthesis left to future work (Section 6.5)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The APR analysis raw data (fix_data.csv) is made publicly available at the anonymous repository. Per-fault results are shown in Table 3 enabling verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3 describes how APR results were gathered: surveyed program-repair.org and Monperrus's living review, with specific criteria for using perfect FL results, paper-reported results, and two TBar versions. 46 tools identified, 40 with retrievable results."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. The data source is the standard Defects4J benchmark and published APR results."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from data collection to analysis is documented: survey of APR tools (Section 3, Table 1), feature extraction from developer patches (Defects4J-dissection + git diff), distribution analysis (Figure 1), and manual investigation of 48 unfixed faults (Table 2)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source is mentioned anywhere in the paper. There is no acknowledgments section listing grants or sponsors."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: both authors are from KAIST, Daejeon, Republic of Korea. They are not affiliated with any company whose product is being evaluated."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Cannot assess funder independence since no funding source is disclosed. The authors' academic affiliation (KAIST) does not inherently create a conflict, but without funding disclosure this cannot be verified."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement or financial interest declaration is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper describes the training dataset (Java-med from Alon et al.) but does not state a temporal cutoff date for when the training data was collected. They address overlap by purging Defects4J data, but no cutoff date is given."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": true,
    235         "justification": "Section 5.1.2: 'Prior to training, data related to the Defects4J projects is purged.' The paper explicitly addresses train/test overlap by removing benchmark data from the pretraining dataset."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": true,
    240         "justification": "Contamination is addressed through data purging: Defects4J project data is removed from the pretraining corpus (Section 5.1.2). Additionally, finetuning only uses the faulty project version, not the fixed version."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study. Evaluation is automated on the Defects4J benchmark."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Table 7 reports detailed runtime statistics: min/median/mean time to first plausible patch under both FL settings. Table 5 shows median time for each ablated component. Table 6 shows debugger execution time (13.5s + 119s). Maximum timeout is 3 hours per bug."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is mentioned (Intel i7-6700, NVIDIA 1080/3090 in Section 5.1.4) and per-bug runtimes are given, but total computational budget (total GPU hours for pretraining, total experiment time across all bugs) is not stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of random seeds or results across multiple seeds. All results appear to be from single experimental runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The paper does not state how many times each experiment was run. Results are presented without indicating whether they come from single or averaged runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The paper states beam width W=10,000 and max length l=15 'empirically leads to the best performance' and timeout of 15 minutes was 'empirically determined,' but the search budget (number of configurations tried, method used) is not reported."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "Hyperparameters are described as 'empirically' determined but no validation set, selection criterion, or explanation of how the best configuration was chosen is provided."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own system against baselines without acknowledging self-comparison bias. They use results from prior papers for baselines rather than re-implementing them, which partially mitigates this, but the bias is not discussed."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No performance curves across compute levels are provided. While Table 7 compares mean runtime with CURE and Restore, there is no systematic analysis of how performance varies with allocated compute budget."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": true,
    331         "justification": "Section 3 provides a thorough analysis of the Defects4J benchmark, examining which faults have been fixed, what features predict difficulty, and identifying that omission faults constitute the 'next level' of challenge — demonstrating understanding of what the benchmark measures."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "GLAD is evaluated as a complete tool, not as a model within interchangeable scaffolds. Comparisons are between entire APR tools with fundamentally different architectures, not models in different wrappers."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal aspects — whether the Java-med pretraining dataset could contain code related to Defects4J bug fixes committed before the dataset was collected. Only project-level purging is performed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": true,
    348         "justification": "The paper evaluates under both perfect fault localization (which provides oracle information) and method-given FL (more realistic). Section 5.1.1 explicitly acknowledges perfect FL as a controlled setting 'to gauge repair performance without localization bias,' and provides the more realistic method-given evaluation alongside it."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether pretraining data (Java-med) shares structural similarities with Defects4J projects beyond the project-level purge. No analysis of near-duplicate code or shared coding patterns."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": true,
    358         "justification": "A concrete prevention method is used: 'data related to the Defects4J projects is purged' from the pretraining dataset (Section 5.1.2). This is a decontamination pipeline applied before training."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "81.3% of unfixed 'next level' faults (16-63 added characters) have an omission aspect, with 58.3% being strong omission faults.",
    365       "evidence": "Manual investigation of all 48 unfixed faults in the 16-63 added character range, presented in Table 2 (Section 3).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "GLAD fixes 16 Defects4J v1.2 faults that no previous deep learning-based APR technique could fix.",
    370       "evidence": "Table 3 (Section 6.1) compares GLAD against all known DL-based tool results (SequenceR, CODIT, DLFix, CoCoNuT, Ratchet, CURE, Recoder) under perfect FL.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "GLAD fixes 8 faults that were never correctly fixed by any of the 40 APR tools surveyed.",
    375       "evidence": "Table 3 'Ever' column shows 8 faults with 0 prior fixes (Cl-1, Cl-15, Cl-52, L-9, M-28, M-48, Mo-24, Mo-34). Section 6.1.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Each component (LM, grammar, finetuning) contributes independently to GLAD's performance.",
    380       "evidence": "Ablation study in Table 5 (Section 6.3): removing finetuning drops correct fixes from 48 to 39, removing grammar drops to 21, removing LM drops to 1.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Dynamic reranking significantly reduces verification effort, improving worst-case rank from 28,321 to 745.",
    385       "evidence": "Table 6 (Section 6.3) shows MRR improves from 0.138 to 0.449, acc@1 from 5 to 18, and max rank from 28,321 to 745, with only ~2 minutes of debugger time.",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "GLAD generates most plausible patches in under 20 minutes even without exact fault location.",
    390       "evidence": "Table 7 (Section 6.4) shows median time of 9.38 min (perfect FL) and 18.59 min (method-given FL).",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "GLAD generalizes to Defects4J v2.0, correctly fixing 20 bugs.",
    395       "evidence": "Table 4 (Section 6.2) reports 20 correct fixes. However, comparison with TBar/SimFix/Recoder baselines is not directly comparable due to different FL settings.",
    396       "supported": "moderate"
    397     }
    398   ],
    399   "red_flags": [
    400     {
    401       "flag": "No variance or seed sensitivity reporting",
    402       "detail": "All results appear to be from single experimental runs with no error bars, variance across seeds, or repeated trials. Given that LM training and finetuning involve stochastic elements, result stability is unknown."
    403     },
    404     {
    405       "flag": "Unfair v2.0 comparison acknowledged but still presented",
    406       "detail": "Table 4 compares GLAD (perfect FL) against TBar/SimFix/Recoder (GZoltar-based FL) with a note that 'these numbers are not directly comparable,' yet the table layout invites comparison and the text highlights GLAD fixing 'the largest number.'"
    407     },
    408     {
    409       "flag": "Manual correctness inspection without inter-rater agreement",
    410       "detail": "Plausible patches are manually inspected for correctness by the authors, but no inter-rater agreement metric, independent evaluator, or clear correctness criteria beyond 'semantically equivalent to developer patch' are provided."
    411     },
    412     {
    413       "flag": "Hyperparameter selection lacks transparency",
    414       "detail": "Key hyperparameters (beam width 10,000, max length 15, timeout 15 min) are described as 'empirically' determined without reporting the search space, method, or validation procedure used to select them."
    415     }
    416   ],
    417   "cited_papers": [
    418     {
    419       "title": "Evaluating Large Language Models Trained on Code",
    420       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    421       "year": 2021,
    422       "arxiv_id": "2107.03374",
    423       "relevance": "Foundational Codex paper on LLM code generation, directly relevant to understanding LM capabilities for code synthesis."
    424     },
    425     {
    426       "title": "Language Models are Few-Shot Learners",
    427       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    428       "year": 2020,
    429       "relevance": "GPT-3 paper establishing pretrained LM few-shot capabilities that underpin language model-based code approaches."
    430     },
    431     {
    432       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    433       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    434       "year": 2021,
    435       "doi": "10.1109/ICSE43902.2021.00107",
    436       "relevance": "State-of-the-art NMT-based APR tool using code-aware translation; key baseline for GLAD."
    437     },
    438     {
    439       "title": "A Syntax-Guided Edit Decoder for Neural Program Repair",
    440       "authors": ["Qihao Zhu", "Zeyu Sun", "Yuan-an Xiao"],
    441       "year": 2021,
    442       "doi": "10.1145/3468264.3468544",
    443       "relevance": "Recoder — best-performing learning-based APR tool at time of writing; key baseline showing limits of edit-based approaches."
    444     },
    445     {
    446       "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair",
    447       "authors": ["Zimin Chen", "Steve Kommrusch", "Michele Tufano"],
    448       "year": 2019,
    449       "relevance": "Seminal NMT-based APR tool using seq2seq architecture; establishes the translation paradigm that GLAD departs from."
    450     },
    451     {
    452       "title": "CoCoNuT: Combining Context-Aware Neural Translation Models Using Ensemble for Program Repair",
    453       "authors": ["Thibaud Lutellier", "Hung Viet Pham", "Lawrence Pang"],
    454       "year": 2020,
    455       "doi": "10.1145/3395363.3397369",
    456       "relevance": "Context-aware NMT ensemble for APR; demonstrates how combining models improves repair, relevant to GLAD's ensemble potential."
    457     },
    458     {
    459       "title": "TBar: Revisiting Template-Based Automated Program Repair",
    460       "authors": ["Kui Liu", "Anil Koyuncu", "Dongsun Kim"],
    461       "year": 2019,
    462       "doi": "10.1145/3293882.3330577",
    463       "relevance": "Strongest template-based APR tool collating all prior templates; key baseline demonstrating limits of template approaches on omission faults."
    464     },
    465     {
    466       "title": "DLFix: Context-Based Code Transformation Learning for Automated Program Repair",
    467       "authors": ["Yi Li", "Shaohua Wang", "Tien N. Nguyen"],
    468       "year": 2020,
    469       "doi": "10.1145/3377811.3380345",
    470       "relevance": "Deep learning-based code transformation for APR using context; representative of NMT-based repair techniques."
    471     },
    472     {
    473       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    474       "authors": ["René Just", "Darioush Jalali", "Michael D. Ernst"],
    475       "year": 2014,
    476       "doi": "10.1145/2610384.2628055",
    477       "relevance": "The primary benchmark dataset used for evaluation in this paper and widely used across APR research."
    478     },
    479     {
    480       "title": "On the Efficiency of Test Suite Based Program Repair: A Systematic Assessment of 16 Automated Repair Systems for Java Programs",
    481       "authors": ["Kui Liu", "Shangwen Wang", "Anil Koyuncu"],
    482       "year": 2020,
    483       "doi": "10.1145/3377811.3380338",
    484       "relevance": "Comprehensive assessment of APR tools' efficiency and effectiveness; source of controlled comparison data used in GLAD's community-wide analysis."
    485     },
    486     {
    487       "title": "Improving language understanding by generative pre-training",
    488       "authors": ["Alec Radford", "Karthik Narasimhan", "Tim Salimans"],
    489       "year": 2018,
    490       "relevance": "Foundational work on generative pretraining for language models, the paradigm underlying GLAD's use of LMs for code synthesis."
    491     }
    492   ],
    493   "engagement_factors": {
    494     "practical_relevance": {
    495       "score": 2,
    496       "justification": "GLAD is a working APR tool with public code targeting a specific class of real bugs; a practitioner could integrate it into a repair pipeline."
    497     },
    498     "surprise_contrarian": {
    499       "score": 1,
    500       "justification": "The insight that omission faults are the next frontier of APR and that generative LMs outperform NMT for them is a useful reframing but not deeply contrarian."
    501     },
    502     "fear_safety": {
    503       "score": 0,
    504       "justification": "No AI safety or security concerns raised; this is a constructive tool for fixing bugs."
    505     },
    506     "drama_conflict": {
    507       "score": 0,
    508       "justification": "No controversy or conflict with existing work; GLAD is positioned as complementary to existing tools."
    509     },
    510     "demo_ability": {
    511       "score": 1,
    512       "justification": "Code is available via anonymous repository but requires Java/Defects4J setup; not a simple pip-installable demo."
    513     },
    514     "brand_recognition": {
    515       "score": 0,
    516       "justification": "Authors are from KAIST, a respected research institution but not a high-profile AI lab in the mainstream tech press."
    517     }
    518   }
    519 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs