scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (18404B)
      1 {
      2   "paper": {
      3     "title": "Can We Automatically Fix Bugs by Learning Edit Operations?",
      4     "authors": ["Aidan Connor", "Aaron Harris", "Nathan Cooper", "Denys Poshyvanyk"],
      5     "year": 2022,
      6     "venue": "IEEE/ACM International Conference on Automated Software Engineering (ASE)",
      7     "doi": null
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository provided: https://github.com/WM-SEMERU/hephaestus (Section IX/Conclusions)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The study uses the publicly available Bugs2Fix subset from Microsoft's CodeXGlue project [11], a standard public benchmark."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using an NVIDIA Titan RTX GPU and OpenNMT toolkit but does not provide a requirements.txt, Dockerfile, or detailed dependency list with versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Section IX states 'replication documentation can be found at' the GitHub repository URL."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results are reported as point estimates only (e.g., PPA of 14.7%, 8.3%). No confidence intervals or error bars are provided."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper describes differences as 'significant' or not (e.g., 'no significant difference in PPA') but reports no statistical tests to support these claims."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Raw percentage differences are reported but no formal effect sizes (Cohen's d, etc.) are computed. The differences are described informally (e.g., 'approximately 0.8%')."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "The dataset contains ~58,000 BFPs and 10% is used for testing, but there is no justification for why this size is adequate or any power analysis."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or spread measures are reported. Results appear to be from single training runs with no repeated experiments."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "A control model (direct buggy-to-fixed translation) serves as the baseline, trained with the same parameters and data. This follows the approach of Tufano et al. [17]."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The control baseline replicates the approach from Tufano et al. [17] (2019) and the study also references CURE [7] (2021). These were contemporary at submission time."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The study systematically varies condensing strategy (basic, strict, loose), machine string form (typed vs general), and architecture (LSTM vs GRU), which functions as an ablation across design choices."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Four metrics are used: Perfect Prediction Accuracy, Failed Prediction Rate, Edit Distance Decrease, and Training Accuracy (Section V-A)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of generated fixes was performed. All evaluation is automated (exact match, edit distance)."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Section IV-D states: 'Each model is trained with 80% of their respective dataset and validated with 10%... We then test each Hephaestus model using the remaining 10%.'"
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results are broken down by condensing strategy (basic, strict, loose) and by parameter group (LSTM+General, GRU+General, LSTM+Typed) in Figures 3a-3d."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section VI-E (RQ1) discusses failure cases: 'the main cause of prediction failure was that some generated edit operations modified token indices that were out-of-bounds.'"
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The entire paper reports a negative result: edit operations-based models performed worse than the baseline. The paper honestly concludes 'the introduction of these specific methods... did not provide a benefit.'"
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims that 'all models which learned from edit operations were not as effective at repairing bugs as models which learned from fixed code segments directly,' which is supported by the results in Section VI."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The causal claim is that edit operations introduce more complexity leading to worse performance. The controlled experimental design (same data, same architecture, only varying the output representation) adequately supports this via single-variable manipulation."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section VII (External Validity) explicitly states: 'we can only say that our method is ineffective for Java-based examples' and acknowledges the dataset limitation."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses that higher entropy from the more complex output format is a likely explanation (Section VI-E, RQ1), and the Threats to Validity section discusses construct validity concerns about the edit operation representation choice."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper mentions using OpenNMT and LSTM/GRU architectures but does not specify the exact version of OpenNMT used."
    133       },
    134       "prompts_provided": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "This paper uses NMT models trained from scratch, not prompted language models. Prompting is not applicable."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Key hyperparameters are reported: 2-layer LSTM/GRU, dropout rate 0.2, SGD optimizer with cross-entropy loss, 50,000 training steps, 80/10/10 train/val/test split. Parameters follow Tufano et al. [17]."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a standard NMT training and evaluation pipeline."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Section IV-C describes dataset construction in detail: the CodeXGlue Bugs2Fix subset, abstraction process from Tufano et al. [17], filtering to 1-50 token methods, and the edit operation extraction pipeline."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section VII 'Threats to Validity' provides a dedicated discussion of construct, external, and internal validity threats."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Threats are specific to this study: the edit operation format choice (construct), Java-only dataset (external), and code abstraction limiting which BFPs can be used (internal)."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section VII (External Validity): 'we can only say that our method is ineffective for Java-based examples.' The paper also notes limitation to methods with 1-50 tokens and the abstraction constraint."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The underlying Bugs2Fix/CodeXGlue dataset is publicly available [11], and the code repository is released, enabling verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section IV-C describes using the Bugs2Fix subset from CodeXGlue: ~58,000 Java method BFPs with 1-50 tokens, preformatted per Tufano et al. [17]."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. The data comes from a standard public benchmark dataset."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from raw BFPs to edit operation extraction to dataset construction is documented in Section IV (Levenshtein operations, condensing algorithms, machine string encoding, dataset formatting in Table I)."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section X: 'The authors have been supported in part by the NSF CCF-1955853 and CCF-2007246 grants.'"
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors are affiliated with William and Mary, clearly listed on the first page. No product being evaluated is tied to their affiliation."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "NSF is a government funding agency with no financial stake in whether edit operations work for bug repair."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is present in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper trains NMT models from scratch on the provided dataset. There is no pre-trained model whose training cutoff is relevant."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Models are trained from scratch with explicit 80/10/10 splits. No pre-trained model is evaluated on a benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No pre-trained language model is being evaluated. Models are trained from scratch on the dataset."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No inference cost or latency is reported for the models."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section IV-D: 'The training process took about 5.5 hours for each model using an NVIDIA Titan RTX GPU.' With 12 models total, this gives a clear compute budget."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "All models which learned from edit operations were not as effective at repairing bugs as models which learned from fixed code segments directly.",
    286       "evidence": "Control model PPA of 14.7% vs best experimental model PPA of 8.3% (Figure 3a). The control model also had the better EDD (-1.32 on average, vs -2.54 for basic, -1.57 for strict, and -1.61 for loose) (Figure 3c).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Condensing edit operations into strict and loose forms is beneficial over not condensing them at all.",
    291       "evidence": "EDD values: strict/loose models averaged ~-1.57/-1.61 vs basic models at -2.54 (Section VI-E, RQ2). PPA differences were negligible.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "There is no significant difference in performance between LSTM and GRU architectures for this task.",
    296       "evidence": "PPA variation between LSTM and GRU is approximately 0.8-1% across conditions (Figure 3a, Section VI-E RQ3). No statistical test performed.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "The main cause of prediction failure in edit operation models was out-of-bounds token indices.",
    301       "evidence": "Stated in Section VI-E (RQ1) based on failure analysis, with reference to source documentation.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "Hephaestus, a novel approach using Levenshtein edit operations for NMT-based automated bug repair, performed consistently worse than the standard direct buggy-to-fixed translation approach across all metrics. The best edit operations model achieved only 8.3% perfect prediction accuracy compared to 14.7% for the control. Condensing edit operations (strict/loose) provided marginal improvements over basic operations but still underperformed the baseline. The study is notable for honestly reporting negative results.",
    307   "red_flags": [
    308     {
    309       "flag": "No statistical tests for claimed differences",
    310       "detail": "The paper uses words like 'significant' and 'negligible' to describe differences between models without any statistical testing. All comparisons are based on eyeballing point estimates from single runs."
    311     },
    312     {
    313       "flag": "Single-run experiments",
    314       "detail": "Results appear to be from single training runs with no repeated experiments, making it impossible to assess whether observed differences are due to random variation in training."
    315     }
    316   ],
    317   "cited_papers": [
    318     {
    319       "title": "CODIT: Code Editing with Tree-Based Neural Machine Translation",
    320       "authors": ["Saikat Chakraborty", "Miltiadis Allamanis", "Baishakhi Ray"],
    321       "year": 2019,
    322       "doi": "10.1109/TSE.2020.3020502",
    323       "relevance": "Directly related approach using AST-level edit operations for code transformation via NMT."
    324     },
    325     {
    326       "title": "SEQUENCER: Sequence-to-Sequence Learning for End-to-End Program Repair",
    327       "authors": ["Zimin Chen"],
    328       "year": 2019,
    329       "doi": "10.1109/TSE.2019.2940179",
    330       "relevance": "Seq2seq approach to automated program repair, a key baseline in the APR literature."
    331     },
    332     {
    333       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    334       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    335       "year": 2021,
    336       "doi": "10.1109/icse43902.2021.00107",
    337       "relevance": "State-of-the-art NMT-based program repair with code-aware pre-training, a key contemporary baseline."
    338     },
    339     {
    340       "title": "An Empirical Study on Learning Bug-Fixing Patches in the Wild via Neural Machine Translation",
    341       "authors": ["Michele Tufano"],
    342       "year": 2019,
    343       "doi": "10.1145/3340544",
    344       "relevance": "Foundational NMT-based bug repair work whose methodology and dataset this paper builds on directly."
    345     },
    346     {
    347       "title": "On Learning Meaningful Code Changes via Neural Machine Translation",
    348       "authors": ["Michele Tufano"],
    349       "year": 2019,
    350       "doi": "10.1109/ICSE.2019.00021",
    351       "relevance": "Related work on learning code changes via NMT, relevant to automated program repair methodology."
    352     },
    353     {
    354       "title": "CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation",
    355       "authors": ["Shuai Lu"],
    356       "year": 2021,
    357       "arxiv_id": "2102.04664",
    358       "relevance": "Source of the Bugs2Fix benchmark dataset used in this study, a widely-used code generation benchmark."
    359     },
    360     {
    361       "title": "Learning to Fix Build Errors with Graph2Diff Neural Networks",
    362       "authors": ["Daniel Tarlow"],
    363       "year": 2019,
    364       "arxiv_id": "1911.01205",
    365       "relevance": "Related approach using graph neural networks to predict code diffs for automated repair."
    366     },
    367     {
    368       "title": "Toward Better Evolutionary Program Repair: An Integrated Approach",
    369       "authors": ["Yuan Yuan", "Wolfgang Banzhaf"],
    370       "year": 2020,
    371       "doi": "10.1145/3360004",
    372       "relevance": "Evolutionary approach to automated program repair, representing an alternative paradigm to NMT-based methods."
    373     }
    374   ]
    375 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs