scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (35596B)
      1 {
      2   "paper": {
      3     "title": "T5APR: Empowering Automated Program Repair across Languages through Checkpoint Ensemble",
      4     "authors": [
      5       "Reza Gharibi",
      6       "Mohammad Hadi Sadreddini",
      7       "Seyed Mostafa Fakhrahmad"
      8     ],
      9     "year": 2023,
     10     "venue": "Journal of Systems and Software",
     11     "arxiv_id": "2309.15742",
     12     "doi": "10.1016/j.jss.2024.112083"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "methodology_tags": [
     20     "benchmark-eval"
     21   ],
     22   "key_findings": "T5APR, a multilingual neural program repair approach based on fine-tuned CodeT5-small (60M parameters) with checkpoint ensemble, fixes 1,985 bugs across six benchmarks in Java, Python, C, and JavaScript, including 1,442 bugs not fixed by any compared tool. The checkpoint ensemble strategy improves over individual checkpoints, and the multilingual model outperforms monolingual models on most benchmarks, demonstrating cross-language transfer learning. However, the majority of unique fixes (1,390/1,442) come from the Codeflaws benchmark where only one other tool was compared.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper provides a GitHub repository: https://github.com/h4iku/T5APR. Section 4.6 states 'we make all generated patches and our source code publicly available for verification and review.' Results are also released at https://github.com/h4iku/T5APR/tree/main/results."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The training data is the publicly available CoCoNuT dataset (https://github.com/lin-tan/CoCoNut-Artifact). All evaluation benchmarks (Defects4J, Bears, QuixBugs, Codeflaws, ManyBugs, BugAID) are publicly available. Generated patches are released on GitHub."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "The paper specifies hardware (NVIDIA T4 GPU, Intel Xeon CPU) and mentions Hugging Face Transformers, PyTorch, and Datasets libraries, but does not provide library versions, a requirements.txt, Dockerfile, or detailed environment specification sufficient to recreate the setup."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper describes the methodology in detail (Sections 2-3) and releases code, but does not include step-by-step reproduction instructions within the paper. There is no 'Reproducing Results' section or mention of scripts to replicate experiments."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "All results are reported as point estimates (e.g., 67 correct, 1,985 total in Table 3). No confidence intervals or error bars are provided anywhere in the paper."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper claims T5APR 'outperforms' or 'achieves competitive results' compared to other tools (Section 4.1) based solely on comparing raw counts. No statistical significance tests (p-values, t-tests, etc.) are used for any comparison."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Tables 3, 8, 9, and 11 provide raw counts with full baselines, allowing readers to assess effect magnitude. For example, Table 3 shows T5APR fixes 67 vs KNOD's 71 on Defects4J v1.2, and 56 vs KNOD's 50 on v2.0. RQ2 reports 17.5% improvement from multiple plausible patches (1,965 to 2,309)."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No justification is given for why these particular benchmarks or sample sizes were chosen beyond stating they are standard. No power analysis is discussed. The choice of 5,000 validation instances for hyperparameter tuning (Section 3.3) is also unjustified."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "All results appear to be from a single experimental run. No standard deviation, variance, or any spread measure is reported. No mention of multiple runs or seeds."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "T5APR is compared against 11 state-of-the-art APR tools including SequenceR, TBar, DLFix, CoCoNuT, CURE, Recoder, RewardRepair, KNOD, SOSRepair, Codex, and ChatGPT (Table 3, Section 3.5)."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Baselines include recent tools: KNOD (2023), ChatGPT (2023), RewardRepair (2022), Codex (2022), CURE (2021), Recoder (2021). These represent the state of the art at time of submission."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "RQ3 (Section 4.3) provides an ablation study on checkpoint ensemble, showing individual checkpoint performance (Table 9), checkpoint contributions (Table 10), and incremental addition effects (Figure 13). RQ4 (Section 4.4) compares multilingual vs monolingual models (Table 11)."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "The paper reports multiple metrics: correct patches, plausible patches, identical patches (Table 3), compilable patch rate (Tables 4-5), patch ranking position (Figure 6), unique bug fixes (Figure 7), and validation time cost (Tables 6-7)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 3.4 describes manual patch correctness assessment: 'one author checked whether the patches were identical to those created by the developer or other existing techniques. For the remaining patches that required semantic equivalence checking, the author consulted with another author in case of uncertainty.'"
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Training data uses temporal cutoff dates preceding benchmark bug dates (Table 1: Java 2006, Python 2010, C 2005, JavaScript 2010). Evaluation benchmarks (Defects4J, Bears, etc.) are independent test sets not used during training. Hyperparameter tuning used a separate 5,000-instance validation split (Section 3.3)."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are broken down by individual benchmark (Table 3), per checkpoint (Tables 9-10), per plausible ranking threshold (Table 8), per benchmark compilation rate (Table 5), and per benchmark validation time (Tables 6-7)."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper notes specific failures: 'T5APR fails to fix the SQRT bug' on QuixBugs Java, and it also fails to fix DEPTH_FIRST_SEARCH on QuixBugs Python. Section 4.4 shows monolingual models outperform multilingual on ManyBugs and BugAID. Figure 13 shows adding checkpoints doesn't help for BugAID."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Several negative findings are reported: KNOD outperforms T5APR on Defects4J v1.2 (71 vs 67, Table 3); monolingual models beat multilingual on ManyBugs (16 vs 15) and BugAID (6 vs 5, Table 11); adding checkpoints doesn't always improve results (BugAID in Figure 13); T5APR has slightly lower compilation rate than RewardRepair (Table 4)."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims '1,985 bugs' fixed and '1,442 bugs that none of the compared techniques has fixed' — both supported by Table 3 and Figure 7. The claim of 'competitiveness against state-of-the-art techniques' is supported by Table 3 showing T5APR outperforms or matches on most benchmarks."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper makes causal claims about checkpoint ensemble ('improves performance') and multilingual training ('enables cross-lingual transfer'). Both are supported by controlled ablations: RQ3 compares individual vs combined checkpoints (Tables 9-10), and RQ4 compares multilingual vs monolingual models trained under the same conditions (Table 11)."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The title 'Empowering Automated Program Repair across Languages' is broader than the 4 languages tested. The abstract states 'potential of T5APR for use in real-world applications' without bounding this claim. While the paper evaluates on 6 benchmarks, the generalization to other languages, bug types, or real-world settings is claimed but not tested."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "Section 4.6 discusses alternative explanations: CodeT5's pre-training data could overlap with benchmarks, potentially inflating results; the manual patch assessment could introduce bias; the beam size advantage/disadvantage relative to competitors is noted. They also acknowledge that 'plausible but incorrect patches' exist due to test suite limitations."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The paper measures correct/plausible patches and claims 'repair effectiveness,' which closely matches the measurement. They clearly distinguish between plausible patches (pass tests) and correct patches (semantically equivalent to developer patches), and discuss the overfitting problem where plausible patches may not truly fix bugs (Section 2.9)."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The paper specifies 'CodeT5-small' with 60M parameters, trained with 'identifier-aware denoising pre-training objective for 100 epochs' (Section 3.3). The tokenizer vocabulary size (32,100) and architecture details are provided."
    152       },
    153       "prompts_provided": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "T5APR is a fine-tuned encoder-decoder model, not a prompted LLM. The input format uses language control codes (Java, Python, C, JavaScript) as prefixes in a structured encoding, not natural language prompts. The full input format is described in Section 2.4 and Figure 4."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.3 reports comprehensive hyperparameters: learning rate 1e-4, batch size 8, 1 training epoch, constant LR scheduler, AdamW optimizer, FP16 mixed precision, beam size 100, 5 checkpoints (one saved at every 20% of the training steps), max input length 512, max output length 256. The hyperparameter search space and method (Optuna) are also described."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "T5APR uses a standard encoder-decoder model with beam search for patch generation. No agentic scaffolding, tool use, retry logic, or feedback mechanisms are involved."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 2.3 documents five preprocessing steps (comment removal, deduplication, identical source/target removal, empty target filtering, source length filtering) with rationale for each. Table 1 provides counts before and after preprocessing (e.g., Java: 3,241,966 → 1,125,599 → 1,009,268). Section 2.4 describes tokenization in detail."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 4.6 'Threats to validity' provides a substantive multi-paragraph discussion covering internal validity (patch assessment, implementation, CodeT5 data overlap) and external validity (benchmark overfitting)."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 4.6 discusses threats specific to this study: potential errors in manual patch correctness assessment, CodeT5's pre-training data potentially overlapping with evaluation benchmarks, implementation and hyperparameter configuration faults, and the 'benchmark overfitting' phenomenon documented by Durieux et al. (2019)."
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 4.6 states: 'our approach might not be generalizable to fixing bugs outside the tested bugs benchmarks.' The paper acknowledges resource limitations prevented using larger CodeT5 variants and notes evaluation on additional benchmarks (Bugs.jar, BugsJS, BugsInPy) is left to future work. Section 6 also outlines limitations in context window selection and multi-hunk handling."
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "All generated patches are publicly available at https://github.com/h4iku/T5APR/tree/main/results. Training data comes from the publicly available CoCoNuT artifact. All evaluation benchmarks are public."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 2.2 describes training data extraction from CoCoNuT's dataset, which uses keyword-based heuristics on commit messages to identify bug-fixing commits. Section 3.2 describes each benchmark's origin and characteristics. Table 1 provides detailed statistics including cutoff years, project counts, and instance counts."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": false,
    204         "answer": false,
    205         "justification": "No human participants in this study. Data comes from standard publicly available bug benchmarks and open-source repository mining."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The full pipeline is documented: raw commits → keyword filtering (CoCoNuT) → comment removal → deduplication → identical source/target removal → empty target filtering → size filtering → tokenization. Table 1 shows counts at key stages (instances before/after preprocessing/size filtering). Section 2.4 details the tokenization pipeline."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding information or acknowledgments section is present in the paper. The authors are from Shiraz University but no grants or funding sources are mentioned."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "All authors are identified as affiliated with the Department of Computer Science and Engineering and IT, School of Electrical and Computer Engineering, Shiraz University, Iran. They are not evaluating a commercial product they are affiliated with."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "Since no funding is disclosed, independence of funding cannot be assessed. The absence of funding disclosure makes it impossible to determine whether any funder has a stake in the results."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement is present in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The fine-tuning data cutoff years are stated in Table 1 (Java 2006, Python 2010, C 2005, JavaScript 2010). However, the pre-training data cutoff for CodeT5 (the base model) is not stated. Since CodeT5 was pre-trained on CodeSearchNet and BigQuery data that could include benchmark code, this is a significant omission noted in Section 4.6."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": true,
    243         "answer": true,
    244         "justification": "Section 4.6 explicitly discusses this: 'using CodeT5 as our base model, which is trained on large amounts of open-source code snippets. This means that its training data could overlap with our evaluation benchmarks.' They also note fine-tuning data uses temporal cutoffs to avoid overlap with benchmarks."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "Section 4.6 acknowledges contamination risk from CodeT5's pre-training but provides only qualitative arguments ('overlapping data would be a very small fraction,' 'both correct and incorrect versions would be present'). No concrete contamination analysis (canary strings, membership inference, n-gram overlap) is performed."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study. This is a benchmark evaluation of an automated program repair tool."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants in this study."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Tables 6 and 7 report detailed validation time statistics per benchmark (min, max, median, mean) for both plausible and correct patches. The overall median validation time to reach a plausible patch is 6.182 seconds. Total validation took 27 days across 1,172,267 patches."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": true,
    298         "justification": "Section 3.3 specifies training hardware (4-core Intel Xeon, 16GB RAM, NVIDIA T4 16GB) and evaluation hardware (6-core i7-8750H, 16GB RAM, GTX 1060 6GB). Training the multilingual model for one epoch took about 17 hours. Total validation took about 27 days."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Section 4.6 mentions 'we used fixed manual seed values wherever possible' but does not report results across different seeds."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper does not state how many experimental runs produced the results. All evidence suggests a single training run, but this is not explicitly stated."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "Section 3.3 describes using the Optuna framework for hyperparameter search, specifying the search space (learning rate 1e-5 to 1e-3, epochs 1-5, batch size 4-16, beam size 5, four scheduler types). The objective metric (exact match × 100 + BLEU) and validation set (5,000 instances) are specified, though the total number of configurations tried is not stated."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "Section 3.3 describes hyperparameter selection using Optuna optimization on a separate validation set of 5,000 Python instances, with a defined objective metric (exact match + BLEU). Selection was performed on validation data, not test data."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The paper compares T5APR against 11 tools across 8 benchmark configurations (six benchmarks, with Defects4J split into v1.2 and v2.0 and QuixBugs split into Java and Python), making many implicit comparative claims of superiority. No statistical tests are performed at all, let alone corrections for multiple comparisons."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper compares its results against numbers from other tools' papers/repositories without acknowledging the general bias of authors evaluating their own system. While they note beam size differences (100 vs 1000 for competitors), they do not discuss whether experimental conditions systematically favor or disfavor any approach."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper notes that competitors use beam size 1000 vs T5APR's 100 and that 'larger beam size leads to more correct patches,' but does not systematically compare performance at matched compute budgets. The 10x beam size difference is acknowledged but not normalized."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper uses six standard benchmarks without discussing whether they measure real-world bug-fixing ability. Section 4.6 mentions 'benchmark overfitting' as an external validity threat but does not analyze whether the benchmarks are valid constructs for measuring program repair capability."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": false,
    344         "answer": false,
    345         "justification": "T5APR is a direct model evaluation without agentic scaffolding. All compared approaches are also evaluated at the model level without scaffold confounds."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "Table 1 shows training data cutoff years (Java 2006, Python 2010, C 2005, JavaScript 2010) that predate benchmark bug dates. Section 3.2 states: 'CoCoNuT finds the date of the earliest bug in each evaluation benchmark and collects commits that were made before that date, and discards instances committed after that to avoid overlapping train and evaluation data.'"
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "The paper uses perfect fault localization (exact bug location provided), which gives the model information not available in real-world usage. While they argue this is standard practice for fair comparison (Section 3.5), they do not discuss how this could inflate results compared to realistic deployment conditions."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "The paper does not discuss whether training projects overlap with benchmark projects. While temporal splits are applied, no analysis checks whether the same repositories, authors, or code patterns appear in both training and test data."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": true,
    367         "justification": "Temporal splits are used as a concrete prevention method: training data is collected only up to the cutoff date before benchmark bugs, preventing the model from seeing fixes to benchmark bugs during fine-tuning. However, no detection method is applied for CodeT5's pre-training data."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "T5APR correctly fixes 1,985 bugs across six benchmarks in four programming languages, including 1,442 bugs that none of the compared techniques fixed.",
    374       "evidence": "Table 3 shows per-benchmark results summing to 1,985 correct patches. Figure 7 shows Venn diagrams of unique and overlapping fixes, with 1,442 unique to T5APR (1,390 from Codeflaws alone).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Checkpoint ensemble improves T5APR's performance over individual checkpoints.",
    379       "evidence": "Table 9 shows individual checkpoint results (1,474-1,565 total) vs combined 1,985. Table 10 shows contributions from each checkpoint. Figure 13 demonstrates incremental improvement from adding checkpoints for most benchmarks (Section 4.3).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "T5APR's multilingual model outperforms monolingual models for most benchmarks.",
    384       "evidence": "Table 11 shows multilingual (1,985 total) vs monolingual (1,679 total). Multilingual wins on Defects4J v1.2 (67 vs 55), v2.0 (56 vs 52), Bears (24 vs 21), QuixBugs Java (25 vs 21), QuixBugs Python (29 vs 26), and Codeflaws (1,764 vs 1,482). Monolingual wins on ManyBugs (16 vs 15) and BugAID (6 vs 5).",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "T5APR achieves competitive or superior performance compared to 11 state-of-the-art APR tools.",
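    389       "evidence": "Table 3 shows T5APR outperforms or ties all tools on 6 of the 8 benchmark configurations (six benchmarks, with Defects4J split into v1.2/v2.0 and QuixBugs into Java/Python). On Defects4J v1.2, KNOD fixes 71 vs T5APR's 67, with KNOD using a 10x larger beam size (1000 vs 100). On ManyBugs, SOSRepair fixes 16 vs T5APR's 15.",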
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Considering multiple plausible patches improves repair effectiveness by 17.5%.",
    394       "evidence": "Table 8 shows first plausible yields 1,965 correct, while all plausible patches yields 2,309, an increase of 344 bugs. Most correct patches are within the top-5 plausible patches (Section 4.2).",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "T5APR ranks correct patches highly, with 310 of them ranked first in the candidate patch list.",
    399       "evidence": "Figure 6 shows T5APR outperforms all other approaches in correct patch ranking on Defects4J v1.2, v2.0, and QuixBugs Java up to top-500 (Section 4.1).",
    400       "supported": "strong"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No statistical testing",
    406       "detail": "All comparative claims ('outperforms', 'competitive') are based on raw count comparisons without any statistical significance tests. With single-run results and no variance reporting, it is impossible to assess whether observed differences are meaningful or within noise."
    407     },
    408     {
    409       "flag": "Inflated unique bug count",
    410       "detail": "The headline claim of '1,442 unique bugs' is dominated by 1,390 bugs from Codeflaws alone, where only one other tool (CoCoNuT) was compared. This makes the unique fix count appear more impressive than the actual cross-benchmark complementarity suggests."
    411     },
    412     {
    413       "flag": "Uncontrolled beam size difference",
    414       "detail": "T5APR uses beam size 100 while key competitors (CoCoNuT, CURE, KNOD) use beam size 1000. The paper acknowledges 'larger beam size leads to more correct patches' but does not control for this confound. This makes comparisons unfair in both directions: T5APR is disadvantaged on raw counts but may benefit from appearing efficient."
    415     },
    416     {
    417       "flag": "Single run, no variance",
    418       "detail": "Despite mentioning 'fixed manual seed values,' only single-run results are presented. No standard deviation, confidence intervals, or multi-seed analysis is provided, making it impossible to assess result stability."
    419     },
    420     {
    421       "flag": "CodeT5 pre-training contamination unaddressed",
    422       "detail": "CodeT5 was pre-trained on large-scale open-source code that could include benchmark programs. The paper acknowledges this risk in Section 4.6 but only provides qualitative mitigation arguments, with no concrete contamination detection or analysis."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "CoCoNuT: Combining context-aware neural translation models using ensemble for program repair",
    428       "authors": [
    429         "Thibaud Lutellier",
    430         "Hung Viet Pham",
    431         "Lawrence Pang",
    432         "Yitong Li",
    433         "Moshi Wei",
    434         "Lin Tan"
    435       ],
    436       "year": 2020,
    437       "doi": "10.1145/3395363.3397369",
    438       "relevance": "Multilingual APR approach training separate models per language; provides the training dataset used by T5APR and is the primary cross-language baseline."
    439     },
    440     {
    441       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    442       "authors": [
    443         "Nan Jiang",
    444         "Thibaud Lutellier",
    445         "Lin Tan"
    446       ],
    447       "year": 2021,
    448       "doi": "10.1109/ICSE43902.2021.00107",
    449       "relevance": "Neural program repair using code-aware beam search and model ensemble, demonstrating effectiveness of subword tokenization and pre-trained language models for APR."
    450     },
    451     {
    452       "title": "KNOD: Domain Knowledge Distilled Tree Decoder for Automated Program Repair",
    453       "authors": [
    454         "Nan Jiang",
    455         "Thibaud Lutellier",
    456         "Yiling Lou",
    457         "Lin Tan",
    458         "Dan Goldwasser",
    459         "Xiangyu Zhang"
    460       ],
    461       "year": 2023,
    462       "doi": "10.1109/ICSE48619.2023.00111",
    463       "relevance": "State-of-the-art neural APR using tree decoder and domain-rule distillation, the strongest Java baseline compared against T5APR."
    464     },
    465     {
    466       "title": "Neural program repair with execution-based backpropagation",
    467       "authors": [
    468         "He Ye",
    469         "Matias Martinez",
    470         "Martin Monperrus"
    471       ],
    472       "year": 2022,
    473       "doi": "10.1145/3510003.3510222",
    474       "relevance": "RewardRepair incorporates compilation and test execution feedback into the training objective, achieving high compilable patch rates as a key baseline."
    475     },
    476     {
    477       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation",
    478       "authors": [
    479         "Yue Wang",
    480         "Weishi Wang",
    481         "Shafiq Joty",
    482         "Steven C.H. Hoi"
    483       ],
    484       "year": 2021,
    485       "doi": "10.18653/v1/2021.emnlp-main.685",
    486       "relevance": "The pre-trained code model that serves as T5APR's foundation, demonstrating multilingual code understanding capabilities."
    487     },
    488     {
    489       "title": "CIRCLE: Continual repair across programming languages",
    490       "authors": [
    491         "Wei Yuan",
    492         "Quanjun Zhang",
    493         "Tieke He",
    494         "Chunrong Fang",
    495         "Nguyen Quoc Viet Hung",
    496         "Xiaodong Hao",
    497         "Hongzhi Yin"
    498       ],
    499       "year": 2022,
    500       "doi": "10.1145/3533767.3534219",
    501       "relevance": "Multilingual APR using continual learning with T5, addressing catastrophic forgetting — the closest alternative multilingual approach to T5APR."
    502     },
    503     {
    504       "title": "SequenceR: Sequence-to-Sequence Learning for End-to-End Program Repair",
    505       "authors": [
    506         "Zimin Chen",
    507         "Steve Kommrusch",
    508         "Michele Tufano",
    509         "Louis-Noël Pouchet",
    510         "Denys Poshyvanyk",
    511         "Martin Monperrus"
    512       ],
    513       "year": 2019,
    514       "doi": "10.1109/TSE.2019.2940179",
    515       "relevance": "Foundational NMT-based APR approach treating bug fixing as translation from buggy to fixed code."
    516     },
    517     {
    518       "title": "Recoder: A syntax-guided edit decoder for neural program repair",
    519       "authors": [
    520         "Qihao Zhu",
    521         "Zeyu Sun",
    522         "Yuan-an Xiao",
    523         "Wenjie Zhang",
    524         "Kang Yuan",
    525         "Yingfei Xiong",
    526         "Lu Zhang"
    527       ],
    528       "year": 2021,
    529       "doi": "10.1145/3468264.3468544",
    530       "relevance": "Neural APR using syntax-guided edit decoder with provider/decider architecture, representing the edit-based approach to program repair."
    531     },
    532     {
    533       "title": "Can OpenAI's codex fix bugs? an evaluation on QuixBugs",
    534       "authors": [
    535         "Julian Aron Prenner",
    536         "Hlib Babii",
    537         "Romain Robbes"
    538       ],
    539       "year": 2022,
    540       "doi": "10.1145/3524459.3527351",
    541       "relevance": "Evaluation of large language models (Codex) for APR, representing the LLM-based approach to automated bug fixing."
    542     },
    543     {
    544       "title": "An Analysis of the Automatic Bug Fixing Performance of ChatGPT",
    545       "authors": [
    546         "Dominik Sobania",
    547         "Martin Briesch",
    548         "Carol Hanna",
    549         "Justyna Petke"
    550       ],
    551       "year": 2023,
    552       "doi": "10.1109/APR59189.2023.00012",
    553       "relevance": "Evaluation of ChatGPT for APR on QuixBugs, comparing LLM-based repair against traditional APR approaches."
    554     },
    555     {
    556       "title": "An Empirical Study on Learning Bug-Fixing Patches in the Wild via Neural Machine Translation",
    557       "authors": [
    558         "Michele Tufano",
    559         "Cody Watson",
    560         "Gabriele Bavota",
    561         "Massimiliano Di Penta",
    562         "Martin White",
    563         "Denys Poshyvanyk"
    564       ],
    565       "year": 2019,
    566       "doi": "10.1145/3340544",
    567       "relevance": "Foundational empirical study on NMT for learning bug fixes from commit history, establishing the code-as-translation paradigm for APR."
    568     },
    569     {
    570       "title": "A Survey of Learning-based Automated Program Repair",
    571       "authors": [
    572         "Quanjun Zhang",
    573         "Chunrong Fang",
    574         "Yuxiang Ma",
    575         "Weisong Sun",
    576         "Zhenyu Chen"
    577       ],
    578       "year": 2023,
    579       "doi": "10.1145/3631974",
    580       "relevance": "Comprehensive survey of learning-based APR techniques, providing taxonomy and analysis of the field T5APR contributes to."
    581     }
    582   ],
    583   "engagement_factors": {
    584     "practical_relevance": {
    585       "score": 2,
    586       "justification": "Open-source multilingual APR tool with a GitHub repo that practitioners could potentially integrate into CI pipelines, though it requires ML infrastructure to fine-tune and run."
    587     },
    588     "surprise_contrarian": {
    589       "score": 1,
    590       "justification": "Shows that a small 60M-parameter model can compete with much larger systems, which is somewhat surprising, but the results are incremental improvements rather than paradigm shifts."
    591     },
    592     "fear_safety": {
    593       "score": 0,
    594       "justification": "No AI safety or security concerns raised; this is about automated bug fixing, not attacks or risks."
    595     },
    596     "drama_conflict": {
    597       "score": 0,
    598       "justification": "No controversy or dramatic claims; straightforward benchmark comparison paper."
    599     },
    600     "demo_ability": {
    601       "score": 2,
    602       "justification": "Source code available on GitHub (https://github.com/h4iku/T5APR), but requires model training or checkpoint access to use."
    603     },
    604     "brand_recognition": {
    605       "score": 0,
    606       "justification": "Academic paper from Shiraz University with no famous lab affiliation or well-known product connection."
    607     }
    608   }
    609 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs