ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24565B)


      1 {
      2   "paper": {
      3     "title": "Boosting Redundancy-based Automated Program Repair by Fine-grained Pattern Mining",
      4     "authors": [
      5       "Jiajun Jiang",
      6       "Fengjie Li",
      7       "Zijie Zhao",
      8       "Zhirui Ye",
      9       "Mengjiao Liu",
     10       "Bo Wang",
     11       "Hongyu Zhang",
     12       "Junjie Chen"
     13     ],
     14     "year": 2023,
     15     "venue": "arXiv (cs.SE)",
     16     "arxiv_id": "2312.15955"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper states 'We make all our experimental results and implementations publicly available' and provides a Zenodo link (https://zenodo.org/records/14997209) in reference [40]."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses the publicly available Defects4J benchmark (v1.2 and v2.0) and states all experimental results are publicly available via the Zenodo archive [40]."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions 'Ubuntu 18.04, equipped with 128GB RAM and a processor of Intel(R) Xeon(R) E5-2640' in Section IV-B, but does not provide a requirements.txt, Dockerfile, or detailed library/dependency versions needed to recreate the environment."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "While a Zenodo archive is referenced, the paper itself does not include step-by-step reproduction instructions, a README with commands, or a 'Reproducing Results' section."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper reports only point estimates for bug counts and precision percentages (e.g., '75 bugs', '54.3% precision') without confidence intervals or error bars."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper makes comparative claims such as 'REPATT significantly outperforms the baseline approaches' and '97.4% and 23.0% more bugs' but uses no statistical significance tests (no p-values, t-tests, etc.)."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports effect sizes in the form of percentage improvements with baseline context, e.g., '15.6%-51.7% higher patch precision', '97.4% and 23.0% more bugs', and '39 more bugs than the best-performing method (i.e., TBar)'. This provides enough context to judge the magnitude of differences."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper uses Defects4J v1.2 (395 bugs) and v2.0 (440 bugs) as the benchmark without justifying why this sample size is adequate or discussing power considerations."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or other spread measures are reported across experimental runs. The core pattern mining is deterministic given the same input, but the paper does not discuss this, and other components of the pipeline (e.g., fault localization) may be stochastic."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares REPATT against ten state-of-the-art APR approaches: CapGen, SimFix, TransplantFix, TBar, Recoder, SelfAPR, ITER, AlphaRepair, Repilot, and GAMMA (Section IV-A)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baselines include contemporary methods: AlphaRepair (2022), Repilot (2023), GAMMA (2023), SelfAPR (2022), and ITER (2024), which represent the state of the art at the time of submission."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "RQ3 (Section V-C) presents a thorough ablation study examining the contribution of offline token-level pattern mining, online expression-level code search, the skip-fashion mining, and the S-TAC representation (Table III)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper reports both the number of correctly repaired bugs (recall) and patch precision (ratio of correctly repaired bugs to bugs with plausible patches), as stated in Section IV-B."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section IV-B states 'a patch is deemed to be correct iff it is semantically equivalent to the developer patch by manual check. In this process, the first three authors independently conducted the annotation and reached a consensus through discussion.' This constitutes human evaluation of the system's outputs."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The approach uses Defects4J v1.2 and v2.0 as separate evaluation sets. The pattern mining is done per-project (on the faulty program itself, not a training set), so there is no train/test contamination concern in the traditional sense. Defects4J v2.0 serves as additional evaluation data."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table I provides per-project breakdowns of bugs repaired for all 18 projects in Defects4J v1.2 and v2.0, rather than just aggregate numbers."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section V-A1 discusses incorrect patches: 'When analyzing the incorrect patches generated by REPATT, we found that most of them resulted from the identification of non-representative patterns, which led to incorrect modification locations or inappropriate fixes.' The ablation in RQ3 also shows where components fail (e.g., No S-TAC producing 0 correct patches)."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper honestly reports that REPATT 'did not outperform all baselines considering all comparison aspects' (Section V-A1) and repairs fewer bugs than several baselines (75 vs 121 for GAMMA). The No S-TAC ablation shows 0 correct patches, a clear negative result."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims about 9 unique bugs vs LLM-based methods, 19 unique bugs vs traditional methods, 83.8% patch precision, and 124 bugs with COMBINE are all supported by results in Tables I, II, and Figure 5."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper makes causal claims through ablation studies (RQ3): removing skip-fashion mining causes 18 fewer bugs repaired, removing S-TAC causes 0 correct patches. These are controlled single-variable manipulations that adequately support the causal claims about component contributions."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper bounds its claims to the Defects4J benchmark and explicitly discusses generalization limitations in the threats to validity: 'the performance of REPATT may not be generalized to other datasets' (Section VI). The title and claims are scoped to 'redundancy-based automated program repair' rather than making sweeping general claims."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The threats-to-validity section (Section VI) discusses methodological threats (manual verification, benchmark selection, reuse of baseline results) but does not discuss specific alternative explanations for the observed results, such as whether improvements might be due to the specific characteristics of Defects4J bugs rather than the approach's general effectiveness."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": false,
    140         "answer": false,
    141         "justification": "REPATT is not an LLM-based approach and does not use pre-trained language models. It is a traditional pattern-mining-based APR technique. The LLM-based baselines are compared using their published results."
    142       },
    143       "prompts_provided": {
    144         "applies": false,
    145         "answer": false,
    146         "justification": "The paper does not use prompting. REPATT is a traditional program analysis approach based on pattern mining, not an LLM-based method."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Key hyperparameters are reported: MAX_LEN and MAX_SKIP for pattern mining, MIN_SUPPORT threshold (default 3, with sensitivity analysis in RQ4), maximum patches generated (200 token-level, 1000 expression-level), and the 5-hour time budget for repair (Section IV-B, V-D)."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "The paper does not use agentic scaffolding. REPATT is a traditional automated program repair tool."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section II-A documents the data preparation: filtering bugs whose patches involve more than one Java file, resulting in 564 bugs from Defects4J and 609 bugs from Bugs.jar across 24 projects. The token extraction process is described in Section III-A."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section VI (Discussion) contains a 'Limitation' subsection and both 'Internal threats to validity' and 'External threats to validity' subsections with substantive discussion."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "The threats are specific to this study: (1) manual verification of patches by three authors with consensus discussion, (2) reuse of baseline results from published papers without reproduction in a unified environment, (3) evaluation limited to Defects4J v1.2 and v2.0."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section VI states specific scope boundaries: 'For complex bugs that require multi-line changes, the reference code may not exist, where our approach will be less effective' and 'the performance of REPATT may not be generalized to other datasets.' The paper also explicitly notes the limitation to Java programs and single-file bugs."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper states 'We have also made all generated patches publicly available for further inspection and validation' via the Zenodo archive [40], and uses the publicly available Defects4J benchmark."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section II-A describes the data collection: using Defects4J and Bugs.jar benchmarks, filtering for single-file bugs, resulting in 564 and 609 bugs respectively. Section IV-A describes the baseline selection and result collection process."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants are involved. The study uses standard public benchmarks (Defects4J)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is documented: faulty program input -> token extraction and embedding -> offline pattern mining -> online code search -> S-TAC transformation -> patch generation -> ranking -> test validation -> manual correctness check (Sections III-IV). Filtering criteria for the preliminary study are also documented."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "The Acknowledgment section discloses funding: 'This work was supported by the National Key Research and Development Program of China (Grant No. 2024YFB4506300), and the National Natural Science Foundation of China (Grant Nos. 62202324, No. 62322208 and No. 62202040).'"
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All author affiliations are listed: Tianjin University, University of Pennsylvania, Westlake University, Beijing Jiaotong University, and Chongqing University. None of the authors are affiliated with companies whose products are being evaluated."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "The funding comes from the Chinese national science foundations (NSFC and National Key R&D Program), which are independent government funding agencies with no financial stake in the outcome of this specific APR tool evaluation."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "The paper contains no competing-interests or financial-interests statement. Absence of disclosure is not the same as absence of conflict."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "REPATT is not a pre-trained model. It is a traditional pattern-mining approach that mines patterns from the faulty program itself at repair time. It does not have a training data cutoff. The LLM baselines are compared using their published results."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "REPATT does not have a pre-trained model with training data that could overlap with the test benchmark. The pattern mining is done per-project on the faulty program itself, which is the intended use case."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Benchmark contamination is not applicable to REPATT since it does not use pre-trained models. However, the paper does note that LLM-based baselines may face 'data leakage' issues (reference [81] in Section VII)."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants are involved in this study. The manual patch verification by authors is part of the evaluation methodology, not a human subjects study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants are involved."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants are involved."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants are involved."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants are involved."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants are involved."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants are involved."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The paper mentions 'the construction is actually efficient and on average took about three minutes' for offline pattern mining, and references a 5-hour time budget, but does not report the total wall-clock time per bug, cost per patch, or total computational cost of running REPATT across all bugs."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "The hardware is mentioned (Ubuntu 18.04, 128GB RAM, Intel Xeon E5-2640) but the total computational budget (total CPU hours, total time for all experiments) is not quantified."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "REPATT repairs 75 bugs on Defects4J with perfect fault localization, complementing existing approaches by repairing 9 unique bugs vs LLM-based/deep learning methods and 19 unique bugs vs traditional methods.",
    295       "evidence": "Table I shows 75 correctly repaired bugs out of 138 plausible. Figure 5 shows Venn diagrams of unique bug repairs: 19 unique vs traditional (Fig 5a), 9 unique vs LLM/DL methods (Fig 5b), and 5 unique vs all baselines (Fig 5c).",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "REPATT achieves 83.8% patch precision under automated fault localization (SBFL), significantly outperforming baselines by 15.6%-51.7%.",
    300       "evidence": "Table II shows REPATT at 31/37 correct/plausible = 83.8% precision. The next best is ITER at 68.2%. Other baselines range from 32.1% to 50.0%. The stated improvement range of 15.6%-51.7% matches these numbers as absolute differences in percentage points (83.8 - 68.2 = 15.6; 83.8 - 32.1 = 51.7).",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "The combined method (COMBINE) repairs 124 bugs at Top-1, 39 more than the best-performing individual method (TBar).",
    305       "evidence": "Table I shows COMBINE at 124/280 correct/plausible, while TBar is at 85/180. 124-85=39 more bugs. This is confirmed in Section V-B.",
    306       "supported": "strong"
    307     },
    308     {
    309       "claim": "89.3% of reusable code elements contain fewer than three tokens.",
    310       "evidence": "Preliminary study in Section II-A with Figure 1 showing token length distributions across 564 bugs from Defects4J and 609 bugs from Bugs.jar.",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "Both the skip-fashion pattern mining and S-TAC representation are effective components.",
    315       "evidence": "Table III (Section V-C) shows removing skip reduces correct fixes from 42 to 24 and precision from 65.6% to 44.4%. Removing S-TAC results in 0 correct fixes and 22 incorrect ones.",
    316       "supported": "strong"
    317     },
    318     {
    319       "claim": "REPATT 'significantly outperforms' baselines under automated fault localization.",
    320       "evidence": "Table II shows higher precision (83.8%) but fewer correct patches (31 vs 70 for Recoder). The claim of 'significantly outperforms' is scoped to precision but the word 'significantly' is used without statistical testing.",
    321       "supported": "moderate"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval"
    326   ],
    327   "key_findings": "REPATT, a redundancy-based automated program repair technique using two-level pattern mining (token-level and expression-level), repairs 75 bugs on Defects4J with 54.3% precision under perfect fault localization, complementing existing APR approaches by fixing 5 unique bugs that no baseline can repair. Under realistic automated fault localization, REPATT achieves 83.8% patch precision, substantially higher than all baselines (by 15.6-51.7 percentage points), though at the cost of repairing fewer total bugs. A combined ranking strategy (COMBINE) that merges REPATT with traditional APR tools repairs 124 bugs at Top-1, improving on the best individual method (TBar, 85 bugs) by 45.9%.",
    328   "red_flags": [
    329     {
    330       "flag": "Reuse of baseline results without reproduction",
    331       "detail": "The paper reuses published baseline results from their respective papers rather than reproducing them in a unified environment. This means differences in hardware, configuration, time budgets, or evaluation methodology could affect comparisons. The authors acknowledge this as a threat to validity."
    332     },
    333     {
    334       "flag": "Use of 'significantly' without statistical tests",
    335       "detail": "The paper uses the term 'significantly outperforms' (Section V-A3) when describing REPATT's precision advantage, but no statistical significance tests are performed. The comparison is based on single-run point estimates."
    336     },
    337     {
    338       "flag": "Recall-precision tradeoff obscured in claims",
    339       "detail": "REPATT repairs substantially fewer bugs than LLM-based methods (75 vs 121 for GAMMA) but achieves higher precision. The abstract and claims emphasize complementarity and precision without equally emphasizing the recall gap, which could give a misleading impression of overall effectiveness."
    340     }
    341   ],
    342   "cited_papers": [
    343     {
    344       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    345       "authors": ["C. S. Xia", "L. Zhang"],
    346       "year": 2022,
    347       "relevance": "AlphaRepair is a key baseline; represents LLM-based zero-shot program repair relevant to AI-assisted code generation."
    348     },
    349     {
    350       "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair",
    351       "authors": ["Y. Wei", "C. S. Xia", "L. Zhang"],
    352       "year": 2023,
    353       "relevance": "Repilot is a baseline that integrates LLMs with completion engines for program repair, directly relevant to LLM-based software engineering."
    354     },
    355     {
    356       "title": "Gamma: Revisiting template-based automated program repair via mask prediction",
    357       "authors": ["Q. Zhang", "C. Fang", "T. Zhang", "B. Yu", "W. Sun", "Z. Chen"],
    358       "year": 2023,
    359       "relevance": "GAMMA uses CodeBERT and UniXcoder for template-based APR, relevant to evaluating LLM-based code repair methods."
    360     },
    361     {
    362       "title": "Impact of code language models on automated program repair",
    363       "authors": ["N. Jiang", "K. Liu", "T. Lutellier", "L. Tan"],
    364       "year": 2023,
    365       "relevance": "Directly evaluates the impact of code language models on program repair, relevant to AI-assisted programming evaluation."
    366     },
    367     {
    368       "title": "Automated program repair in the era of large pre-trained language models",
    369       "authors": ["C. S. Xia", "Y. Wei", "L. Zhang"],
    370       "year": 2023,
    371       "relevance": "Surveys automated program repair using large pre-trained language models, directly relevant to LLM capability evaluation."
    372     },
    373     {
    374       "title": "Breaking the silence: the threats of using LLMs in software engineering",
    375       "authors": ["J. Sallou", "T. Durieux", "A. Panichella"],
    376       "year": 2024,
    377       "relevance": "Discusses data leakage and other threats when using LLMs in software engineering research, relevant to methodology quality."
    378     },
    379     {
    380       "title": "A systematic literature review on large language models for automated program repair",
    381       "authors": ["Q. Zhang", "C. Fang", "Y. Xie", "Y. Ma", "W. Sun", "Y. Yang", "Z. Chen"],
    382       "year": 2024,
    383       "doi": "10.48550/arXiv.2405.01466",
    384       "relevance": "Systematic review of LLMs for APR, relevant to survey methodology and LLM capability assessment."
    385     },
    386     {
    387       "title": "Evaluating the generalizability of LLMs in automated program repair",
    388       "authors": ["F. Li", "J. Jiang", "J. Sun", "H. Zhang"],
    389       "year": 2025,
    390       "relevance": "Evaluates LLM generalizability for program repair, directly relevant to AI capability evaluation methodology."
    391     },
    392     {
    393       "title": "Hybrid automated program repair by combining large language models and program analysis",
    394       "authors": ["F. Li", "J. Jiang", "J. Sun", "H. Zhang"],
    395       "year": 2024,
    396       "relevance": "Combines LLMs with traditional program analysis for repair, relevant to hybrid AI-assisted programming approaches."
    397     },
    398     {
    399       "title": "The plastic surgery hypothesis in the era of large language models",
    400       "authors": ["C. S. Xia", "Y. Ding", "L. Zhang"],
    401       "year": 2023,
    402       "relevance": "Revisits the plastic surgery hypothesis (code redundancy for repair) in the context of LLMs, directly relevant to evaluating LLM-based code generation."
    403     }
    404   ]
    405 }

Impressum · Datenschutz