scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (26175B)
      1 {
      2   "paper": {
      3     "title": "Enhancing Automated Program Repair through Fine-tuning and Prompt Engineering",
      4     "authors": [
      5       "Rishov Paul",
      6       "Md. Mohib Hossain",
      7       "Mohammed Latif Siddiq",
      8       "Masum Hasan",
      9       "Anindya Iqbal",
     10       "Joanna C. S. Santos"
     11     ],
     12     "year": 2023,
     13     "arxiv_id": "2304.07840",
     14     "doi": "10.5281/zenodo.8122636"
     15   },
     16   "checklist": {
     17     "artifacts": {
     18       "code_released": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "The paper provides a replication package at https://doi.org/10.5281/zenodo.8122636 (footnote 1), described as containing 'all the scripts used to gather the data and results'."
     22       },
     23       "data_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The datasets used (Tufano et al. and Review4Repair) are from publicly available prior work. The replication package on Zenodo is stated to contain all data and scripts."
     27       },
     28       "environment_specified": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper mentions using an NVIDIA GeForce RTX 2070-8GB GPU and names the models used, but does not provide a requirements.txt, Dockerfile, or detailed environment specification listing library versions."
     32       },
     33       "reproduction_instructions": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "While a replication package is provided on Zenodo, the paper itself does not include step-by-step reproduction instructions, a README description with commands, or a 'Reproducing Results' section."
     37       }
     38     },
     39     "statistical_methodology": {
     40       "confidence_intervals_or_error_bars": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "The paper reports only point estimates (accuracy percentages, BLEU scores) with no confidence intervals, error bars, or uncertainty measures in Tables II and III."
     44       },
     45       "significance_tests": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper claims fine-tuned models 'notably outperformed' the baselines and describes the improvements as being by a 'significant margin', but provides no statistical significance tests (no p-values, t-tests, or other formal tests). Comparisons are based solely on numeric differences."
     49       },
     50       "effect_sizes_reported": {
     51         "applies": true,
     52         "answer": true,
     53         "justification": "The paper reports absolute improvements with baseline context throughout. For example, Table II shows '+5.69' improvement from 19.59% baseline to 25.28%, and similar formats across all comparisons, providing enough context to understand the magnitude of effects."
     54       },
     55       "sample_size_justified": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "For the manual analysis (RQ3), the paper explicitly justifies the sample size: '314 test samples from Tufano et al. and 340 test samples from Review4Repair datasets in order to achieve a 95% confidence interval and 5% error of margin' (Section III-C)."
     59       },
     60       "variance_reported": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No variance, standard deviation, or spread measures are reported across runs. The paper does not indicate whether experiments were run multiple times, and all results appear to be single-run numbers."
     64       }
     65     },
     66     "evaluation_design": {
     67       "baselines_included": {
     68         "applies": true,
     69         "answer": true,
     70         "justification": "The paper compares against baseline models from prior work: R4R CC from Review4Repair and Tufano 2-encoder from Tufano et al. Results are shown in Table II."
     71       },
     72       "baselines_contemporary": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "The baselines (Review4Repair 2022, Tufano et al. 2021) are the original models from the datasets' source papers, representing the best known results on these specific datasets at the time of writing."
     76       },
     77       "ablation_study": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "No ablation study is performed. The paper does not systematically remove or vary components (e.g., removing code review from input, varying prompt components) to measure individual contributions."
     81       },
     82       "multiple_metrics": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper uses multiple evaluation metrics: Top-1/5/10 Accuracy (exact match), BLEU-4, and CodeBLEU, as described in Section III-D and reported in Tables II and III."
     86       },
     87       "human_evaluation": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "RQ3 includes a manual developer analysis where two software developers scored generated repairs on whether they fulfilled the review intention. Results are in Table IV with Cohen's Kappa inter-rater agreement."
     91       },
     92       "held_out_test_set": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Both datasets have explicit train/validation/test splits. The Tufano et al. dataset has 13,756/1,719/1,719 split and Review4Repair has 53,198/2,956/2,955 split (Table I). Results are reported on the test sets."
     96       },
     97       "per_category_breakdown": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Figure 2 provides per-category breakdowns across three fix categories (Insert, Delete, Update) for both datasets and all models, showing important variation in performance across categories."
    101       },
    102       "failure_cases_discussed": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Section III-B5 discusses common failure patterns in LLM outputs (syntax problems, explanations appended, buggy code mixed in, markdown formatting). The discussion section (V-A) discusses data quality issues in Review4Repair that affect results."
    106       },
    107       "negative_results_reported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The paper reports that LLMs are 'still a long way off' for practical APR. RQ3 shows models fulfill only 38-59% of reviews. The abstract and conclusion explicitly state that practical application is not yet feasible. GPT-3.5-Turbo without heuristics had low accuracy (6.9% on Review4Repair)."
    111       }
    112     },
    113     "claims_and_evidence": {
    114       "abstract_claims_supported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "The abstract claims that fine-tuned models 'notably outperformed' baselines (supported by Table II), that LLMs show promise with prompting (supported by Table III), and that practical application 'is still a long way off' (supported by RQ3 manual analysis in Table IV)."
    118       },
    119       "causal_claims_justified": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The paper makes causal claims such as 'the boost in accuracy is due to mostly the learned parameters of the model rather than the architecture itself' (Section VIII) without adequate experimental support. No controlled experiment isolates the contribution of pre-trained parameters vs. architecture. The comparison between PLBART and CodeT5 cannot separate the two factors, because the models differ in both architecture and pre-training."
    123       },
    124       "generalization_bounded": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The threats to validity section (VI) explicitly acknowledges that 'the datasets consisted of only Java codes and respective code reviews in the English language; hence, our focus was confined to a single programming language. As a result, the coverage of our findings is limited.'"
    128       },
    129       "alternative_explanations_discussed": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The paper discusses data leakage as an alternative explanation for LLM performance (Section VI): 'One possible reason might be that these LLMs were also trained with our aforementioned datasets.' It also discusses data quality issues in the Review4Repair dataset as a confound for model performance in Section V-A."
    133       }
    134     },
    135     "setup_transparency": {
    136       "model_versions_specified": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper refers to 'GPT-3.5-Turbo' and 'Code-DaVinci-Edit-001' without specifying exact API versions or snapshot dates. For PLBART and CodeT5, specific model versions are not given beyond referencing the original papers."
    140       },
    141       "prompts_provided": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "Listing 2 provides the full prompt structure for both zero-shot and few-shot scenarios. The system role content is quoted verbatim: 'You are a coding assistant. You generate only the source code.' The user prompt instruction is given: 'Refactor the Buggy Code using the Review without comments.' The Code-DaVinci-Edit-001 instruction is also quoted."
    145       },
    146       "hyperparameters_reported": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section III-A2 reports hyperparameters for fine-tuning (input length 512, target length 200, beam sizes 1/5/10, epochs 11-12 for PLBART, batch size 4, gradient accumulation 8, 45 epochs for CodeT5). Section III-B4 reports LLM parameters: temperature=0, top_p=1, frequency_penalty=0, presence_penalty=0."
    150       },
    151       "scaffolding_described": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "No agentic scaffolding is used. The models are used in a straightforward prompt-response or fine-tuning paradigm without multi-step reasoning, tool use, or feedback loops."
    155       },
    156       "data_preprocessing_documented": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section III-A1 describes dataset preprocessing in detail: concatenation of buggy code with review, special tokens used, token length filtering (removing 57+6 samples exceeding 512 tokens), and reorganization of train/val/test splits with specific percentages and counts."
    160       }
    161     },
    162     "limitations_and_scope": {
    163       "limitations_section_present": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section VI 'Threats to Validity' provides a dedicated discussion of both internal and external validity threats."
    167       },
    168       "threats_to_validity_specific": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "The threats section discusses specific issues: hyperparameter search space limitations, restriction to Java-only datasets, potential data leakage with GPT-3.5-Turbo (training data knowledge cutoff of September 2021 vs. dataset publication dates), and black-box nature preventing verification."
    172       },
    173       "scope_boundaries_stated": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "The paper explicitly states scope boundaries: 'our focus was confined to a single programming language' (Java), datasets from only two sources (Gerrit and GitHub), and the paper acknowledges that 'other datasets of various programming languages might be investigated in future research.'"
    177       }
    178     },
    179     "data_integrity": {
    180       "raw_data_available": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "The replication package is available at Zenodo (https://doi.org/10.5281/zenodo.8122636) and the underlying datasets from prior work (Tufano et al. and Review4Repair) are publicly available."
    184       },
    185       "data_collection_described": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section III-A1 describes data collection: both datasets are from prior published work, consisting of real code reviews collected from Gerrit and GitHub. Dataset sizes and compositions are given in Table I."
    189       },
    190       "recruitment_methods_described": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "For RQ3's manual analysis, the two developers who performed the evaluation are described minimally ('one year of industry experience in a Fortune 500 company') but how they were recruited is not described. The selection process for these specific evaluators is not discussed."
    194       },
    195       "data_pipeline_documented": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "The data pipeline is documented in Section III-A1: from original datasets to preprocessing (concatenation, token filtering, train/val/test splits) with specific counts at each stage (e.g., 56,211 samples, removing 57+6 exceeding 512 tokens, final 53,198/2,956/2,955 split)."
    199       }
    200     },
    201     "conflicts_of_interest": {
    202       "funding_disclosed": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper."
    206       },
    207       "affiliations_disclosed": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "Author affiliations are clearly stated: BUET (Bangladesh), University of Notre Dame (USA), University of Rochester (USA). The paper evaluates OpenAI models but no authors are affiliated with OpenAI."
    211       },
    212       "funder_independent_of_outcome": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding is disclosed, so independence cannot be verified. The absence of a funding disclosure statement means this criterion cannot be satisfied."
    216       },
    217       "financial_interests_declared": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No competing interests or financial interests statement is included in the paper. Two authors list IQVIA email addresses ({rishov.paul, mdmohib.hossain}@iqvia.com) suggesting industry affiliation, but this potential conflict is not addressed."
    221       }
    222     },
    223     "contamination": {
    224       "training_cutoff_stated": {
    225         "applies": true,
    226         "answer": true,
    227         "justification": "The paper states in Section VI: 'The knowledge cut-off of these two models is September 2021' for GPT-3.5-Turbo and Code-DaVinci-Edit-001."
    228       },
    229       "train_test_overlap_discussed": {
    230         "applies": true,
    231         "answer": true,
    232         "justification": "Section VI explicitly discusses data leakage risk: 'One possible reason might be that these LLMs were also trained with our aforementioned datasets. As a result, there might be a data leakage.' It notes the Tufano et al. dataset was published before the cutoff and Review4Repair after it."
    233       },
    234       "benchmark_contamination_addressed": {
    235         "applies": true,
    236         "answer": true,
    237         "justification": "The paper addresses this in Section VI: the Tufano et al. dataset (2021) was published before the September 2021 training cutoff, while Review4Repair (2022) was published after. The paper acknowledges: 'As these models are black-box, there is no way we can verify if there is data leakage for these datasets.'"
    238       }
    239     },
    240     "human_studies": {
    241       "pre_registered": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "The study includes human evaluation (RQ3 with two developers scoring 654 samples) but no pre-registration is mentioned."
    245       },
    246       "irb_or_ethics_approval": {
    247         "applies": true,
    248         "answer": false,
    249         "justification": "The manual analysis involves two human evaluators rating code repairs, but no IRB or ethics approval is mentioned."
    250       },
    251       "demographics_reported": {
    252         "applies": true,
    253         "answer": true,
    254         "justification": "Some demographics are reported for the two evaluators: 'one year of industry experience in a Fortune 500 company and significant involvement in the code review process in software development' (Section III-C)."
    255       },
    256       "inclusion_exclusion_criteria": {
    257         "applies": true,
    258         "answer": false,
    259         "justification": "No inclusion or exclusion criteria are stated for selecting the two developers. The paper does not explain why these specific evaluators were chosen or what qualifications were required."
    260       },
    261       "randomization_described": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "This is not an experimental study with treatment and control conditions requiring randomization. The manual analysis has all evaluators scoring the same samples."
    265       },
    266       "blinding_described": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "The paper does not describe whether evaluators knew which model produced each repair. Given they scored outputs from five named models, it appears they were not blinded, which could bias judgments."
    270       },
    271       "attrition_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "No information is given about whether both raters completed all 654 evaluations or if any were dropped. The paper does not discuss attrition or incomplete evaluations."
    275       }
    276     },
    277     "cost_and_practicality": {
    278       "inference_cost_reported": {
    279         "applies": true,
    280         "answer": false,
    281         "justification": "The paper uses OpenAI API models (GPT-3.5-Turbo, Code-DaVinci-Edit-001) across thousands of test samples but does not report API costs, tokens consumed, or inference time."
    282       },
    283       "compute_budget_stated": {
    284         "applies": true,
    285         "answer": false,
    286         "justification": "While the paper mentions using an NVIDIA GeForce RTX 2070-8GB GPU, it does not report total training time, GPU hours, or API spend for the experiments."
    287       }
    288     }
    289   },
    290   "claims": [
    291     {
    292       "claim": "Fine-tuned PLBART and CodeT5 significantly outperform baseline models on both datasets for code repair using code review",
    293       "evidence": "Table II shows improvements of +5.69 to +25.65 percentage points in Top-1/5/10 accuracy over baselines on both the Review4Repair and Tufano et al. datasets.",
    294       "supported": "moderate"
    295     },
    296     {
    297       "claim": "CodeT5 generally performs better than PLBART for code repair, especially for Insert and Update categories",
    298       "evidence": "Table II and Figure 2 show CodeT5 achieves higher accuracy in most configurations. Per-category breakdowns in Figure 2 confirm CodeT5 superiority in Insert and Update classes.",
    299       "supported": "moderate"
    300     },
    301     {
    302       "claim": "Zero-shot GPT-3.5-Turbo achieves competitive results after applying heuristic post-processing",
    303       "evidence": "Table III shows zero-shot GPT-3.5-Turbo with heuristics achieves 22.06% (Review4Repair) and 31.70% (Tufano et al.) accuracy, absolute improvements of 15.6 and 12.27 percentage points, respectively, over the raw output.",
    304       "supported": "moderate"
    305     },
    306     {
    307       "claim": "Code-DaVinci-Edit-001 achieves state-of-the-art performance on the Tufano et al. dataset with 40.70% accuracy",
    308       "evidence": "Table III shows Code-DaVinci-Edit-001 achieving 40.70% accuracy on Tufano et al. dataset, outperforming all other models including fine-tuned ones.",
    309       "supported": "weak"
    310     },
    311     {
    312       "claim": "Language models still cannot reliably fulfill code review intentions in repairs",
    313       "evidence": "Table IV (RQ3) shows the best model fulfills only 52.65% (Review4Repair) and 58.92% (Tufano et al.) of reviews according to manual developer analysis.",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "The accuracy boost is mostly due to learned parameters rather than architecture",
    318       "evidence": "This claim is made in Section VIII (Conclusion) but no controlled experiment isolating pre-trained weights from architecture is presented.",
    319       "supported": "unsupported"
    320     }
    321   ],
    322   "methodology_tags": [
    323     "benchmark-eval"
    324   ],
    325   "key_findings": "Fine-tuned pre-trained models (PLBART, CodeT5) substantially outperform prior baseline models on two code repair datasets when using code review as input, with CodeT5 achieving the best overall performance. Zero-shot and few-shot prompting with GPT-3.5-Turbo and Code-DaVinci-Edit-001 show competitive results, particularly after heuristic post-processing of outputs. However, manual developer analysis reveals that even the best models fulfill only about 53-59% of code review intentions, indicating practical application of LLMs for automated program repair remains premature.",
    326   "red_flags": [
    327     {
    328       "flag": "No statistical significance tests",
    329       "detail": "The paper uses language like 'notably outperformed' and 'significant margin' to describe performance differences but provides no statistical tests (p-values, confidence intervals, or hypothesis tests). All comparisons are based solely on point estimate differences."
    330     },
    331     {
    332       "flag": "No variance or multiple-run reporting",
    333       "detail": "All results appear to be from single runs. For fine-tuned models with stochastic training, this makes it impossible to know whether observed differences are due to the method or random variation in training."
    334     },
    335     {
    336       "flag": "Potential data leakage for LLM results",
    337       "detail": "The paper acknowledges that the Tufano et al. dataset (published 2021) predates the September 2021 knowledge cutoff of GPT-3.5-Turbo and Code-DaVinci-Edit-001, so it may have been included in their training data. Code-DaVinci-Edit-001 achieves suspiciously high accuracy (40.70%) on this dataset, which may reflect memorization rather than genuine program repair capability."
    338     },
    339     {
    340       "flag": "Unsupported causal claim about learned parameters",
    341       "detail": "The conclusion claims 'this boost in accuracy is due to mostly the learned parameters of the model rather than the architecture itself' without any ablation or controlled experiment to support this attribution."
    342     },
    343     {
    344       "flag": "Only two human evaluators for manual analysis",
    345       "detail": "RQ3 relies on only two developers for manual evaluation. With Cohen's Kappa ranging from 0.51 to 0.68 (moderate to substantial agreement), the inter-rater reliability is not very high, and the small number of evaluators limits the robustness of conclusions from the manual analysis."
    346     },
    347     {
    348       "flag": "Undisclosed industry affiliations",
    349       "detail": "Two authors list IQVIA email addresses, suggesting industry affiliation, but no conflicts of interest statement addresses this. No funding sources are disclosed at all."
    350     }
    351   ],
    352   "cited_papers": [
    353     {
    354       "title": "Review4Repair: Code Review Aided Automatic Program Repairing",
    355       "authors": ["F. Huq", "M. Hasan", "M. M. A. Haque", "S. Mahbub", "A. Iqbal", "T. Ahmed"],
    356       "year": 2022,
    357       "doi": "10.1016/j.infsof.2021.106765",
    358       "relevance": "Introduces a key dataset and baseline for code-review-based automated program repair, directly related to evaluating LLM capabilities for code generation."
    359     },
    360     {
    361       "title": "Towards Automating Code Review Activities",
    362       "authors": ["R. Tufano", "L. Pascarella", "M. Tufano", "D. Poshyvanyk", "G. Bavota"],
    363       "year": 2021,
    364       "relevance": "Establishes transformer-based code repair using code review with one-encoder and two-encoder models, a key baseline for LLM-based program repair evaluation."
    365     },
    366     {
    367       "title": "Evaluating Large Language Models Trained on Code",
    368       "authors": ["M. Chen", "J. Tworek", "H. Jun"],
    369       "year": 2021,
    370       "arxiv_id": "2107.03374",
    371       "relevance": "Introduces Codex and HumanEval benchmark, foundational for evaluating LLM code generation capabilities."
    372     },
    373     {
    374       "title": "Unified Pre-Training for Program Understanding and Generation",
    375       "authors": ["W. Ahmad", "S. Chakraborty", "B. Ray", "K.-W. Chang"],
    376       "year": 2021,
    377       "relevance": "Introduces PLBART, a key pre-trained model for code understanding and generation tasks evaluated in this study."
    378     },
    379     {
    380       "title": "CodeT5: Identifier-Aware Unified Pre-Trained Encoder-Decoder Models for Code Understanding and Generation",
    381       "authors": ["Y. Wang", "W. Wang", "S. Joty", "S. C. Hoi"],
    382       "year": 2021,
    383       "relevance": "Introduces CodeT5 with identifier-aware pre-training, the best-performing fine-tuned model in this study."
    384     },
    385     {
    386       "title": "GitHub Copilot AI Pair Programmer: Asset or Liability?",
    387       "authors": ["A. Moradi Dakhel", "V. Majdinasab", "A. Nikanjam", "F. Khomh", "M. C. Desmarais", "Z. M. J. Jiang"],
    388       "year": 2023,
    389       "relevance": "Evaluates AI pair programming tools, directly relevant to understanding LLM code generation capabilities and limitations."
    390     },
    391     {
    392       "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions",
    393       "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"],
    394       "year": 2022,
    395       "relevance": "Evaluates security of AI-generated code, relevant to assessing quality and safety of LLM code generation."
    396     },
    397     {
    398       "title": "Exploring the Effectiveness of Large Language Models in Generating Unit Tests",
    399       "authors": ["M. L. Siddiq", "J. C. S. Santos", "R. H. Tanvir", "N. Ulfat", "F. A. Rifat", "V. C. Lopes"],
    400       "year": 2023,
    401       "relevance": "Evaluates LLMs for test generation using zero-shot prompting, directly relevant to LLM capabilities in software engineering tasks."
    402     },
    403     {
    404       "title": "Can OpenAI's Codex Fix Bugs?: An Evaluation on QuixBugs",
    405       "authors": ["J. A. Prenner", "H. Babii", "R. Robbes"],
    406       "year": 2022,
    407       "relevance": "Evaluates Codex for automated bug fixing, directly relevant to LLM-based program repair."
    408     },
    409     {
    410       "title": "Extracting Training Data from Large Language Models",
    411       "authors": ["N. Carlini", "F. Tramer", "E. Wallace", "M. Jagielski"],
    412       "year": 2020,
    413       "relevance": "Addresses data leakage and memorization in LLMs, relevant to benchmark contamination concerns in LLM evaluations."
    414     },
    415     {
    416       "title": "Towards Generating Functionally Correct Code Edits from Natural Language Issue Descriptions",
    417       "authors": ["S. Fakhoury", "S. Chakraborty", "M. Musuvathi", "S. K. Lahiri"],
    418       "year": 2023,
    419       "relevance": "Explores LLM-based code editing from natural language, closely related to prompt-engineering-based program repair."
    420     }
    421   ]
    422 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs