scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24240B)
      1 {
      2   "paper": {
      3     "title": "BugsPHP: A dataset for Automated Program Repair in PHP",
      4     "authors": [
      5       "K.D. Pramod",
      6       "W.T.N. De Silva",
      7       "W.U.K. Thabrew",
      8       "Ridwan Shariffdeen",
      9       "Sandareka Wickramanayake"
     10     ],
     11     "year": 2024,
     12     "venue": "MSR 2024 (21st International Conference on Mining Software Repositories)",
     13     "doi": "10.1145/3643991.3644878"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper provides a GitHub repository URL in the Conclusion section: https://github.com/bugsphp/bugsPHP.git"
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The dataset is released via the same GitHub repository. The paper states: 'Our dataset can be accessed via GitHub from the following repository: https://github.com/bugsphp/bugsPHP.git'"
     26       },
     27       "environment_specified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper mentions using PHP version 8.1 for manual validation but does not provide a requirements.txt, Dockerfile, or detailed environment setup section listing library versions for reproducing the dataset curation or the APR model experiments."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No step-by-step reproduction instructions are provided in the paper. While the methodology is described at a conceptual level, there are no specific commands, scripts, or README instructions for replicating the dataset construction or the APR experiments."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The APR evaluation results in Table 2 report only point estimates (e.g., CURE fixes 11 bugs, RewardRepair fixes 43 bugs) with no confidence intervals or error bars."
     43       },
     44       "significance_tests": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "The paper compares CURE and RewardRepair performance but does not use any statistical significance test to support the claim that RewardRepair outperforms CURE. The comparison is based solely on raw count differences."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "No effect sizes are reported. The paper presents raw counts of bugs fixed but does not report effect sizes, percentage improvements with baseline context, or any standardized measures of effect."
     53       },
     54       "sample_size_justified": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "The paper does not justify why 5,000 repositories were chosen, why the top 75 were selected for testing, or whether 513 test bugs is sufficient for meaningful APR evaluation. No power analysis or sample size rationale is provided."
     58       },
     59       "variance_reported": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No variance or standard deviation is reported across experimental runs. The APR experiments appear to be single-run results with no information about result stability across seeds or runs."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "Two APR models (CURE and RewardRepair) are evaluated as baselines to demonstrate the utility of the dataset. The comparison between these two models serves as a baseline evaluation."
     70       },
     71       "baselines_contemporary": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "CURE (2021) and RewardRepair (2022) are reasonably recent learning-based APR models relative to this 2024 paper, and represent state-of-the-art learning-based approaches for program repair."
     75       },
     76       "ablation_study": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "This is primarily a dataset paper. The system being evaluated (the dataset) is a single artifact without separable components to ablate."
     80       },
     81       "multiple_metrics": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Table 2 reports multiple metrics: NC (candidate patches generated), NV (failing tests fixed), NP (all tests passed), NE (semantically equivalent patches), and NI (identical patches). This provides a multi-metric view of APR model performance."
     85       },
     86       "human_evaluation": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The test dataset construction involved manual validation of bug-fixing commits. The paper states they 'carry out a manual test case validation' and 'Manual Validation: In this step, we examine the latest bug to validate a repository and see if its fixed version has accompanying test cases.' This is human evaluation of the dataset quality."
     90       },
     91       "held_out_test_set": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper explicitly separates training (653,606 commits from remaining repositories) and test (513 bugs from top 75 repositories) datasets. The repositories for the test dataset are excluded from the training dataset: 'Excluding the project repositories used for the test data-set, the remaining projects are used to construct the training data-set.'"
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table 1 provides a per-project breakdown of bugs, tests, LOC, and test coverage. Section 4.2 also breaks down bugs by type (462 functional errors, 16 type errors, 15 security vulnerabilities, 13 compatibility issues, 5 usability issues, 2 performance bugs) and by number of file/line changes."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper notes that 'none of the APR models could fix 473 bugs in our test dataset' and discusses that CURE can only generate single-line patches while RewardRepair generates 3-4 line patches, indicating limitations of existing tools."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper reports that both APR models fail on the majority of bugs (473/513 unfixed). CURE generates candidate patches for only 443/513 bugs, and only 11 pass all tests. These are negative results showing the challenge of the dataset."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims the dataset consists of 'more than 600,000 bug-fixing commits' (confirmed at 653,606 in Section 4) and '513 manually validated bug-fixing commits equipped with developer-provided test cases' (confirmed in Section 4.1 and Table 1). All claims are supported by the paper's content."
    117       },
    118       "causal_claims_justified": {
    119         "applies": false,
    120         "answer": false,
    121         "justification": "The paper is primarily a dataset paper and does not make causal claims. It presents descriptive statistics and preliminary APR results without claiming any causal relationships."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": false,
    126         "justification": "The title and abstract present this as a dataset for 'Automated Program Repair in PHP' broadly, but the test dataset is limited to 15 applications from popular open-source projects. The paper does not explicitly discuss whether these 15 projects are representative of PHP programs in general, or bound its claims to the tested setting."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for the observed APR results. For example, it does not consider whether the low fix rates could be due to training data quality, PHP-specific language features, or the specific models chosen rather than the inherent difficulty of PHP bug repair."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "The paper mentions using CURE and RewardRepair but does not specify exact model versions, checkpoint details, or configuration variants used. It cites the original papers [7] and [17] but does not state which specific trained models or configurations were used."
    139       },
    140       "prompts_provided": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "The paper evaluates traditional learning-based APR models (CURE and RewardRepair) that do not use LLM prompting. Prompting is not part of the methodology."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "No hyperparameters are reported for either CURE or RewardRepair training. The paper states they 'generate 100 candidate patches per bug' but does not report training hyperparameters, beam search settings, or other model configuration details."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used. CURE and RewardRepair are traditional learning-based APR models, not agent-based systems."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "The data preprocessing pipeline is documented in Section 4.1 with explicit filtering criteria: commits containing fix-related keywords, PHP files only, no file addition/deletion/renaming, file changes <= 3, line changes <= 50, commits after Jan 2020. The pipeline stages are shown in Figure 1."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated limitations or threats-to-validity section in the paper. The paper moves directly from preliminary results (Section 5) to conclusion (Section 6) without discussing limitations."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No threats to validity are discussed anywhere in the paper. There is no discussion of potential biases in repository selection, keyword-based commit filtering, or the representativeness of the dataset."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "The paper does not explicitly state what the results do NOT show. It does not discuss the limitations of using only top-starred GitHub repositories, the restriction to specific PHP versions, or the exclusion of bugs requiring more than 3 file changes or 50 line changes."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "The raw dataset is available via GitHub at https://github.com/bugsphp/bugsPHP.git, allowing independent verification of the bug-fixing commits and test cases."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 4.1 describes the data collection procedure in detail: using GitHub REST API to retrieve top 5000 PHP repositories, searching commit messages for bug-fix keywords, applying filtering criteria for file and line changes, and the time period (Jan 2020 to March 2023)."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "This is a mining study of public GitHub repositories with no human participants. Recruitment methods are not applicable."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "The full data pipeline is documented in Section 4.1 and Figure 1, from repository selection through bug collection, filtering, manual validation, and dynamic validation. Each step's criteria are described, and the final counts are provided (5000 repos → top 75 for test → 513 validated bugs; remaining repos → 653,606 training commits)."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Funding is disclosed in the Acknowledgement section: 'This work was partially supported by a Singapore Ministry of Education (MoE) Tier3 grant \"Automated Program Repair\", MOE-MOET32021-0001.'"
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly listed: University of Moratuwa (4 authors) and National University of Singapore (1 author). The paper does not evaluate any commercial product, so there are no obvious conflicts."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The funder is the Singapore Ministry of Education, a government agency with no financial stake in the outcome of PHP bug dataset research."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial interests statement is present in the paper. While there is no obvious conflict, the absence of a declaration means this criterion is not satisfied."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "The paper evaluates traditional learning-based APR models (CURE and RewardRepair) trained on the curated dataset itself, not pre-trained language models evaluated on benchmarks. Contamination in the LLM sense does not apply."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The paper explicitly addresses train/test separation: 'Excluding the project repositories used for the test data-set, the remaining projects are used to construct the training data-set.' The test and training sets come from disjoint sets of repositories."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This is not evaluating a pre-trained model on a pre-existing benchmark. The models are trained from scratch on the curated dataset. Pre-training contamination does not apply."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants. This is a mining study of public GitHub repositories."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants. This is a mining study of public GitHub repositories."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants. This is a mining study of public GitHub repositories."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. This is a mining study of public GitHub repositories."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants. This is a mining study of public GitHub repositories."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants. This is a mining study of public GitHub repositories."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants. This is a mining study of public GitHub repositories."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No inference cost or latency is reported for the APR model evaluations. The paper does not mention how long it takes to generate 100 candidate patches per bug or the computational cost of running CURE and RewardRepair."
    281       },
    282       "compute_budget_stated": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "No computational budget is stated. The paper does not report GPU hours, hardware used, training time for the APR models, or the time required for dataset curation."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "BugsPHP contains 653,606 bug-fixing commits for training and 513 manually validated bugs for testing from popular open-source PHP applications.",
    292       "evidence": "Section 4 and Table 1 provide the dataset statistics. The training dataset has 653,606 commits from 4,483 PHP applications. The test dataset has 513 bugs from 15 applications, each with developer-written test cases.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "RewardRepair outperforms CURE on the BugsPHP test dataset, fixing 43 bugs compared to CURE's 11.",
    297       "evidence": "Table 2 shows RewardRepair fixes 43 bugs (NP) while CURE fixes 11. RewardRepair generates patches for all 513 bugs while CURE only generates candidates for 443. No statistical significance testing accompanies these claims.",
    298       "supported": "moderate"
    299     },
    300     {
    301       "claim": "PHP has no existing benchmark bug dataset for automated program repair research.",
    302       "evidence": "Section 3 (Related Work) surveys existing datasets for Java (Defects4J, Bugs.jar, BEARS, Vul4J), C/C++ (ManyBugs, IntroClass), JavaScript (BugsJS, FixJS), and Python (BugsInPy), but identifies none for PHP.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "90% of bugs in the test dataset are functional errors.",
    307       "evidence": "Section 4.2 states: 'We identified 462 bugs (90%) as functional errors, while the remaining bugs consist of 16 type errors, 15 security vulnerabilities, 13 compatibility issues, 5 usability issues and 2 performance bugs.'",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "Existing APR models cannot fix 473 out of 513 bugs in the test dataset.",
    312       "evidence": "Section 5 states: 'none of the APR models could fix 473 bugs in our test dataset.' RewardRepair fixes 29 unique bugs and CURE fixes 4 unique, with 7 commonly fixed.",
    313       "supported": "strong"
    314     }
    315   ],
    316   "methodology_tags": [
    317     "benchmark-eval",
    318     "observational"
    319   ],
    320   "key_findings": "BugsPHP is the first benchmark dataset for automated program repair in PHP, consisting of 653,606 bug-fixing commits for training and 513 manually validated bugs with developer-written test cases for testing. Preliminary evaluation shows that existing learning-based APR models (CURE and RewardRepair) can fix only a small fraction of PHP bugs (11 and 43 out of 513 respectively), highlighting the need for further research on PHP-specific program repair techniques. The dataset is curated from the top 5,000 PHP repositories on GitHub, with training and test sets drawn from disjoint repository sets.",
    321   "red_flags": [
    322     {
    323       "flag": "No limitations section",
    324       "detail": "The paper has no limitations or threats-to-validity section, which is a significant methodological concern for a dataset paper. There is no discussion of selection bias from using only top-starred GitHub repositories, the impact of keyword-based commit filtering, or whether the 15 test projects are representative of PHP programs in general."
    325     },
    326     {
    327       "flag": "No statistical rigor in APR comparison",
    328       "detail": "The comparison between CURE and RewardRepair lacks any statistical testing, confidence intervals, or variance reporting. The claim that RewardRepair outperforms CURE is based solely on raw count comparisons from apparently single experimental runs."
    329     },
    330     {
    331       "flag": "Missing hyperparameters and training details",
    332       "detail": "No hyperparameters, training configurations, or computational details are reported for the APR model experiments, making it impossible to reproduce the preliminary results."
    333     },
    334     {
    335       "flag": "Popularity-based repository selection may bias dataset",
    336       "detail": "The dataset is constructed from the top 5,000 PHP repositories ranked by GitHub stars. Popular repositories may have different bug patterns, coding standards, and testing practices than typical PHP projects, potentially limiting dataset representativeness. This bias is not acknowledged."
    337     }
    338   ],
    339   "cited_papers": [
    340     {
    341       "title": "Defects4J: A database of existing faults to enable controlled testing studies for Java programs",
    342       "authors": ["René Just", "Darioush Jalali", "Michael D Ernst"],
    343       "year": 2014,
    344       "relevance": "Foundational benchmark dataset for Java program repair, directly comparable to BugsPHP's contribution for PHP."
    345     },
    346     {
    347       "title": "Bugs.jar: A large-scale, diverse dataset of real-world java bugs",
    348       "authors": ["Ripon K Saha", "Yingjun Lyu", "Wing Lam", "Hiroaki Yoshida", "Mukul R Prasad"],
    349       "year": 2018,
    350       "relevance": "Large-scale Java bug dataset with 1,158 bugs from 8 projects, methodologically similar to BugsPHP's training dataset curation."
    351     },
    352     {
    353       "title": "BugsInPy: A Database of Existing Bugs in Python Programs to Enable Controlled Testing and Debugging Studies",
    354       "authors": ["Ratnadira Widyasari", "Sheng Qin Sim"],
    355       "year": 2020,
    356       "doi": "10.1145/3368089.3417943",
    357       "relevance": "Python bug dataset analogous to BugsPHP, enabling comparison of dataset construction methodologies across languages."
    358     },
    359     {
    360       "title": "Bugsjs: a benchmark of javascript bugs",
    361       "authors": ["Péter Gyimesi", "Béla Vancsics", "Andrea Stocco"],
    362       "year": 2019,
    363       "relevance": "JavaScript bug benchmark with 453 verified bugs, comparable in scope and methodology to BugsPHP's test dataset."
    364     },
    365     {
    366       "title": "CURE: Code-aware neural machine translation for automatic program repair",
    367       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    368       "year": 2021,
    369       "relevance": "One of the two APR models evaluated on BugsPHP; a code-aware neural machine translation (NMT) approach to program repair."
    370     },
    371     {
    372       "title": "Neural program repair with execution-based backpropagation",
    373       "authors": ["He Ye", "Matias Martinez", "Martin Monperrus"],
    374       "year": 2022,
    375       "relevance": "RewardRepair model evaluated on BugsPHP; learning-based APR approach using execution feedback."
    376     },
    377     {
    378       "title": "Automated Program Repair",
    379       "authors": ["Claire Le Goues", "Michael Pradel", "Abhik Roychoudhury"],
    380       "year": 2019,
    381       "doi": "10.1145/3318162",
    382       "relevance": "Survey of automated program repair field providing context for APR dataset research."
    383     },
    384     {
    385       "title": "An empirical study on learning bug-fixing patches in the wild via neural machine translation",
    386       "authors": ["Michele Tufano", "Cody Watson", "Gabriele Bavota"],
    387       "year": 2019,
    388       "relevance": "Foundational work on learning-based program repair from commit history, methodology followed by BugsPHP for commit collection."
    389     },
    390     {
    391       "title": "FixJS: a dataset of bug-fixing JavaScript commits",
    392       "authors": ["Viktor Csuvik", "László Vidács"],
    393       "year": 2022,
    394       "relevance": "JavaScript bug-fixing commit dataset comparable to BugsPHP's training dataset approach."
    395     },
    396     {
    397       "title": "Vul4J: A Dataset of Reproducible Java Vulnerabilities Geared towards the Study of Program Repair Techniques",
    398       "authors": ["Quang-Cuong Bui", "Riccardo Scandariato", "Nicolás E. Díaz Ferreyra"],
    399       "year": 2022,
    400       "doi": "10.1145/3524842.3528482",
    401       "relevance": "Vulnerability-focused Java dataset for program repair, related to BugsPHP's security vulnerability subset."
    402     }
    403   ]
    404 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs