scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23729B)
      1 {
      2   "paper": {
      3     "title": "Automated Program Repair: Emerging trends pose and expose problems for benchmarks",
      4     "authors": [
      5       "Joseph Renzullo",
      6       "Pemma Reiter",
      7       "Westley Weimer",
      8       "Stephanie Forrest"
      9     ],
     10     "year": 2024,
     11     "venue": "arXiv preprint (submitted to ACM)",
     12     "arxiv_id": "2405.05455",
     13     "doi": "XXXXXXX.XXXXXXX"
     14   },
     15   "checklist": {
     16     "artifacts": {
     17       "code_released": {
     18         "applies": true,
     19         "answer": false,
     20         "justification": "No repository URL, code archive, or data release is mentioned anywhere in the paper. The survey's corpus of 118 papers and analysis data are not publicly available."
     21       },
     22       "data_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "The paper does not release its corpus of 118 reviewed papers, keyword analysis data, or any structured dataset. A survey can release its extracted data and analysis scripts, but this one does not."
     26       },
     27       "environment_specified": {
     28         "applies": false,
     29         "answer": false,
     30         "justification": "This is a survey/review paper with no computational experiments requiring an environment specification."
     31       },
     32       "reproduction_instructions": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No reproduction instructions are provided. While Section 1.1 describes the methodology (keyword search across five venues, 2018-2023), there are no step-by-step instructions, search queries, or scripts that would allow someone to replicate the paper selection process."
     36       }
     37     },
     38     "statistical_methodology": {
     39       "confidence_intervals_or_error_bars": {
     40         "applies": false,
     41         "answer": false,
     42         "justification": "This is a qualitative survey that does not report quantitative experimental results requiring confidence intervals or error bars."
     43       },
     44       "significance_tests": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "The paper does not make comparative statistical claims that would require significance tests. It is a qualitative topical review."
     48       },
     49       "effect_sizes_reported": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "No experiments are conducted; the paper is a qualitative survey reviewing trends and methodological issues in APR literature."
     53       },
     54       "sample_size_justified": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "No experimental sample sizes are involved. The corpus of 118 papers is described but this is a literature review, not a statistical study."
     58       },
     59       "variance_reported": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "No quantitative experiments are run, so variance across runs is not applicable."
     63       }
     64     },
     65     "evaluation_design": {
     66       "baselines_included": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "This is a topical review, not a system or method evaluation. There is no system to compare against baselines."
     70       },
     71       "baselines_contemporary": {
     72         "applies": false,
     73         "answer": false,
     74         "justification": "No system evaluation is performed; the paper is a survey of the APR field."
     75       },
     76       "ablation_study": {
     77         "applies": false,
     78         "answer": false,
     79         "justification": "No system or method is proposed, so ablation is not applicable."
     80       },
     81       "multiple_metrics": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No system evaluation is performed; the paper reviews how other papers report metrics but does not evaluate a system itself."
     85       },
     86       "human_evaluation": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "No system outputs are produced that would require human evaluation. The paper is a survey."
     90       },
     91       "held_out_test_set": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "No experiments with training/test splits are conducted."
     95       },
     96       "per_category_breakdown": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Table 1 provides a per-keyword breakdown of the 118 papers. The paper also breaks down findings by category: ML architectures (Section 3), data leakage issues (Section 4), and benchmark problems (Section 5), with specific counts (e.g., 29% use ML keywords, 44 papers evaluate on Defects4J vs. 5 on ManyBugs)."
    100       },
    101       "failure_cases_discussed": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper extensively discusses failure cases in the reviewed literature: inconsistent correctness definitions (Section 1), papers that do not consider data leakage (Section 4.2 names five specific papers), tools that cannot be compared due to incompatible reporting, and benchmark limitations throughout Section 5."
    105       },
    106       "negative_results_reported": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "The paper's central thesis is a negative finding: current APR benchmarks and evaluation practices are inadequate for ML-based approaches. It reports specific negative observations: 5 papers ignore data leakage, vocabulary sizes are not always reported, decontamination is often infeasible, and StandUp4NPR has design limitations (Section 6.1)."
    110       }
    111     },
    112     "claims_and_evidence": {
    113       "abstract_claims_supported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The abstract claims that ML pervades APR, that evaluations must take care to ensure validity, and that popular benchmarks were not designed with ML in mind. All three are supported by the detailed analysis in Sections 3-5, including the keyword analysis (Table 1) and specific benchmark issues documented throughout."
    117       },
    118       "causal_claims_justified": {
    119         "applies": false,
    120         "answer": false,
    121         "justification": "The paper makes no causal claims. It describes trends and identifies methodological problems in the literature but does not claim that X causes Y. Statements like 'Duplicated data lead to bias' reference general ML principles, not original causal findings."
    122       },
    123       "generalization_bounded": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 1.1 clearly bounds the scope: 'we take a narrower view, examining all of the papers published at the top five software engineering venues since 2018 and focusing on ML in APR.' The five venues are explicitly named (ICSE, TSE, FSE, ASE, EMSE) and the time range specified (Jan 2018 - Sep 2023)."
    127       },
    128       "alternative_explanations_discussed": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The paper does not discuss alternative explanations for its observations. For example, the concentration on Defects4J (44 papers) vs. ManyBugs (5 papers) is attributed partly to build complexity, but the paper notes 'there are other factors that influence the differences' without systematically exploring them. No threats-to-validity or alternative-explanations section exists."
    132       }
    133     },
    134     "setup_transparency": {
    135       "model_versions_specified": {
    136         "applies": false,
    137         "answer": false,
    138         "justification": "No ML models are used in this survey. The paper reviews others' use of models but does not run any models itself."
    139       },
    140       "prompts_provided": {
    141         "applies": false,
    142         "answer": false,
    143         "justification": "No prompting is used. This is a survey paper."
    144       },
    145       "hyperparameters_reported": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "No experiments with hyperparameters are conducted."
    149       },
    150       "scaffolding_described": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "No agentic scaffolding is used."
    154       },
    155       "data_preprocessing_documented": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Section 1.1 describes the methodology at a high level: seeding from Monperrus's living review, selecting five venues, searching by primary keywords (patch, repair) and secondary keywords (automate, fix, fault, bug, vulnerability, generate), and manually reviewing abstracts. However, the actual filtering criteria for what counted as APR-related during abstract review are not stated, nor are counts given at each filtering stage. The paper states 118 papers were identified and 65 cross-listed with Monperrus, but does not document how many were found at each stage of screening."
    159       }
    160     },
    161     "limitations_and_scope": {
    162       "limitations_section_present": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated limitations or threats-to-validity section. The paper discusses limitations of other work but does not have a section discussing limitations of its own review methodology."
    166       },
    167       "threats_to_validity_specific": {
    168         "applies": true,
    169         "answer": false,
    170         "justification": "No threats to validity are discussed for this review itself. Issues like selection bias (only five venues), time-boundedness (2018-2023), keyword-based search potentially missing relevant papers, and the subjective abstract screening process are not addressed."
    171       },
    172       "scope_boundaries_stated": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 1.1 explicitly states the scope: 'we take a narrower view, examining all of the papers published at the top five software engineering venues since 2018 and focusing on ML in APR.' The venues and time period are specified. However, the paper does not explicitly state what the results do NOT show, just what was covered."
    176       }
    177     },
    178     "data_integrity": {
    179       "raw_data_available": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "The corpus of 118 papers and the extracted metadata (keywords, ML techniques, benchmarks used, etc.) are not made available for independent verification."
    183       },
    184       "data_collection_described": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Section 1.1 describes the data collection procedure: seeding from Monperrus's living review, identifying top five venues by volume, searching with primary and secondary keywords, manually reviewing abstracts, covering January 2018 through September 2023. The total of 118 papers and overlap with the living review (65) are reported."
    188       },
    189       "recruitment_methods_described": {
    190         "applies": false,
    191         "answer": false,
    192         "justification": "No human participants are involved. The data source is published papers from known venues."
    193       },
    194       "data_pipeline_documented": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The paper describes the high-level pipeline (living review seed → venue selection → keyword search → abstract review → 118 papers) but does not document intermediate counts at each stage. How many papers were returned by keyword search before abstract filtering? How many were excluded at each stage and why? These details are missing."
    198       }
    199     },
    200     "conflicts_of_interest": {
    201       "funding_disclosed": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "No funding information, acknowledgments section, or grant numbers are provided anywhere in the paper."
    205       },
    206       "affiliations_disclosed": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Author affiliations are clearly listed: Renzullo and Reiter at Arizona State University, Weimer at University of Michigan, Forrest at Arizona State University (also Santa Fe Institute). These are academic institutions with no apparent conflict regarding the APR tools reviewed."
    210       },
    211       "funder_independent_of_outcome": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding is disclosed, so independence cannot be assessed. The absence of funding disclosure is the issue, not a known conflict."
    215       },
    216       "financial_interests_declared": {
    217         "applies": true,
    218         "answer": false,
    219         "justification": "No competing interests or financial interests statement is present in the paper. Note that Weimer and Forrest are co-authors of GenProg, which is discussed in the paper (Section 6.1), but this is not disclosed as a potential conflict."
    220       }
    221     },
    222     "contamination": {
    223       "training_cutoff_stated": {
    224         "applies": false,
    225         "answer": false,
    226         "justification": "This paper does not evaluate any pre-trained model on a benchmark. It is a survey reviewing how others handle contamination."
    227       },
    228       "train_test_overlap_discussed": {
    229         "applies": false,
    230         "answer": false,
    231         "justification": "The paper does not evaluate any model itself. It reviews contamination practices in the literature."
    232       },
    233       "benchmark_contamination_addressed": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "No benchmark evaluation is performed. The paper surveys contamination risks but does not run its own model evaluation."
    237       }
    238     },
    239     "human_studies": {
    240       "pre_registered": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants are involved in this survey."
    244       },
    245       "irb_or_ethics_approval": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants are involved."
    249       },
    250       "demographics_reported": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants are involved."
    254       },
    255       "inclusion_exclusion_criteria": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants are involved."
    259       },
    260       "randomization_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants are involved."
    264       },
    265       "blinding_described": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants are involved."
    269       },
    270       "attrition_reported": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants are involved."
    274       }
    275     },
    276     "cost_and_practicality": {
    277       "inference_cost_reported": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "This is a survey paper with no computational method whose cost needs reporting."
    281       },
    282       "compute_budget_stated": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "This is a survey paper with no computational experiments."
    286       }
    287     }
    288   },
    289   "claims": [
    290     {
    291       "claim": "ML techniques now pervade APR, with more than three in four papers published by 2023 employing ML techniques, up from fewer than one in six in 2018.",
    292       "evidence": "Section 3 and Table 1 report a keyword analysis of 118 papers across five top SE venues (2018-2023), showing that 29% list ML-related keywords overall, with a sharp upward trend.",
    293       "supported": "moderate"
    294     },
    295     {
    296       "claim": "Five of the surveyed papers that use LLMs for APR do not consider the impact of data leakage and contamination.",
    297       "evidence": "Section 4.2 names the five specific papers: [4, 34, 37, 93, 120] (SynShine, Fan et al., VulRepair, Rete, and Xia & Zhang 2022).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "APR benchmarks are too small to train ML tools and systematically differ from ML datasets in construction and purpose.",
    302       "evidence": "Section 5.4 argues that APR benchmarks consist of limited bug/fix pairs with test cases, while ML datasets must be orders of magnitude larger and lack execution environments. Specific examples cited include CrossVul and CURE's training data.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "Defects4J is far more widely adopted than ManyBugs, with 44 papers in the corpus evaluating on Defects4J vs. 5 on ManyBugs.",
    307       "evidence": "Section 5.6 provides explicit counts: 44 papers for Defects4J and 5 for ManyBugs, with references listed. The paper attributes this partly to Java's simpler build infrastructure vs. C's build complexity.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "NPR techniques have not converged on a standard metric for search efficiency, with beam sizes ranging from 50 to 1000.",
    312       "evidence": "Section 5.7 documents beam sizes across tools: 50 (VRepair, SeqTrans), 200 (RewardRepair), 1000 (Knod, Rete), and notes that generative vocabulary size is not always reported, making search efficiency comparisons difficult.",
    313       "supported": "strong"
    314     },
    315     {
    316       "claim": "StandUp4NPR, while promising for standardization, has design limitations that prevent fair comparison of traditional APR algorithms.",
    317       "evidence": "Section 6.1 explains that StandUp4NPR assumes all patches are generated simultaneously and evaluated independently, which prevents search-based techniques like GenProg and ARJA from using evaluation feedback loops.",
    318       "supported": "moderate"
    319     }
    320   ],
    321   "methodology_tags": [
    322     "meta-analysis",
    323     "qualitative"
    324   ],
    325   "key_findings": "ML techniques have rapidly taken over APR research, rising from fewer than 1-in-6 papers in 2018 to more than 3-in-4 by 2023 across top SE venues. The paper identifies three critical methodological problems: (1) data leakage and contamination are inadequately addressed, with five LLM-based APR papers ignoring the issue entirely; (2) popular benchmarks like Defects4J were not designed for ML evaluation and are too small for training; (3) inconsistent reporting of correctness criteria, fault localization assumptions, and search efficiency metrics makes cross-study comparison infeasible. The authors discuss StandUp4NPR and APR-COMP 2024 as partial solutions but note both have significant design limitations.",
    326   "red_flags": [
    327     {
    328       "flag": "No limitations section for own methodology",
    329       "detail": "The paper extensively critiques methodological weaknesses in the reviewed literature but does not discuss limitations of its own survey methodology. Potential issues include coverage limited to five venues (missing workshops and arXiv-only papers), a keyword-based search that may miss relevant papers, and subjective abstract screening with unclear criteria."
    330     },
    331     {
    332       "flag": "Authors discuss their own tool without disclosing the relationship",
    333       "detail": "Weimer and Forrest are co-authors of GenProg, which is discussed in Section 6.1 in the context of StandUp4NPR's limitations. While the discussion is fair, the relationship is not disclosed as a potential conflict of interest."
    334     },
    335     {
    336       "flag": "No data release for a survey paper",
    337       "detail": "The corpus of 118 papers, keyword analysis, and categorization data are not released. This makes it impossible to independently verify the paper counts, keyword trends, or the specific claims about which papers do or do not address contamination."
    338     },
    339     {
    340       "flag": "Incomplete filtering pipeline documentation",
    341       "detail": "The paper search methodology (Section 1.1) does not report how many papers were returned by keyword search before abstract screening, nor how many were excluded at each stage. The 118 final papers cannot be traced back through documented filtering steps."
    342     }
    343   ],
    344   "cited_papers": [
    345     {
    346       "title": "Automated Program Repair in the Era of Large Pre-trained Language Models",
    347       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    348       "year": 2023,
    349       "relevance": "Directly evaluates LLMs for APR and is one of the few papers that reports contamination information for evaluated PLMs."
    350     },
    351     {
    352       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair via Zero-Shot Learning",
    353       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    354       "year": 2022,
    355       "relevance": "Applies zero-shot LLMs to APR, cited as one of five papers that do not consider data leakage impact."
    356     },
    357     {
    358       "title": "Evaluating Large Language Models Trained on Code",
    359       "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"],
    360       "year": 2021,
    361       "arxiv_id": "2107.03374",
    362       "relevance": "Introduces Codex and HumanEval, foundational to LLM code generation evaluation and contamination concerns."
    363     },
    364     {
    365       "title": "Extracting Training Data from Large Language Models",
    366       "authors": ["Nicholas Carlini", "Florian Tramèr", "Eric Wallace"],
    367       "year": 2021,
    368       "relevance": "Demonstrates memorization and data extraction from LLMs, directly relevant to contamination and data integrity in AI evaluations."
    369     },
    370     {
    371       "title": "Deduplicating Training Data Makes Language Models Better",
    372       "authors": ["Katherine Lee", "Daphne Ippolito", "Andrew Nystrom"],
    373       "year": 2022,
    374       "arxiv_id": "2107.06499",
    375       "relevance": "Shows deduplication reduces memorization in language models, directly relevant to benchmark contamination mitigation."
    376     },
    377     {
    378       "title": "StandUp4NPR: Standardizing SetUp for Empirically Comparing Neural Program Repair Systems",
    379       "authors": ["Wenkang Zhong", "Hongliang Ge", "Hongfei Ai"],
    380       "year": 2022,
    381       "relevance": "Proposes standardized evaluation framework for NPR tools, extensively discussed in Section 6.1 as a partial solution with limitations."
    382     },
    383     {
    384       "title": "RunBugRun – An Executable Dataset for Automated Program Repair",
    385       "authors": ["Julian Aron Prenner", "Romain Robbes"],
    386       "year": 2023,
    387       "arxiv_id": "2304.01102",
    388       "relevance": "Addresses the build reproducibility problem for ML-based APR evaluation with an executable benchmark dataset."
    389     },
    390     {
    391       "title": "GPT-4 Technical Report",
    392       "authors": ["OpenAI"],
    393       "year": 2023,
    394       "arxiv_id": "2303.08774",
    395       "relevance": "Describes GPT-4's decontamination approach (multi-sample substring matching), relevant to contamination methodology in LLM evaluations."
    396     },
    397     {
    398       "title": "MultiPL-E: A Scalable and Polyglot Approach to Benchmarking Neural Code Generation",
    399       "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"],
    400       "year": 2023,
    401       "doi": "10.1109/TSE.2023.3267446",
    402       "relevance": "Multi-language code generation benchmark, relevant to expanding APR evaluation beyond Java."
    403     },
    404     {
    405       "title": "Impact of Code Language Models on Automated Program Repair",
    406       "authors": ["Nan Jiang", "Kevin Liu", "Thibaud Lutellier", "Lin Tan"],
    407       "year": 2023,
    408       "doi": "10.1109/ICSE48619.2023.00125",
    409       "relevance": "Evaluates the impact of code LLMs on APR, directly relevant to understanding how language models change the APR landscape."
    410     },
    411     {
    412       "title": "A Systematic Evaluation of Large Language Models of Code",
    413       "authors": ["Frank F Xu", "Uri Alon", "Graham Neubig", "Vincent Josua Hellendoorn"],
    414       "year": 2022,
    415       "relevance": "Systematic evaluation of code LLMs, relevant to understanding code duplication and evaluation methodology concerns."
    416     },
    417     {
    418       "title": "The Adverse Effects of Code Duplication in Machine Learning Models of Code",
    419       "authors": ["Miltiadis Allamanis"],
    420       "year": 2019,
    421       "doi": "10.1145/3359591.3359735",
    422       "relevance": "Shows near-duplicate code inflates ML performance metrics by up to 100%, directly motivating the contamination concerns in this paper."
    423     }
    424   ]
    425 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs