scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25271B)
      1 {
      2   "paper": {
      3     "title": "A Survey of Learning-based Automated Program Repair",
      4     "authors": [
      5       "Quanjun Zhang",
      6       "Chunrong Fang",
      7       "Yuxiang Ma",
      8       "Weisong Sun",
      9       "Zhenyu Chen"
     10     ],
     11     "year": 2023,
     12     "venue": "ACM Transactions on Software Engineering and Methodology",
     13     "arxiv_id": "2301.03270",
     14     "doi": "10.1145/3631974"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["survey_methodology"],
     18   "methodology_tags": ["meta-analysis"],
     19   "key_findings": "This systematic review of 112 learning-based APR papers (2016–2022) finds the field has grown rapidly since 2020, with Java dominating (44%) as the target language. The survey categorizes techniques by code representation (sequence, tree, graph), documents 53 datasets and two metric families (execution-based and match-based), and identifies pre-trained models as the dominant recent trend. A critical open science audit reveals many papers fail to release source code, datasets, or trained models, hindering reproducibility.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The paper provides a public GitHub repository: https://github.com/iSEngLab/AwesomeLearningAPR, mentioned in the abstract and Section 1 ('Our artifacts are publicly available at the repository')."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The paper states that 'All artifacts of this study are available' in its GitHub repository, which serves as the curated dataset of the 112 collected papers and their classifications."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "No environment or dependency specifications are provided. Because this is a survey paper with no computational analysis scripts, no environment setup is documented."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Section 2 provides a detailed search methodology: explicit search strings for two keyword groups, three databases (Google Scholar, ACM, IEEE), exclusion criteria (papers published before 2016, papers with fewer than 7 pages, and duplicates), and a snowballing procedure, shown in Figure 1."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": false,
     46         "answer": false,
     47         "justification": "This is a survey paper that does not run experiments or produce statistical results requiring confidence intervals."
     48       },
     49       "significance_tests": {
     50         "applies": false,
     51         "answer": false,
     52         "justification": "This is a survey paper that makes no comparative statistical claims requiring significance testing."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": false,
     56         "answer": false,
     57         "justification": "This is a survey paper with no experimental results requiring effect size reporting."
     58       },
     59       "sample_size_justified": {
     60         "applies": false,
     61         "answer": false,
     62         "justification": "This is a survey paper; the number of collected papers (112) is determined by the search and filtering process, not by statistical power considerations."
     63       },
     64       "variance_reported": {
     65         "applies": false,
     66         "answer": false,
     67         "justification": "This is a survey paper with no experimental runs requiring variance reporting."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper compares against prior surveys by Gazzola et al. [53] (covering work up to January 2017) and Monperrus [135], explicitly distinguishing its focus on learning-based APR and its more recent coverage (Section 1, 'Comparison with Existing Surveys')."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The compared prior surveys (Gazzola et al. 2019, Monperrus 2018) are the most relevant existing APR surveys, and the paper clearly explains why a new survey focusing on learning-based techniques is needed."
     80       },
     81       "ablation_study": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "This is a survey paper with no system components to ablate."
     85       },
     86       "multiple_metrics": {
     87         "applies": false,
     88         "answer": false,
     89         "justification": "This is a survey paper that does not run experiments or report evaluation metrics for its own system."
     90       },
     91       "human_evaluation": {
     92         "applies": false,
     93         "answer": false,
     94         "justification": "This is a survey paper with no system outputs to evaluate."
     95       },
     96       "held_out_test_set": {
     97         "applies": false,
     98         "answer": false,
     99         "justification": "This is a survey paper with no experimental evaluation requiring train/test splits."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The survey provides extensive breakdowns: papers by year (Figure 2), by programming language (Figure 3), by bug type (semantic/syntax/vulnerability), by code representation (sequence/tree/graph), by model architecture, by dataset, and by tool availability (Tables 1-7)."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 7.3 discusses the open science problem in detail, identifying papers that fail to release artifacts. Section 8 discusses challenges and limitations of existing learning-based APR techniques across 10 guideline areas."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports concerning findings: the open science problem (Section 7.3), the overfitting issue (Section 4.7), that match-based metrics may be improper for APR (Section 6.2), and that many training datasets contain noise (Section 8, I&G3)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract claims to survey learning-based APR, detail crucial components, discuss datasets/metrics, summarize empirical studies, discuss repair domains and industrial deployment, and provide guidelines. All these are addressed in Sections 2–8 of the paper."
    122       },
    123       "causal_claims_justified": {
    124         "applies": false,
    125         "answer": false,
    126         "justification": "The paper is a survey that reports observations and trends. It does not make causal claims about interventions; interpretive statements about why trends occur ('One reason behind this phenomenon is...') are speculative observations, not tested causal claims."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": true,
    131         "justification": "The paper explicitly states its scope: papers collected through November 2022, focused on learning-based APR (not traditional APR), with the search process described in Section 2. The title clearly bounds the scope to 'Learning-based Automated Program Repair.'"
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper does not discuss alternative explanations for its survey findings. For example, the observed growth trend in learning-based APR papers could reflect publication venue expansion or keyword inflation, but no such alternatives are considered. There is no threats-to-validity section for the survey methodology itself."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper's claims match the granularity of its measurements. It counts papers, categorizes techniques, and reports trends — it does not claim these proxies represent broader constructs beyond what is directly measured."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": false,
    147         "answer": false,
    148         "justification": "This is a survey paper that does not use any ML models."
    149       },
    150       "prompts_provided": {
    151         "applies": false,
    152         "answer": false,
    153         "justification": "This is a survey paper that does not use prompting."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": false,
    157         "answer": false,
    158         "justification": "This is a survey paper with no models or hyperparameters."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "This is a survey paper with no agentic scaffolding."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 2 documents the paper selection pipeline: automated search (342 papers) → filter by year and page count (283) → manual screening for relevance (87) → snowballing (112). The search string, databases, and filtering criteria are explicitly stated."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "There is no dedicated limitations or threats-to-validity section for the survey itself. Section 8 discusses limitations of the surveyed APR techniques (not the survey methodology), and Section 9 is a brief conclusion."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No threats to validity are discussed for the survey methodology itself. Potential issues such as search string completeness, manual screening reliability, or temporal bias in paper collection are not addressed."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper states it covers papers through November 2022 (Section 2), focuses on learning-based APR (distinct from traditional APR), and explicitly compares its scope with prior surveys by Gazzola et al. and Monperrus in Section 1."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The artifact repository at https://github.com/iSEngLab/AwesomeLearningAPR provides the collected papers list and tool availability data for independent verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section 2 describes the data collection in detail: search databases (Google Scholar, ACM, IEEE), search string with two keyword groups, time period ('end of November 2022'), and the filtering process with specific criteria."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants; data sources are academic databases and digital libraries."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Figure 1 and Section 2 document the full pipeline: automated search (342 papers) → year and page filtering with deduplication (283) → manual relevance screening (87) → snowballing of missed citations (112). Each stage has counts and criteria."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": true,
    214         "justification": "The Acknowledgments section lists funding: National Natural Science Foundation of China (61932012, 62141215, 62372228), CCF-Huawei Populus Grove Fund, and Shenzhen Municipality commission."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "All authors are from State Key Laboratory for Novel Software Technology, Nanjing University. They are not evaluating their own products."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": true,
    224         "justification": "Funding is from NSFC (government grants), the CCF-Huawei academic fund, and a Shenzhen municipal commission. While Huawei has software engineering interests, the survey's findings do not evaluate Huawei products. The funders have no obvious stake in the survey's conclusions."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial disclosure statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "This is a survey paper that does not evaluate a pre-trained model on any benchmark."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this survey."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this survey."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this survey."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this survey."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this survey."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this survey."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this survey."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": false,
    289         "answer": false,
    290         "justification": "This is a survey paper with no computational method of its own."
    291       },
    292       "compute_budget_stated": {
    293         "applies": false,
    294         "answer": false,
    295         "justification": "This is a survey paper with no computational experiments."
    296       }
    297     },
    298     "survey_methodology": {
    299       "prisma_or_structured_protocol": {
    300         "applies": true,
    301         "answer": true,
    302         "justification": "Section 2 explicitly follows Petersen et al. [153] and Kitchenham et al. [82] systematic review methodology. The paper provides a structured search strategy with reproducible queries (exact search string provided), three databases, explicit filtering criteria, and a snowballing phase. Figure 1 shows the workflow diagram."
    303       },
    304       "quality_assessment_of_sources": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The survey does not assess the methodological quality of the 112 included papers. All papers are treated equally regardless of their experimental rigor, sample sizes, or evaluation quality. Tables summarize techniques and datasets but include no quality scoring."
    308       },
    309       "publication_bias_discussed": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The survey does not discuss publication bias. There is no consideration of whether the collected papers skew toward positive results, no funnel plots, and no acknowledgment that published APR papers may disproportionately report successful repair techniques."
    313       }
    314     }
    315   },
    316   "claims": [
    317     {
    318       "claim": "112 relevant learning-based APR studies were collected from 2016 to 2022, with rapid growth since 2020 (47 papers in 2022 alone).",
    319       "evidence": "Section 2 describes the collection methodology; Figure 2 shows the year-by-year distribution: 3 (2016), 4 (2017), 6 (2018), 13 (2019), 13 (2020), 25 (2021), 47 (2022).",
    320       "supported": "strong"
    321     },
    322     {
    323       "claim": "Java is the most targeted programming language in learning-based APR, accounting for 44% of papers.",
    324       "evidence": "Figure 3 shows the language distribution: Java 44%, C 20%, Python 18%, JavaScript 13%, C++ 5%.",
    325       "supported": "strong"
    326     },
    327     {
    328       "claim": "Pre-trained models have significantly influenced program repair, with techniques divided into universal (e.g., CodeT5) and specific (e.g., AlphaRepair) categories.",
    329       "evidence": "Section 5 provides a comprehensive taxonomy of pre-trained model-based APR techniques with Table 3 listing specific approaches (TFix, CIRCLE, VulRepair, AlphaRepair, SYNSHINE).",
    330       "supported": "strong"
    331     },
    332     {
    333       "claim": "53 datasets are used across learning-based APR studies, with Defects4J being the most widely adopted.",
    334       "evidence": "Table 4 lists all 53 datasets with their languages, sizes, and which studies use them. Defects4J appears in the most studies' evaluation columns.",
    335       "supported": "strong"
    336     },
    337     {
    338       "claim": "Many learning-based APR papers fail to provide adequate open science artifacts (source code, datasets, or trained models).",
    339       "evidence": "Section 7.3 and Table 7 audit tool availability. Several papers fail to provide source code, datasets, or trained models. Some studies cannot be reproduced due to missing hyperparameters, complex environment settings, or insufficient documentation.",
    340       "supported": "moderate"
    341     },
    342     {
    343       "claim": "The overfitting issue is more significant in learning-based APR than in traditional APR due to the black-box nature of end-to-end NMT repair.",
    344       "evidence": "Section 4.7 discusses this: 'The overfitting issue in learning-based APR is more significant and severe' citing [198], with examples of plausible patches that pass test suites but are not semantically correct.",
    345       "supported": "weak"
    346     }
    347   ],
    348   "red_flags": [
    349     {
    350       "flag": "No quality assessment of included studies",
    351       "detail": "The survey collects 112 papers but applies no quality scoring rubric or risk-of-bias assessment. All papers are treated equally regardless of their methodological quality, sample sizes, or evaluation rigor. This risks laundering weak results alongside strong ones."
    352     },
    353     {
    354       "flag": "No threats to validity for the survey itself",
    355       "detail": "The paper has no limitations or threats-to-validity section addressing its own methodology. Issues such as search string completeness, potential missed papers despite snowballing, inter-rater reliability of manual screening, or temporal bias are not discussed."
    356     },
    357     {
    358       "flag": "Publication bias not addressed",
    359       "detail": "The survey does not consider whether the 112 collected papers skew toward positive results. Published APR papers may disproportionately report successful techniques, inflating the apparent progress of the field."
    360     },
    361     {
    362       "flag": "Uncritical repetition of claimed results",
    363       "detail": "The survey reports claimed bug counts and performance numbers from individual papers (e.g., 'CoCoNut fixes 509 bugs') without verifying these claims or noting that different papers use different evaluation settings, making direct comparisons misleading."
    364     }
    365   ],
    366   "cited_papers": [
    367     {
    368       "title": "Practical Program Repair in the Era of Large Pre-trained Language Models",
    369       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    370       "year": 2022,
    371       "relevance": "First extensive evaluation of 9 large pre-trained language models for automated program repair across 3 languages, directly relevant to LLM capability assessment."
    372     },
    373     {
    374       "title": "Less Training, More Repairing Please: Revisiting Automated Program Repair Via Zero-shot Learning",
    375       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    376       "year": 2022,
    377       "relevance": "AlphaRepair demonstrates zero-shot APR with CodeBERT, showing pre-trained models can repair without task-specific fine-tuning — key evidence for LLM capability claims."
    378     },
    379     {
    380       "title": "CodeT5: Identifier-aware Unified Pre-trained Encoder-decoder Models for Code Understanding and Generation",
    381       "authors": ["Yue Wang", "Weishi Wang", "Shafiq Joty", "Steven CH Hoi"],
    382       "year": 2021,
    383       "relevance": "Foundational pre-trained model for code tasks including program repair, widely used as backbone in APR systems."
    384     },
    385     {
    386       "title": "CodeBERT: A Pre-trained Model for Programming and Natural Languages",
    387       "authors": ["Zhangyin Feng", "Daya Guo", "Duyu Tang"],
    388       "year": 2020,
    389       "relevance": "Influential pre-trained model used across multiple APR techniques for code understanding and generation."
    390     },
    391     {
    392       "title": "VulRepair: A T5-based Automated Software Vulnerability Repair",
    393       "authors": ["Michael Fu", "Chakkrit Tantithamthavorn", "Trung Le", "Van Nguyen", "Dinh Phung"],
    394       "year": 2022,
    395       "relevance": "Applies pre-trained CodeT5 to security vulnerability repair, relevant to AI safety and automated code security."
    396     },
    397     {
    398       "title": "CIRCLE: Continual Repair across Programming Languages",
    399       "authors": ["Wei Yuan", "Quanjun Zhang", "Tieke He", "Chunrong Fang"],
    400       "year": 2022,
    401       "relevance": "Demonstrates continual learning for multi-language program repair with T5, relevant to LLM generalization across programming languages."
    402     },
    403     {
    404       "title": "CURE: Code-aware Neural Machine Translation for Automatic Program Repair",
    405       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    406       "year": 2021,
    407       "relevance": "Combines pre-trained programming language model with code-aware beam search for APR, influential in NMT-based repair."
    408     },
    409     {
    410       "title": "Recoder: A Syntax-guided Edit Decoder for Neural Program Repair",
    411       "authors": ["Qihao Zhu", "Zeyu Sun", "Yuan-an Xiao"],
    412       "year": 2021,
    413       "relevance": "First learning-based APR technique to outperform traditional techniques on Defects4J, demonstrating the maturation of neural repair approaches."
    414     },
    415     {
    416       "title": "Defects4J: A Database of Existing Faults to Enable Controlled Testing Studies for Java Programs",
    417       "authors": ["René Just", "Darioush Jalali", "Michael D Ernst"],
    418       "year": 2014,
    419       "relevance": "The most widely used benchmark in APR research; its characteristics shape how the entire field evaluates repair techniques."
    420     },
    421     {
    422       "title": "Break-It-Fix-It: Unsupervised Learning for Program Repair",
    423       "authors": ["Michihiro Yasunaga", "Percy Liang"],
    424       "year": 2021,
    425       "relevance": "Demonstrates unsupervised learning for program repair using a fixer-breaker loop, relevant to self-supervised AI code generation."
    426     },
    427     {
    428       "title": "Self-supervised Bug Detection and Repair",
    429       "authors": ["Miltiadis Allamanis", "Henry Jackson-Flux", "Marc Brockschmidt"],
    430       "year": 2021,
    431       "relevance": "BUGLAB from Microsoft: self-supervised bug detection and repair using detector-selector learning, demonstrating industrial interest in automated code repair."
    432     },
    433     {
    434       "title": "An Empirical Study on Learning Bug-fixing Patches in the Wild Via Neural Machine Translation",
    435       "authors": ["Michele Tufano", "Cody Watson", "Gabriele Bavota", "Massimiliano Di Penta", "Martin White", "Denys Poshyvanyk"],
    436       "year": 2019,
    437       "relevance": "First systematic empirical study on NMT for bug-fixing, establishing foundational datasets (BFP) used across the field."
    438     }
    439   ],
    440   "engagement_factors": {
    441     "practical_relevance": {
    442       "score": 2,
    443       "justification": "Provides a useful taxonomy and reading list for researchers entering the learning-based APR field, with a curated GitHub repository, but is not an immediately usable tool."
    444     },
    445     "surprise_contrarian": {
    446       "score": 0,
    447       "justification": "Confirms the expected narrative of growing interest in DL for APR without challenging any conventional wisdom."
    448     },
    449     "fear_safety": {
    450       "score": 0,
    451       "justification": "No safety or security concerns raised; the survey covers vulnerability repair as one application domain but does not raise novel risk concerns."
    452     },
    453     "drama_conflict": {
    454       "score": 0,
    455       "justification": "No controversy or conflict; the paper is a straightforward literature survey."
    456     },
    457     "demo_ability": {
    458       "score": 1,
    459       "justification": "GitHub repository with a curated paper list is available but there is no interactive tool or demo to try."
    460     },
    461     "brand_recognition": {
    462       "score": 1,
    463       "justification": "Published in TOSEM (reputable SE venue) by Nanjing University researchers, but not from a high-profile AI lab."
    464     }
    465   }
    466 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs