scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28136B)
      1 {
      2   "paper": {
      3     "title": "A Survey on Automated Program Repair Techniques",
      4     "authors": [
      5       "Kai Huang",
      6       "Zhengzi Xu",
      7       "Su Yang",
      8       "Hongyu Sun",
      9       "Xuejun Li",
     10       "Zheng Yan",
     11       "Yuqing Zhang"
     12     ],
     13     "year": 2023,
     14     "venue": "ACM Computing Surveys",
     15     "arxiv_id": "2303.18184",
     16     "doi": "10.48550/arXiv.2303.18184"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["survey_methodology"],
     20   "methodology_tags": ["meta-analysis", "qualitative"],
     21   "key_findings": "This survey classifies APR techniques into four categories (search-based, constraint-based, template-based, learning-based) and reviews 140 representative works from 2005-2022. The paper finds that learning-based APR tools now outperform traditional techniques on the Defects4J benchmark, and that LLMs are transforming the field through both fine-tuning and prompt learning paradigms. The survey reveals significant methodological problems in APR research including unfair experimental comparisons due to inconsistent training datasets, dataset overlap (data leakage) between training and test sets, and dataset quality issues (e.g., 20.71% invalid samples in CoCoNut). Key challenges remain in multi-hunk repair, patch overfitting, and industrial deployment.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "No analysis code or scripts for the survey are released. A GitHub URL is provided for supplementary dataset overlap data (https://github.com/huangkNIPC/APR-Survey/tree/main/dataset_overlap), but no code for reproducing the survey's analysis."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "The paper provides some dataset overlap data on GitHub, but does not release the full structured survey dataset (paper list, evaluation criteria results, or analysis data) in a downloadable format. Tables in the paper constitute the primary data presentation."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No environment specifications are provided for the preliminary experiments (Tables 5, 7, 9). No requirements.txt, Dockerfile, or environment details are given."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided for either the survey methodology or the preliminary experiments."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": false,
     48         "answer": false,
     49         "justification": "This is primarily a survey paper. The preliminary experiments (Tables 5, 7) report point estimates but are supplementary, not the main contribution."
     50       },
     51       "significance_tests": {
     52         "applies": false,
     53         "answer": false,
     54         "justification": "As a survey paper, the main contribution is taxonomic and qualitative. The preliminary experiments in Findings 7 and 14 make comparative claims without statistical tests, but these are supplementary observations."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": false,
     58         "answer": false,
     59         "justification": "Survey paper; statistical effect sizes are not applicable to the primary taxonomic analysis."
     60       },
     61       "sample_size_justified": {
     62         "applies": false,
     63         "answer": false,
     64         "justification": "Survey paper. The selection of 140 papers is explained by page limitations and manual selection criteria, but formal sample size justification is not applicable."
     65       },
     66       "variance_reported": {
     67         "applies": false,
     68         "answer": false,
     69         "justification": "Survey paper. The preliminary experiments report single-run results without variance, but these are supplementary."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "The survey explicitly compares against prior surveys by Gazzola et al. [30], Le Goues et al. [35], and Monperrus [96, 97], and positions its contributions relative to these prior works in Section 1."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "The compared prior surveys include Monperrus's living review (2022) [97] and Gazzola et al. (2019) [30], which are contemporary. The paper explicitly states it fills gaps left by these recent surveys."
     82       },
     83       "ablation_study": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "This is a survey paper with no system components to ablate."
     87       },
     88       "multiple_metrics": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "Survey paper; no system is being evaluated with metrics."
     92       },
     93       "human_evaluation": {
     94         "applies": false,
     95         "answer": false,
     96         "justification": "Survey paper; human evaluation of system outputs is not applicable."
     97       },
     98       "held_out_test_set": {
     99         "applies": false,
    100         "answer": false,
    101         "justification": "Survey paper; no train/test split is applicable."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The survey provides detailed per-category breakdowns: four main APR categories (search-based, constraint-based, template-based, learning-based) with subcategories. Table 2 compares tools across 7 metrics, Tables 10-13 provide per-category summaries, and Table 3 summarizes pros and cons by approach."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper extensively discusses failure cases and limitations of each APR approach class in Table 3 (pros/cons), Section 5.5 (findings revealing problems), and Section 7.1 (existing challenges including patch overfitting, repair quality, efficiency problems)."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Multiple negative findings are reported: Code Abs does not significantly improve LLM repair (Table 5, Finding 7), dataset overlap problems exist (Table 9, Finding 15), 20.71% of the samples in the CoCoNut dataset are invalid (Finding 15), and experimental comparisons across APR works are unfair (Findings 11 and 15)."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract claims a systematic taxonomy, uniform evaluation criteria, and discussion of future directions — all of which are delivered in the paper body. The taxonomy is in Section 5, criteria in Section 4 (Table 1), and challenges/directions in Section 7."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "Finding 7 claims 'Code Abs does not work well on LLMs' based on a preliminary experiment (Table 5) without statistical tests or controlled design. Finding 5 claims 'more accurate FL might further alleviate the patch overfitting problem' based on observational comparison (Table 4) without controlling for confounds. These causal claims are not adequately justified."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper bounds its scope: papers up to December 2022, 140 selected works, focus on APR for software bugs/errors/vulnerabilities. Section 2 explicitly notes page limitations and possible omissions: 'it is impossible to cover all typical works in our survey.'"
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Finding 7 provides two explanations for why Code Abs doesn't help LLMs: (1) code abstraction loses semantic information from method/variable names, and (2) LLMs are pre-trained on raw source code. Finding 5 notes multiple factors affecting patch overfitting including FL accuracy, patch generation, and validation."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper distinguishes between different measurement approaches: correct patches vs. plausible patches, repair accuracy (exact match) vs. test suite validation. Finding 13 explicitly discusses limitations of exact match as a validation strategy. The paper's evaluation criteria (Section 4) carefully distinguish between different quality and effectiveness measures."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": false,
    149         "answer": false,
    150         "justification": "Survey paper. The preliminary experiments reference LLMs (CodeBERT, GraphCodeBERT, PLBART, CodeT5) by name but defer to prior work [28, 161] for implementation details. Model versioning is not the focus."
    151       },
    152       "prompts_provided": {
    153         "applies": false,
    154         "answer": false,
    155         "justification": "Survey paper that does not use prompting in its own methodology."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": false,
    159         "answer": false,
    160         "justification": "Survey paper. The preliminary experiments reference prior work for implementation details without reporting hyperparameters."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "Survey paper with no agentic scaffolding."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 2 describes the paper selection pipeline: collection from APR community (405 papers), Monperrus living review (401 papers), and own search on Google Scholar/DBLP/arXiv using keywords 'Automated/Automatic Program Repair' for 2022. Filtering excluded short papers (≤6 pages), then manual review selected 140 works. Fig. 1 shows the workflow."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "There is no dedicated limitations or threats-to-validity section for the survey methodology itself. Section 7 discusses challenges and directions for APR research generally, not limitations of the survey's own methodology (selection bias, coverage gaps, etc.)."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No specific threats to the validity of the survey itself are discussed. The paper does not address potential selection bias in their manual paper filtering, coverage limitations, or how their subjective 'typical works' criterion might skew the findings."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "The paper states temporal scope (papers up to December 2022), paper count (140 selected works), and focus areas (data-driven APR techniques). Section 2 notes: 'Due to page limitations, we cannot review all APR works in detail' and acknowledges 'possible omissions from the manual review.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "The full list of surveyed papers, evaluation criteria assessments, and analysis data are not available as a downloadable dataset. Only partial data on dataset overlap is provided via GitHub."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 2 describes data collection: three sources (APR community website, Monperrus living review, own search), time period (up to December 2022), search keywords, and filtering steps. Fig. 1 provides a workflow diagram."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants in this survey paper."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "While Section 2 describes the pipeline stages (collection → dedup → filter short papers → manual review → 140 works) and gives input counts (405 + 401 papers), intermediate counts are missing (how many after dedup? how many after short paper filter?). The manual selection criteria are vague: 'typical works' that 'represent the characteristics of certain types of techniques.'"
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Funding is disclosed: 'National Natural Science Foundation of China under Grant 62072351' and 'Academy of Finland under Grant 345072 and Grant 350464.'"
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "All author affiliations are clearly listed: Xidian University, Nanyang Technological University, University of Chinese Academy of Sciences, and Aalto University."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "Funders are government/academic research agencies (Chinese NSF, Academy of Finland) with no commercial stake in APR tool outcomes."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Survey paper that does not evaluate a pre-trained model's capability on a benchmark."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "Survey paper that does not evaluate a pre-trained model on a benchmark."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "Survey paper that does not evaluate a pre-trained model on a benchmark."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this survey paper."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this survey paper."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this survey paper."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this survey paper."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this survey paper."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this survey paper."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this survey paper."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": false,
    291         "answer": false,
    292         "justification": "Survey paper; no system with inference cost to report."
    293       },
    294       "compute_budget_stated": {
    295         "applies": false,
    296         "answer": false,
    297         "justification": "Survey paper; no significant compute budget applies."
    298       }
    299     },
    300     "survey_methodology": {
    301       "prisma_or_structured_protocol": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Section 2 and Fig. 1 describe a paper selection workflow with defined sources and keywords, but the core filtering step relies on subjective manual selection of 'typical works' without reproducible inclusion/exclusion criteria. The protocol is semi-structured but not systematic enough for PRISMA compliance — no registered protocol, and the manual selection criteria are undefined beyond 'represent the characteristics of certain types of techniques.'"
    305       },
    306       "quality_assessment_of_sources": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The survey proposes evaluation criteria for APR tools (Table 1, Section 4) but does not systematically apply quality scoring to the 140 reviewed papers. All papers are treated with equal weight regardless of their methodological rigor, and no risk-of-bias assessment is performed on included studies."
    310       },
    311       "publication_bias_discussed": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No discussion of publication bias. The survey does not consider whether its sources skew toward positive results, does not include funnel plots, and does not acknowledge that published APR papers may over-represent successful repair approaches."
    315       }
    316     }
    317   },
    318   "claims": [
    319     {
    320       "claim": "Learning-based APR tools outperform traditional APR tools on the Defects4J benchmark",
    321       "evidence": "Finding 6, Table 4 (Section 5.5.2): AlphaRepair achieves 74/109 correct/plausible patches with PFL on Defects4J V1.2.0, compared to TBar (68/95) and jGenProg (6/16). Multiple learning-based tools exceed traditional ones.",
    322       "supported": "moderate"
    323     },
    324     {
    325       "claim": "Code abstraction (Code Abs) does not significantly improve the repair performance of most LLMs",
    326       "evidence": "Finding 7, Table 5: Preliminary experiments fine-tuning CodeBERT, GraphCodeBERT, PLBART, and CodeT5 on BFP dataset with and without Code Abs. Most LLMs show similar or lower performance with Code Abs. However, no statistical tests are reported.",
    327       "supported": "weak"
    328     },
    329     {
    330       "claim": "Using BLEU as the model evaluation metric can improve repair accuracy on VulRepair to 55%, vs. 44% when using loss",
    331       "evidence": "Finding 14, Table 7: Preliminary experiment fine-tuning CodeT5 on VulRepair dataset. Best BLEU model achieves 55% accuracy vs. 44% for VulRepair's original loss-based selection.",
    332       "supported": "weak"
    333     },
    334     {
    335       "claim": "Multiple large-scale training datasets have data overlap with the Defects4J test benchmark",
    336       "evidence": "Finding 15, Table 9: Exact matching detected overlaps — MegaDiff overlaps with 99 bugs in Defects4J, CodRep with 61, CodeSearchNet with 48. Data and method provided via GitHub link.",
    337       "supported": "moderate"
    338     },
    339     {
    340       "claim": "The CoCoNut dataset contains 20.71% invalid or incorrect samples",
    341       "evidence": "Finding 15: '671,497 (671,497/3,241,966=20.71%) invalid or incorrect samples in CoCoNut, and the bug codes and fix codes in these samples were identical.'",
    342       "supported": "moderate"
    343     },
    344     {
    345       "claim": "More accurate fault localization may further alleviate the patch overfitting problem",
    346       "evidence": "Finding 5, Table 4: APR tools using perfect fault localization (PFL) generally show higher percentages of correct patches vs. those using off-the-shelf FL tools. E.g., AlphaRepair: 67.89% with PFL vs. 55.56% without.",
    347       "supported": "moderate"
    348     },
    349     {
    350       "claim": "Multi-hunk bug repair remains a long-term challenge with three main approaches showing distinct limitations",
    351       "evidence": "Finding 18, Fig. 7: Reviews iterative, holistic, and synchronous repair approaches. Iterative repair creates exponential search spaces; holistic repair treats bug code and context uniformly; synchronous repair (DEAR) requires AST conversion limiting applicability.",
    352       "supported": "moderate"
    353     }
    354   ],
    355   "red_flags": [
    356     {
    357       "flag": "Subjective manual paper selection",
    358       "detail": "The survey's core filtering step uses manual selection of 'typical works' without reproducible inclusion/exclusion criteria. This introduces selection bias and makes the survey non-reproducible. The authors acknowledge 'possible omissions from the manual review' but do not discuss how this affects their findings."
    359     },
    360     {
    361       "flag": "No quality assessment of source papers",
    362       "detail": "The survey does not assess the methodological quality of the 140 reviewed papers. All works are treated with equal weight regardless of their rigor, so findings from weak and strong studies are presented as equally reliable. The proposed evaluation criteria (Table 1) are not applied as a quality assessment."
    363     },
    364     {
    365       "flag": "Preliminary experiments lack statistical rigor",
    366       "detail": "Tables 5, 7, and 9 present quantitative findings from preliminary experiments without error bars, significance tests, multiple runs, or variance reporting. The claim that 'Code Abs does not significantly improve repair performance' (Finding 7) is made without any statistical significance test."
    367     },
    368     {
    369       "flag": "No survey limitations discussed",
    370       "detail": "The paper lacks a limitations section for its own survey methodology. Section 7 discusses challenges in the APR field, but does not address the survey's own potential biases, coverage gaps, or methodological limitations."
    371     },
    372     {
    373       "flag": "Incomplete data pipeline documentation",
    374       "detail": "The paper collection pipeline (Section 2) omits intermediate counts: starting from 405+401+additional papers, the final count is 140, but the counts after deduplication and after short-paper filtering are not reported, making it impossible to assess the filtering funnel."
    375     }
    376   ],
    377   "cited_papers": [
    378     {
    379       "title": "Less training, more repairing please: revisiting automated program repair via zero-shot learning",
    380       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    381       "year": 2022,
    382       "relevance": "Proposes AlphaRepair, demonstrating LLM zero-shot learning for program repair using CodeBERT, outperforming all prior APR tools on Defects4J."
    383     },
    384     {
    385       "title": "VulRepair: a T5-based automated software vulnerability repair",
    386       "authors": ["Michael Fu", "Chakkrit Tantithamthavorn", "Trung Le", "Van Nguyen", "Dinh Phung"],
    387       "year": 2022,
    388       "relevance": "Fine-tunes CodeT5 for automated vulnerability repair, demonstrating LLM fine-tuning as an effective approach for security bug fixing."
    389     },
    390     {
    391       "title": "CIRCLE: continual repair across programming languages",
    392       "authors": ["Wei Yuan", "Quanjun Zhang", "Tieke He", "Chunrong Fang"],
    393       "year": 2022,
    394       "relevance": "Uses continual learning to achieve cross-language program repair in a single model, addressing multi-language repair scalability."
    395     },
    396     {
    397       "title": "Repair is nearly generation: Multilingual program repair with llms",
    398       "authors": ["Harshit Joshi", "José Cambronero", "Sumit Gulwani", "Vu Le"],
    399       "year": 2022,
    400       "arxiv_id": "2208.11640",
    401       "relevance": "Demonstrates LLM-based multilingual program repair using Codex with few-shot learning across 6 programming languages."
    402     },
    403     {
    404       "title": "Repairing Bugs in Python Assignments Using Large Language Models",
    405       "authors": ["Jialu Zhang", "José Cambronero", "Sumit Gulwani", "Vu Le"],
    406       "year": 2022,
    407       "arxiv_id": "2209.14876",
    408       "relevance": "Proposes MMAPR system using Codex with multi-modal prompts for automated program error repair, demonstrating LLM prompt engineering for repair."
    409     },
    410     {
    411       "title": "Practical Program Repair in the Era of Large Pre-trained Language Models",
    412       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    413       "year": 2022,
    414       "arxiv_id": "2210.14179",
    415       "relevance": "Large-scale empirical study exploring LLM capabilities for practical program repair in zero/few-shot settings."
    416     },
    417     {
    418       "title": "Examining Zero-Shot Vulnerability Repair with Large Language Models",
    419       "authors": ["Hammond Pearce", "Benjamin Tan", "Baleegh Ahmad", "Ramesh Karri", "Brendan Dolan-Gavitt"],
    420       "year": 2022,
    421       "relevance": "Empirical study of LLM zero-shot capability for security vulnerability repair, relevant to understanding LLM limitations in code security."
    422     },
    423     {
    424       "title": "CURE: Code-Aware Neural Machine Translation for Automatic Program Repair",
    425       "authors": ["Nan Jiang", "Thibaud Lutellier", "Lin Tan"],
    426       "year": 2021,
    427       "relevance": "Introduces pre-trained programming language model with code-aware beam search for program repair, pioneering the pre-training+fine-tuning paradigm in APR."
    428     },
    429     {
    430       "title": "SapFix: automated end-to-end repair at scale",
    431       "authors": ["Alexandru Marginean", "Johannes Bader", "Satish Chandra"],
    432       "year": 2019,
    433       "relevance": "First automated end-to-end program repair deployed at scale in Meta's industrial CI/CD environment."
    434     },
    435     {
    436       "title": "Getafix: learning to fix bugs automatically",
    437       "authors": ["Johannes Bader", "Andrew Scott", "Michael Pradel", "Satish Chandra"],
    438       "year": 2019,
    439       "relevance": "Facebook's industrial APR tool using static analysis instead of test cases, demonstrating practical deployment of automated repair."
    440     },
    441     {
    442       "title": "Neural Transfer Learning for Repairing Security Vulnerabilities in C Code",
    443       "authors": ["Zimin Chen", "Steve Kommrusch", "Martin Monperrus"],
    444       "year": 2022,
    445       "relevance": "Proposes VRepair using transfer learning from bug repair to vulnerability repair, demonstrating cross-domain knowledge transfer for code fixing."
    446     },
    447     {
    448       "title": "TFix: Learning to Fix Coding Errors with a Text-to-Text Transformer",
    449       "authors": ["Berkay Berabi", "Jingxuan He", "Veselin Raychev", "Martin T. Vechev"],
    450       "year": 2021,
    451       "relevance": "Uses pre-trained T5 model fine-tuned for JavaScript error repair with multi-task learning, demonstrating NL model transfer to PL repair."
    452     },
    453     {
    454       "title": "A syntax-guided edit decoder for neural program repair",
    455       "authors": ["Qihao Zhu", "Zeyu Sun", "Yuan-an Xiao", "Wenjie Zhang"],
    456       "year": 2021,
    457       "relevance": "Proposes Recoder, the first DL-based APR approach outperforming traditional techniques on Defects4J, using syntax-guided decoding."
    458     },
    459     {
    460       "title": "An extensive study on pre-trained models for program understanding and generation",
    461       "authors": ["Zhengran Zeng", "Hanzhuo Tan", "Haotian Zhang", "Jing Li"],
    462       "year": 2022,
    463       "relevance": "Large-scale study comparing pre-trained code models (CodeBERT, GraphCodeBERT, PLBART, CodeT5) on program understanding and generation tasks."
    464     }
    465   ],
    466   "engagement_factors": {
    467     "practical_relevance": {
    468       "score": 1,
    469       "justification": "Survey provides a useful taxonomy and comparison of APR tools, but is not directly usable as a tool or technique by practitioners."
    470     },
    471     "surprise_contrarian": {
    472       "score": 1,
    473       "justification": "Reveals dataset quality and overlap problems in APR research that are somewhat surprising, but largely confirms known trends about LLM dominance."
    474     },
    475     "fear_safety": {
    476       "score": 0,
    477       "justification": "No AI safety or security concerns raised; focused on software repair methodology."
    478     },
    479     "drama_conflict": {
    480       "score": 1,
    481       "justification": "Points out unfair experimental comparisons, dataset overlap, and 20.71% invalid data in CoCoNut, but frames these diplomatically as findings rather than accusations."
    482     },
    483     "demo_ability": {
    484       "score": 0,
    485       "justification": "No code, demo, or tool released; purely a literature review paper."
    486     },
    487     "brand_recognition": {
    488       "score": 0,
    489       "justification": "Authors are from Xidian University and NTU; no famous lab or product brand recognition."
    490     }
    491   }
    492 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs