scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (27394B)
      1 {
      2   "paper": {
      3     "title": "APPATCH: Automated Adaptive Prompting Large Language Models for Real-World Software Vulnerability Patching",
      4     "authors": ["Yu Nong", "Haoran Yang", "Long Cheng", "Hongxin Hu", "Haipeng Cai"],
      5     "year": 2024,
      6     "venue": "USENIX Security 2025",
      7     "arxiv_id": "2408.13597",
      8     "doi": ""
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": true,
     15         "justification": "The paper states 'The source code and documentation of APPATCH as well as our experimental results have been made available at https://zenodo.org/records/14741018' in the Open Science section."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper states that experimental results have been made available at the Zenodo repository. Additionally, the exemplar datasets are drawn from publicly available PatchDB and CVEFixes, and the Zero-Day dataset was collected from publicly disclosed CVEs."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided in the paper. The paper mentions using Joern and four LLM APIs but does not specify library versions or dependency details beyond the LLM model names."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper describes the algorithmic design in detail but does not include step-by-step reproduction instructions with specific commands to run. The Zenodo link is provided but the paper itself contains no README-style instructions."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "All results in Tables 2-7 are reported as single point estimates (e.g., '36.46% F1') with no confidence intervals, error bars, or uncertainty measures."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes numerous comparative claims (e.g., 'APPATCH achieves the best overall effectiveness') based solely on comparing numbers without any statistical significance tests such as p-values, t-tests, or bootstrap tests."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports percentage improvements with baseline context. For example, 'up to 28.33% in F1 and 182.26% in recall over the best baseline' in the abstract, and Tables 2-4 provide full baseline comparisons allowing readers to assess effect magnitude."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The Zero-Day dataset has 97 samples and ExtractFix has 20 samples, but the paper does not justify why these sizes are adequate for the claims being made. The limitations section acknowledges 'we only collect 306+76+20 patching samples for evaluation' but does not provide a power analysis or formal justification."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. The paper does not state whether results are from single runs or averaged over multiple runs. LLM outputs are non-deterministic, making variance reporting important."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper includes multiple baselines: standard prompting, zero-shot completion (Pearce et al.), direct reasoning, random exemplars, manual exemplars (prior work), and four non-LLM-based tools (VulRepair, Getafix, ExtractFix, VulnFix)."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The baselines include recent techniques: VulRepair (2022), VulnFix (2022), ExtractFix (2021), and the state-of-the-art zero-shot completion approach by Pearce et al. (2023 S&P). The paper also uses its own prior work (Manual Exemplars, arXiv 2024) as a baseline."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Comprehensive ablation study in RQ2 (Section 5.2, Table 2): removes multi-faceted patch validation ('No Validation'), semantics-aware scoping ('No Slicing'), dynamic adaptive prompting ('Random Exemplars'), automated exemplar generation ('Manual Exemplars'), and vulnerability semantics reasoning ('Direct Reasoning')."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper uses multiple metrics: four correctness categories (SynEq, SemEq, Plausible, Correct), three measures (Recall, Precision, F1), as well as time cost, token usage, and API cost (Table 5)."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper states 'we manually check each generated patch' (Section 5, Metrics). An inter-rater agreement/consensus procedure was applied for labeling vulnerability manifestation locations: 'each author independently labeled them, followed by cross-checking outcomes and discussions to resolve disagreements.'"
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The Zero-Day dataset was specifically collected with vulnerabilities disclosed after the latest LLM cutoff date (04/2024) to avoid data leakage. The exemplar pool (training) and testing sets are explicitly separated with no overlap confirmed."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Results are broken down per model (GPT-4, Gemini-1.5, Claude-3.5, Llama-3.1), per dataset (Zero-Day, ExtractFix), per correctness category (SynEq, SemEq, Plausible, Correct), and per CWE in Table 8. Table 3 provides breakdowns for interprocedural samples specifically."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 6.3 'How and Why APPATCH Fails' provides detailed failure analysis with symptom groups (incorrect vulnerability identification 35.69%, insufficient code modification 31.23%), root causes (misunderstanding vulnerabilities 41.33%, inadequate context analysis 25.21%), and concrete examples in Figures 10-11."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports that code-specific LLMs (CodeLlama, CodeQwen-1.5, DeepSeek-Coder-V2) performed very poorly with APPATCH (1.21%-10.12% F1), which was unexpected given they are code-specific. The failure analysis in Section 6.3 documents where APPATCH falls short."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims 'up to 28.33% in F1 and 182.26% in recall over the best baseline' are supported by Table 2 data. For example, Claude-3.5 APPATCH achieves 36.46% F1 on Zero-Day vs. Manual Exemplars' 28.41% for Claude-3.5 (28.33% improvement). Recall improvements similarly match."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper makes causal claims through ablation studies (e.g., removing slicing reduces F1, removing validation reduces precision). The ablation design is controlled single-variable manipulation across all four models and both datasets, providing adequate causal evidence for component contributions."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The paper tests only on C language with 6 CWE types, yet the title says 'Real-World Software Vulnerability Patching' without language or CWE qualification. Section 6.5 discusses extensibility to other languages/CWEs but admits the evaluation is limited to C. The abstract claim of 'real-world vulnerabilities' is broader than the tested setting."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "Section 6.1 discusses why APPATCH works (three specific advantages), Section 6.3 analyzes failure root causes, and Appendix E addresses the alternative explanation that ExtractFix results might be due to data leakage rather than APPATCH's effectiveness. The paper considers confounds like token limitations and LLM non-determinism."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "Table 1 specifies exact model versions: gemini-1.5-pro, claude-3.5-sonnet, gpt-4-turbo, llama-3.1-70b, along with parameter counts, max tokens, vendor, release dates, and cutoff dates."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper provides the actual prompt templates used for root cause generation (Section 3.3.2), exemplar selection (Section 3.4.2), patch generation (Section 3.4.2), and patch validation (Section 3.4.3). These include the actual text with placeholders and concrete fill examples in Figures 5-8."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "The paper does not report temperature, top-p, or other sampling parameters used with the LLM APIs. This is a significant omission since LLM output is sensitive to these settings and the paper acknowledges LLMs are 'subject to hallucination and non-deterministic responses.'"
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The agentic scaffolding is described in detail: Algorithms 1-3 provide pseudocode for semantics-aware scoping, dynamic adaptive prompting (with progressive context expansion), and multi-faceted patch validation. The workflow includes tool use (Joern for SDG construction), iterative LLM querying, and cross-validation across models."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 5 (Datasets) describes the preprocessing: selecting samples from PatchDB and CVEFixes for the most popular CWEs, filtering out inappropriate samples via manual inspection, removing overlapping samples, and confirming no overlap between training and testing sets. They start with PatchDB (12073) + CVEFixes (4120) and filter down to 306 exemplar samples."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 6.4 'Limitations' provides a dedicated subsection discussing two main limitations: inaccurate root cause analysis at testing time and limited dataset size. Additional limitations are discussed regarding dataset quality and compilability requirements."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The limitations are specific to this study: 'the root cause analysis may not be accurate at the testing phase' (specific to APPATCH's design), 'we only collect 306+76+20 patching samples for evaluation, because collecting these samples require manual works to label' (specific sample size constraint), and dataset limitations regarding compilability and test cases."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 6.5 (Extensibility) explicitly states 'the current implementation and the evaluation are based on C language with six common CWEs.' The paper also states that ExtractFix and VulnFix comparisons are limited because 'these samples are not compilable' for the Zero-Day dataset. Section 6.4 explicitly notes the datasets lack exploits and dynamic tracing."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The Zenodo repository (https://zenodo.org/records/14741018) contains 'the codebase, datasets, and experimental results' according to the Open Science section. The exemplar datasets PatchDB and CVEFixes are also publicly available."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 5 describes data collection in detail: exemplars from PatchDB and CVEFixes filtered by CWE type, Zero-Day dataset collected from CVEs reported after April 2024, ExtractFix dataset (20 reproducible vulnerabilities). The inter-rater agreement procedure for labeling is also described."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were recruited for this study. The manual evaluation was performed by the authors themselves, and the end-to-end experiment used one graduate student for simulating developer inspection. This is a benchmark evaluation, not a human subjects study."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The data pipeline is documented: PatchDB (12073) + CVEFixes (4120) → filter by 6 CWEs → manual inspection to remove inappropriate samples → inter-rater agreement → remove overlaps → 306 exemplar samples. The Zero-Day collection is also described with temporal constraints (post April 2024 disclosure) and project coverage (18 projects, 97 samples)."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgments section lists specific grants: Army Research Office (ARO) Grant W911NF-21-1-002, NSF grants CCF-2146233, CCF-2505223, 2239605, 2228616, 2228617, 2120369, 2129164, and Office of Naval Research (ONR) Grant N000142212111."
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly stated: University at Buffalo, Washington State University, and Clemson University. None of the authors are affiliated with the LLM vendors being evaluated (OpenAI, Google, Anthropic, Meta)."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The funders are government agencies (ARO, NSF, ONR) that have no direct financial stake in whether APPATCH outperforms specific LLMs or vulnerability patching tools."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests statement is provided in the paper. While the Ethics Considerations section exists, it does not include a declaration of financial interests or competing interests."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Table 1 explicitly states the cutoff dates for each model: Gemini-1.5 (Nov. 2023), Claude-3.5 (Apr. 2024), GPT-4 (Oct. 2023), and Llama-3.1 (Dec. 2023)."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "The paper extensively discusses potential data leakage. The Zero-Day dataset is specifically designed to contain only post-cutoff vulnerabilities. Appendix E provides a detailed data leakage assessment for the ExtractFix dataset using three progressively informative queries (Table 11), finding LLMs mostly could not recall vulnerability information."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": true,
    231         "justification": "The Zero-Day dataset was specifically constructed to contain vulnerabilities disclosed after April 2024 (the latest cutoff date) to eliminate contamination. For the ExtractFix dataset (released 2021, before cutoffs), the paper conducted explicit leakage tests (Appendix E) and honestly acknowledges 'we can not ensure that there was no leakage.'"
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "This is not a human subjects study. The manual evaluation was performed by the paper's authors, not recruited participants."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants were recruited. The Ethics Considerations section addresses responsible disclosure and security risks rather than human subjects ethics."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants were recruited for this study. The one graduate student mentioned in the end-to-end experiment is briefly characterized (three years of relevant experience), but this is not a human subjects study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "Not a human subjects study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "Not a human subjects study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "Not a human subjects study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "Not a human subjects study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Table 5 reports per-sample costs: GPT-4 at $0.0823, Claude-3.5 at $0.0336, Gemini-1.5 at $0.0026, and Llama-3.1 at $0.0203. Time costs (37-50 seconds), context tokens (5,684-6,802), and generated tokens (584-886) are also reported per sample."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "While per-sample costs are reported, the total computational budget (total API spend across all experiments, total GPU hours for Llama-3.1, total time for the full evaluation) is not stated. With 97+20 test samples across 4 LLMs and multiple ablation conditions, the total spend is not quantified."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "APPATCH achieves up to 36.46% and 73.86% F1 on the Zero-Day and ExtractFix datasets, outperforming other prompting approaches.",
    287       "evidence": "Table 2 shows Claude-3.5 with APPATCH achieves 36.46% F1 on Zero-Day and 73.86% F1 on ExtractFix for the Correct metric, which is the highest among all prompting approaches for each respective LLM.",
    288       "supported": "strong"
    289     },
    290     {
    291       "claim": "APPATCH outperforms state-of-the-art non-LLM-based techniques by up to 182.26% in recall on the Zero-Day dataset.",
    292       "evidence": "Table 4 shows APPATCH (Claude-3.5) achieves 49.48% recall vs. VulRepair's 17.53% on the Zero-Day dataset. (49.48-17.53)/17.53 = 182.26% improvement.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Each component of APPATCH substantially contributes to its effectiveness.",
    297       "evidence": "Table 2 ablation results show consistent F1 drops when removing each component: No Validation, No Slicing, Random Exemplars, Manual Exemplars, and Direct Reasoning all show lower F1 than full APPATCH across all four LLMs and both datasets.",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "APPATCH is efficient, taking on average 37-50 seconds and 5,684-6,802 context tokens per patch.",
    302       "evidence": "Table 5 provides per-sample time and token costs across all four LLMs. Time ranges from 37.148s (Gemini-1.5) to 50.209s (Claude-3.5), context tokens from 5,684 to 6,802.",
    303       "supported": "strong"
    304     },
    305     {
    306       "claim": "The vulnerability semantics reasoning approach achieves higher correct reasoning rates than alternatives.",
    307       "evidence": "Table 10 shows Vulnerability Semantics Reasoning achieves 63.41%-82.64% correct reasoning rates across models, vs. Direct Reasoning (50.13%-74.18%) and No Slicing (51.97%-79.41%).",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "APPATCH works well in end-to-end integration with CodeQL, achieving 23.85% F1 in the fully automated scenario and 27.29% F1 in the realistic scenario.",
    312       "evidence": "Table 7 shows APPATCH with Claude-3.5 achieves 23.85% F1 in the fully automated scenario and 27.29% in the realistic scenario, outperforming all baseline approaches in both settings.",
    313       "supported": "moderate"
    314     }
    315   ],
    316   "methodology_tags": ["benchmark-eval"],
    317   "key_findings": "APPATCH introduces an automated adaptive prompting framework for LLM-based vulnerability patching that uses vulnerability semantics reasoning (program slicing to focus on vulnerability-relevant code), dynamic exemplar selection based on root cause similarity, and multi-faceted patch validation across multiple LLMs. Evaluated on 97 zero-day and 20 existing vulnerabilities across four LLMs (GPT-4, Gemini-1.5, Claude-3.5, Llama-3.1), APPATCH achieves up to 36.46% F1 on zero-day vulnerabilities and 73.86% F1 on existing vulnerabilities, substantially outperforming both standard prompting approaches and non-LLM-based patching tools. The ablation study demonstrates that each component (semantics-aware scoping, dynamic adaptive prompting, vulnerability semantics reasoning, multi-faceted validation) contributes meaningfully to performance.",
    318   "red_flags": [
    319     {
    320       "flag": "No variance or uncertainty reporting",
    321       "detail": "All results are single point estimates with no error bars, confidence intervals, or repeated-run variance. LLM outputs are inherently non-deterministic, so single-run results may not be reliable. The paper does not state whether experiments were run once or multiple times."
    322     },
    323     {
    324       "flag": "No statistical significance tests",
    325       "detail": "Comparative claims across many conditions are made purely by comparing point estimates. With small test sets (97 and 20 samples), observed differences could be due to chance."
    326     },
    327     {
    328       "flag": "Very small ExtractFix test set",
    329       "detail": "The ExtractFix dataset has only 20 samples. At this sample size, each sample shifts recall by 5 percentage points, making precise performance comparisons unreliable. Yet the paper draws strong comparative conclusions from this dataset."
    330     },
    331     {
    332       "flag": "Missing hyperparameters",
    333       "detail": "Temperature and other sampling parameters for the four LLM APIs are not reported. These significantly affect output quality and reproducibility, especially given the acknowledged non-determinism of LLMs."
    334     },
    335     {
    336       "flag": "Manual evaluation by authors",
    337       "detail": "Patch correctness was manually evaluated by the authors themselves (inter-rater agreement procedure among authors). This introduces potential bias since the developers of APPATCH are also its evaluators. Independent evaluators were not used."
    338     }
    339   ],
    340   "cited_papers": [
    341     {
    342       "title": "Examining zero-shot vulnerability repair with large language models",
    343       "authors": ["Hammond Pearce", "Benjamin Tan", "Baleegh Ahmad", "Ramesh Karri", "Brendan Dolan-Gavitt"],
    344       "year": 2023,
    345       "relevance": "Key baseline for LLM-based vulnerability patching using zero-shot code completion approach, published at S&P 2023."
    346     },
    347     {
    348       "title": "VulRepair: A T5-based automated software vulnerability repair",
    349       "authors": ["Michael Fu", "Chakkrit Tantithamthavorn", "Trung Le", "Van Nguyen", "Dinh Phung"],
    350       "year": 2022,
    351       "relevance": "State-of-the-art deep learning-based vulnerability repair baseline using fine-tuned CodeT5."
    352     },
    353     {
    354       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    355       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    356       "year": 2022,
    357       "relevance": "Foundational prompting technique that APPATCH builds upon for vulnerability semantics reasoning."
    358     },
    359     {
    360       "title": "How effective are neural networks for fixing security vulnerabilities",
    361       "authors": ["Yi Wu", "Nan Jiang", "Hung Viet Pham"],
    362       "year": 2023,
    363       "relevance": "Evaluates effectiveness of neural networks for vulnerability repair, directly relevant to understanding LLM limitations in security patching."
    364     },
    365     {
    366       "title": "Chain-of-thought prompting of large language models for discovering and fixing software vulnerabilities",
    367       "authors": ["Yu Nong", "Mohammed Aldeen", "Long Cheng"],
    368       "year": 2024,
    369       "arxiv_id": "2402.17230",
    370       "relevance": "Prior work by same authors serving as the Manual Exemplars baseline; demonstrates CoT prompting for vulnerability patching."
    371     },
    372     {
    373       "title": "Large language models for code: Security hardening and adversarial testing",
    374       "authors": ["Jingxuan He", "Martin Vechev"],
    375       "year": 2023,
    376       "relevance": "Studies LLMs for secure code generation, relevant to understanding LLM capabilities in security-related code tasks."
    377     },
    378     {
    379       "title": "LLMs cannot reliably identify and reason about security vulnerabilities (yet?): A comprehensive evaluation, framework, and benchmarks",
    380       "authors": ["Saad Ullah", "Mingji Han", "Saurabh Pujar"],
    381       "year": 2024,
    382       "relevance": "Comprehensive evaluation of LLM limitations in vulnerability identification and reasoning, directly relevant to understanding APPATCH's challenges."
    383     },
    384     {
    385       "title": "Enhancing static analysis for practical bug detection: An LLM-integrated approach",
    386       "authors": ["Haonan Li", "Yu Hao", "Yizhuo Zhai", "Zhiyun Qian"],
    387       "year": 2024,
    388       "relevance": "Demonstrates LLM-integrated static analysis approach similar to APPATCH's progressive prompting scheme, published at OOPSLA."
    389     },
    390     {
    391       "title": "Large language model for vulnerability detection and repair: Literature review and roadmap",
    392       "authors": ["Xin Zhou", "Sicong Cao", "Xiaobing Sun", "David Lo"],
    393       "year": 2024,
    394       "relevance": "Survey paper covering the LLM-based vulnerability detection and repair landscape, contextualizing APPATCH's contributions."
    395     },
    396     {
    397       "title": "Beyond tests: Program vulnerability repair via crash constraint extraction",
    398       "authors": ["Xiang Gao", "Bo Wang", "Gregory J Duck"],
    399       "year": 2021,
    400       "relevance": "ExtractFix technique and dataset used as a key baseline and evaluation benchmark in APPATCH's experiments."
    401     },
    402     {
    403       "title": "Program vulnerability repair via inductive inference",
    404       "authors": ["Yuntong Zhang", "Xiang Gao", "Gregory J Duck", "Abhik Roychoudhury"],
    405       "year": 2022,
    406       "relevance": "VulnFix state-of-the-art vulnerability patching baseline compared against APPATCH."
    407     }
    408   ]
    409 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs