scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (32417B)
      1 {
      2   "paper": {
      3     "title": "LLM4CVE: Enabling Iterative Automated Vulnerability Repair with Large Language Models",
      4     "authors": [
      5       "Mohamad Fakih",
      6       "Rahul Dharmaji",
      7       "Halima Bouzidi",
      8       "Gustavo Quiros Araya",
      9       "Oluwatosin Ogundare",
     10       "Mohammad Abdullah Al Faruque"
     11     ],
     12     "year": 2025,
     13     "venue": "Euromicro Symposium on Digital Systems Design",
     14     "arxiv_id": "2501.03446",
     15     "doi": "10.1109/DSD67783.2025.00087"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval", "case-study"],
     20   "key_findings": "LLM4CVE proposes an iterative pipeline combining prompt engineering, CodeBLEU-based feedback, and LoRA fine-tuning for automated vulnerability repair of C code. Llama 3 70B with the full pipeline achieves an 8.51/10 human-verified correctness score and a 20% improvement in CodeBLEU similarity to ground-truth fixes over the unguided baseline. The iterative feedback mechanism consistently improves output quality across all four tested LLMs (GPT-3.5, GPT-4o, Llama 3 8B, Llama 3 70B). End-to-end compilation was demonstrated on a single real-world vulnerability (CVE-2016-4303 in iperf3).",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Section 9 states 'We publish our testing apparatus, fine-tuned weights, and experimental data on our website' with a URL (https://sites.google.com/view/llm4cve) provided in footnotes."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The primary dataset is CVEFixes, a publicly available dataset (Section 5.1, reference [110]). The authors also state they publish their experimental data on their website."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Section 5 mentions 'multiple compute nodes equipped with one Nvidia A100, 48 CPU cores, and 256GB of system memory' but no software dependencies, library versions, requirements.txt, or Dockerfile are provided."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided in the paper. The website is referenced but the paper itself contains no README-style reproduction guide."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Figure 6 reports CodeBLEU scores as point estimates without error bars or confidence intervals. Human quality scores in Table 4 are also single values with no uncertainty measures."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims improvements (e.g., '20% increase in semantic similarity') based solely on comparing raw numbers. No statistical significance tests (t-tests, Mann-Whitney, etc.) are applied to any comparison."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Figure 6 reports percentage improvements with baseline context: '+20.01%' and '+8.24%' increases in CodeBLEU scores. Human scores are reported on a 1-10 scale with baseline (ground truth) comparison in Table 4."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The dataset contains 697 unique CVEs after filtering, but no justification is given for why this sample size is adequate. The 'guided+feedback' configuration uses a 50% random sample without justification. No power analysis is provided."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No standard deviations, variance, or spread measures are reported for any experimental results. All results appear to be single-run numbers."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Three pipeline configurations serve as baselines: 'unguided' (zero-shot), 'guided' (one-shot), and 'guided+feedback' (one-shot with feedback). Results are compared across all four LLMs (Table 3, Figure 6)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "The baselines are only the paper's own pipeline variants. No comparison is made against existing vulnerability repair tools (VRepair, VulRepair, CREAM, etc.) that are discussed extensively in Section 3. The paper lacks any external baseline."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The three configurations (unguided → guided → guided+feedback) function as an ablation, progressively adding prompt engineering and iterative feedback. Results across all configurations are reported in Figure 6."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 5.8 describes four metrics: CodeBLEU scores (Section 6.1), human quality scores (Section 6.2), end-to-end compilation (Section 6.3), and engineering effort (Section 6.4)."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Section 6.2 reports a human study where participants rated patches on vulnerability elimination (1-10) and code style (1-10). GPT-4o and Llama 3 70B outputs plus ground truth were evaluated (Table 4)."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Section 5.4 states 'We employ an 90/10 train/test split to ensure sufficient data is available for our evaluation' for the LoRA fine-tuning experiments."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": false,
    105         "justification": "Table 1 lists 8 CWE categories with counts, but the results in Figure 6 and Table 4 are aggregated across all CWEs. No per-CWE performance breakdown is provided despite the paper evaluating 8 distinct vulnerability types."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "Section 7.2 shows a successful patch example but no failure cases are discussed. The paper mentions ~5% malformed outputs (Section 5.7) but does not analyze where the pipeline fails to produce correct fixes."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "Every experiment shows improvement from unguided to guided to guided+feedback. No approaches that were tried and abandoned, or configurations that hurt performance, are reported."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims 'human-verified quality score of 8.51/10' (confirmed in Table 4 for Llama 3 70B correctness) and '20% increase in ground-truth code similarity' (confirmed in Figure 6 for Llama 3 70B guided+feedback vs unguided)."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
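    127         "justification": "The paper's main causal claim is that the iterative feedback mechanism improves code quality. The ablation design (unguided → guided → guided+feedback), which adds one component at a time, supports this, showing consistent improvement across models. One caveat: the guided+feedback configuration was run on a 50% random sample of the dataset (Section 5.5), so the final comparison is not strictly like-for-like."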
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title claims 'Automated Vulnerability Repair' broadly, but the evaluation is limited to C code, 8 CWE types, and the CVEFixes dataset. The abstract claims the pipeline 'robustly fixes vulnerable functions in real-world code' without bounding to the tested setting."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No alternative explanations for the results are discussed. For example, the CodeBLEU improvement could be partly due to the feedback loop encouraging more conservative (closer to input) changes rather than better fixes, but this is not considered."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Section 6.1 acknowledges 'a candidate patch with a CodeBLEU score less than 1.0 can still be a viable fix, as there are often multiple solutions to the given vulnerability.' The paper supplements CodeBLEU with human evaluation and end-to-end compilation to address this proxy gap."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Section 5.3 states 'GPT-3.5-Turbo and GPT-4o models available from OpenAI' and 'Llama 3 8B' and 'Llama 3 70B' without specific version dates or snapshot identifiers. No API version dates are provided."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Section 4.3 describes the 'guided' and 'unguided' prompt types in natural language (e.g., 'provides the name of the CVE and CWE, a description of both') but the actual prompt text is never provided in the paper or appendix."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No LLM inference hyperparameters (temperature, top-p, max tokens) are reported. For LoRA training, no learning rate, rank, alpha, or number of epochs are stated. Only the 90/10 train/test split and token limit of 500 are mentioned."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 4 and Figure 4 describe the iterative pipeline: code generation → CodeBLEU feedback → re-prompting with feedback, limited to two iterations (Section 4.5). The feedback mechanism and code extraction logic (Section 5.7) are detailed."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.2 documents the preprocessing pipeline: extracting function-level changes from CVEFixes, filtering by CWE, selecting before/after pairs, filtering by token count (≤500), resulting in 8 CWEs with ≥100 pairs representing 697 unique CVEs."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "There is no dedicated limitations section. The introduction mentions 'known limitations and future improvements' in Sections 7-8, but Section 7 covers Discussion (impact, examples, GPT vs. Llama, ethics) and Section 8 is a brief conclusion with no substantive limitations discussion."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "No threats to validity are discussed. There is no mention of specific risks such as the representativeness of CVEFixes, the adequacy of CodeBLEU as a metric, or the generalizability beyond C code."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "While the paper targets C code and 8 CWEs, it does not explicitly state what the results do NOT show. The broad framing ('robustly fixes vulnerable functions in real-world code') lacks explicit scope boundaries."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": true,
    193         "justification": "CVEFixes is a publicly available dataset (reference [110]). Section 9 states the authors publish their experimental data on their website, potentially including raw outputs."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 5.1 describes CVEFixes as the primary dataset and Section 5.2 details the extraction and filtering pipeline from the SQL database through to the final set of 697 CVEs across 8 CWE types."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "The human study (Section 6.2) describes participants as having 'at least several years of experience in programming' but does not describe how they were recruited, how many participated, or from what population they were drawn."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 5.2 documents the pipeline: CVEFixes SQL → function-level extraction by CVE/language → CWE filtering (excluding noinfo/other) → before/after pair matching → token count filtering (≤500) → final 8 CWEs with ≥100 pairs."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding disclosure or acknowledgments section is present in the paper, despite two authors being affiliated with Siemens Technology (a commercial entity)."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: University of California Irvine (academic) and Siemens Technology, Princeton (industry), with corresponding email addresses."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "Siemens Technology, which employs two co-authors, has a commercial interest in automated vulnerability repair for industrial systems. No statement of funder independence is provided, and no funding is disclosed."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is provided anywhere in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": true,
    236         "answer": false,
    237         "justification": "No training data cutoff dates are stated for any of the four LLMs used (GPT-3.5, GPT-4o, Llama 3 8B, Llama 3 70B). This is critical since CVEFixes contains publicly available vulnerability data."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": true,
    241         "answer": false,
    242         "justification": "No discussion of whether CVEFixes data (derived from public open-source repositories and CVE databases) appeared in the training data of the LLMs. These fixes are publicly available on GitHub and could easily be in training data."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": true,
    246         "answer": false,
    247         "justification": "CVEFixes contains publicly available code fixes from open-source projects, published well before the training cutoffs of GPT-4o and Llama 3. The contamination risk is high and completely unaddressed."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": true,
    253         "answer": false,
    254         "justification": "No mention of pre-registration for the human evaluation study described in Section 6.2."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": true,
    258         "answer": true,
    259         "justification": "Section 6.2 footnote states: 'We received prior approval to conduct this study from an institutional IRB through an exemption due to the strictly academic nature of our questionnaire.'"
    260       },
    261       "demographics_reported": {
    262         "applies": true,
    263         "answer": false,
    264         "justification": "Participants are described only as having 'at least several years of experience in programming.' No number of participants, gender distribution, geographic info, specific experience levels, or other demographics are reported."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": true,
    268         "answer": false,
    269         "justification": "The only stated criterion is 'at least several years of experience in programming,' which is vague ('several' is undefined). No exclusion criteria or screening process is described."
    270       },
    271       "randomization_described": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "Participants were shown three patches per function (GPT-4o, Llama 3 70B, ground truth) but no randomization of presentation order is described. The assignment to condition is fixed (all participants see the same patches)."
    275       },
    276       "blinding_described": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "Section 6.2 states: 'Participants understood the purpose of the study – including the presence of one ground-truth patch and two LLM-generated patches per example – but were not told which LLM created each candidate patch.'"
    280       },
    281       "attrition_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No information on how many participants started versus completed the study, or any dropout analysis."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "Table 5 provides rough estimates of setup and execution time (e.g., '5 minutes' for GPT, '10 minutes' for open-source) but these are approximate engineering effort estimates, not measured inference costs, API expenses, or tokens consumed per example."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Hardware is described (Nvidia A100, 48 cores, 256GB RAM) and Table 5 estimates '24 hours' setup for open-source LLMs, but total GPU hours, API spend, and training compute budget are not quantified."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be from single runs with no exploration of result variance across seeds."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The paper uses pass@1 (k=1) as described in Section 5.8, but does not state how many times the pipeline was executed per example or whether results represent single or averaged runs."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search is described for LLM inference settings or LoRA training. The iteration limit of 2 and token limit of 500 appear chosen without documented justification."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": true,
    318         "justification": "All three pipeline configurations (unguided, guided, guided+feedback) and all four models are reported in Figure 6 and discussed. The paper does not selectively report only the best configuration."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The paper compares only its own pipeline configurations without external baselines and does not acknowledge the bias of authors evaluating their own system against only weaker variants of itself."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Models range from 8B to >1760B parameters with vastly different compute requirements, but performance is not analyzed as a function of compute. The cost-performance tradeoff is not discussed."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "CVEFixes is used as the benchmark without discussing whether it representatively captures real-world vulnerability repair scenarios. CodeBLEU as a metric is acknowledged to be imperfect (Section 6.1) but no deeper construct validity analysis is provided for the benchmark itself."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "All four models are evaluated within the same pipeline scaffolding (LLM4CVE), with the same three configurations applied consistently across models, controlling for scaffold effects in model comparisons."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "CVEFixes contains publicly available vulnerability fixes from before the training cutoffs of all tested models. The possibility that LLMs memorized these exact fixes is not discussed."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The 'guided' configuration provides CVE/CWE descriptions as input features. Whether providing the exact CVE identifier enables the model to recall memorized fixes (rather than reasoning about the vulnerability) is not discussed."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "For the LoRA 90/10 split, no analysis is provided of whether train and test examples share structural similarities (e.g., from the same project, same developer, or near-duplicate fixes)."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or temporal splits are applied."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "LLM4CVE achieves a human-verified quality score of 8.51/10 with Llama 3 70B",
    372       "evidence": "Table 4 (Section 6.2) reports a correctness score of 8.51 for the Llama 3 70B guided+feedback configuration, based on a human evaluation study with IRB approval.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "The full pipeline achieves a 20% increase in ground-truth code similarity with Llama 3 70B",
    377       "evidence": "Figure 6 (Section 6.1) shows a CodeBLEU score improvement of +20.01% from the unguided to the guided+feedback configuration for Llama 3 70B.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Each stage of LLM4CVE improves code quality and output coherence across all models",
    382       "evidence": "Figure 6 shows consistent improvement from unguided → guided → guided+feedback across GPT-3.5, GPT-4o, Llama 3 8B, and Llama 3 70B. No statistical tests confirm these differences.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "Llama 3 70B with LoRA fine-tuning matches or outperforms GPT-4o",
    387       "evidence": "Section 7.3 and Figure 6 show Llama 3 70B outperforming GPT-4o in the guided+feedback configuration on CodeBLEU scores.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "The pipeline can fix real-world vulnerabilities end-to-end",
    392       "evidence": "Section 6.3 demonstrates a single end-to-end compilation test on CVE-2016-4303 (CWE-120) in iperf3/cJSON, where the candidate patch prevented exploitation.",
    393       "supported": "weak"
    394     },
    395     {
    396       "claim": "LLM4CVE is the first automated, iterative LLM process for systematically correcting vulnerabilities in code",
    397       "evidence": "Stated in the contributions (Section 1). Prior work on iterative LLM repair exists (e.g., Xia & Zhang 2023, ChatRepair), making this novelty claim questionable.",
    398       "supported": "weak"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No external baselines",
    404       "detail": "Despite discussing VRepair, VulRepair, CREAM, and other prior vulnerability repair methods extensively in Section 3, the paper never compares against any of them. All comparisons are between the paper's own pipeline configurations."
    405     },
    406     {
    407       "flag": "Single end-to-end case study",
    408       "detail": "The end-to-end compilation evaluation (Section 6.3) — the most convincing metric — is demonstrated on only one CVE (CVE-2016-4303). A single case is far too little evidence to support claims about real-world applicability."
    409     },
    410     {
    411       "flag": "Severe contamination risk",
    412       "detail": "CVEFixes contains publicly available vulnerability fixes from open-source projects (GitHub commits). GPT-4o and Llama 3 were trained on internet data that likely includes these exact fixes. The paper never addresses this, making it impossible to know whether the LLMs are reasoning about vulnerabilities or simply recalling memorized patches."
    413     },
    414     {
    415       "flag": "No statistical tests on any comparison",
    416       "detail": "All performance comparisons are based on raw numbers without any statistical significance testing. The claimed 20% improvement could be within noise for all the reader can tell."
    417     },
    418     {
    419       "flag": "Incomplete human study reporting",
    420       "detail": "The human evaluation (Section 6.2) does not report the number of participants, detailed demographics, how many patches were evaluated, or how many functions were included. This makes the 8.51/10 score difficult to interpret."
    421     },
    422     {
    423       "flag": "Guided+feedback uses 50% sample without justification",
    424       "detail": "Section 5.5 states 'we use a random sample consisting of 50% of the full dataset for the guided+feedback configuration' without explaining why, introducing potential selection effects and making results not directly comparable across configurations."
    425     },
    426     {
    427       "flag": "No limitations section",
    428       "detail": "The paper lacks any substantive discussion of limitations, threats to validity, or scope boundaries despite making broad claims about real-world applicability."
    429     }
    430   ],
    431   "cited_papers": [
    432     {
    433       "title": "Pre-trained model-based automated software vulnerability repair: How far are we?",
    434       "authors": ["Quanjun Zhang", "Chunrong Fang", "Bowen Yu", "Weisong Sun", "Tongke Zhang", "Zhenyu Chen"],
    435       "year": 2023,
    436       "relevance": "Directly evaluates pre-trained language models for automated vulnerability repair, providing state-of-the-art baselines this paper should have compared against."
    437     },
    438     {
    439       "title": "VulRepair: a T5-based automated software vulnerability repair",
    440       "authors": ["Michael Fu", "Chakkrit Tantithamthavorn", "Trung Le", "Van Nguyen", "Dinh Phung"],
    441       "year": 2022,
    442       "relevance": "Key prior work using CodeT5 for vulnerability repair that outperformed VRepair; represents the pre-LLM state of the art."
    443     },
    444     {
    445       "title": "Examining zero-shot vulnerability repair with large language models",
    446       "authors": ["Hammond Pearce", "Benjamin Tan", "Baleegh Ahmad", "Ramesh Karri", "Brendan Dolan-Gavitt"],
    447       "year": 2023,
    448       "relevance": "One of the first works on zero-shot LLM vulnerability repair, directly comparable to the unguided configuration of LLM4CVE."
    449     },
    450     {
    451       "title": "Conversational automated program repair",
    452       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    453       "year": 2023,
    454       "arxiv_id": "2301.13246",
    455       "relevance": "Proposes conversational/iterative LLM program repair with test feedback, closely related to LLM4CVE's iterative approach."
    456     },
    457     {
    458       "title": "Keep the conversation going: Fixing 162 out of 337 bugs for $0.42 each using ChatGPT",
    459       "authors": ["Chunqiu Steven Xia", "Lingming Zhang"],
    460       "year": 2023,
    461       "arxiv_id": "2304.00385",
    462       "relevance": "Demonstrates cost-effective iterative LLM bug repair with ChatGPT, relevant to automated program repair scalability."
    463     },
    464     {
    465       "title": "RepairLlama: Efficient representations and fine-tuned adapters for program repair",
    466       "authors": ["André Silva", "Sen Fang", "Martin Monperrus"],
    467       "year": 2023,
    468       "arxiv_id": "2312.15698",
    469       "relevance": "Uses LoRA fine-tuning for program repair with Llama models, directly comparable approach to LLM4CVE's LoRA augmentation."
    470     },
    471     {
    472       "title": "Automated program repair in the era of large pre-trained language models",
    473       "authors": ["Chunqiu Steven Xia", "Yuxiang Wei", "Lingming Zhang"],
    474       "year": 2023,
    475       "relevance": "Comprehensive evaluation of LLM-based automated program repair providing context for the field."
    476     },
    477     {
    478       "title": "Neural transfer learning for repairing security vulnerabilities in C code",
    479       "authors": ["Zimin Chen", "Steve Kommrusch", "Martin Monperrus"],
    480       "year": 2022,
    481       "relevance": "VRepair framework using transfer learning for C vulnerability repair, a direct predecessor approach."
    482     },
    483     {
    484       "title": "Code Llama: Open foundation models for code",
    485       "authors": ["Baptiste Rozière", "Jonas Gehring", "Fabian Gloeckle"],
    486       "year": 2023,
    487       "arxiv_id": "2308.12950",
    488       "relevance": "Foundation code model relevant to LLM-based code generation and repair capabilities."
    489     },
    490     {
    491       "title": "CVEFixes: automated collection of vulnerabilities and their fixes from open-source software",
    492       "authors": ["Guru Bhandari", "Amara Naseer", "Leon Moonen"],
    493       "year": 2021,
    494       "relevance": "The primary dataset used in LLM4CVE and many other vulnerability repair studies."
    495     },
    496     {
    497       "title": "AutoSafeCoder: A multi-agent framework for securing LLM code generation through static analysis and fuzz testing",
    498       "authors": ["Ana Nunez", "Nafis Tanveer Islam", "Sumit Kumar Jha", "Peyman Najafirad"],
    499       "year": 2024,
    500       "relevance": "Multi-agent LLM framework for code security combining generation, static analysis, and fuzzing."
    501     },
    502     {
    503       "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation",
    504       "authors": ["Jiawei Liu", "Chunqiu Steven Xia", "Yuyao Wang", "Lingming Zhang"],
    505       "year": 2023,
    506       "relevance": "Rigorous evaluation of LLM code generation correctness, relevant to understanding LLM code quality limitations."
    507     },
    508     {
    509       "title": "A survey of learning-based automated program repair",
    510       "authors": ["Quanjun Zhang", "Chunrong Fang", "Yuxiang Ma", "Weisong Sun", "Zhenyu Chen"],
    511       "year": 2023,
    512       "relevance": "Comprehensive survey of learning-based APR methods providing taxonomic context for LLM4CVE."
    513     },
    514     {
    515       "title": "InferFix: End-to-end program repair with LLMs",
    516       "authors": ["Matthew Jin", "Syed Shahriar", "Michele Tufano"],
    517       "year": 2023,
    518       "relevance": "End-to-end LLM program repair using static analysis tool outputs, achieving 76.8% repair rate on Java bugs."
    519     }
    520   ],
    521   "engagement_factors": {
    522     "practical_relevance": {
    523       "score": 2,
    524       "justification": "The pipeline addresses a real need (automated vulnerability patching in legacy code) and releases code/weights, though it requires significant setup and is limited to C."
    525     },
    526     "surprise_contrarian": {
    527       "score": 0,
    528       "justification": "Confirms the expected finding that LLMs with fine-tuning and iterative feedback produce better code fixes than zero-shot prompting."
    529     },
    530     "fear_safety": {
    531       "score": 1,
    532       "justification": "Tangentially related to security (fixing vulnerabilities) but does not raise new AI safety concerns; the framing is defensive."
    533     },
    534     "drama_conflict": {
    535       "score": 0,
    536       "justification": "No controversy or provocative claims; straightforward engineering contribution."
    537     },
    538     "demo_ability": {
    539       "score": 1,
    540       "justification": "Code and weights are published on a website but not as a pip-installable tool or live demo; requires LoRA training setup."
    541     },
    542     "brand_recognition": {
    543       "score": 1,
    544       "justification": "Uses well-known models (GPT-4o, Llama 3) and involves Siemens, but the lab and conference are not widely known outside the field."
    545     }
    546   }
    547 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs