scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25427B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM4CVE: Enabling Iterative Automated Vulnerability Repair with Large Language Models",
      6     "authors": [
      7       "Mohamad Fakih",
      8       "Rahul Dharmaji",
      9       "Halima Bouzidi",
     10       "Gustavo Quiros Araya",
     11       "Oluwatosin Ogundare"
     12     ],
     13     "year": 2025,
     14     "venue": "Euromicro Symposium on Digital Systems Design",
     15     "arxiv_id": "2501.03446",
     16     "doi": "10.1109/DSD67783.2025.00087"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The 8.51/10 human correctness score (Table 4) and 20% CodeBLEU improvement for Llama 3 70B (Figure 6) are both substantiated in results; the website code release is referenced with a URL.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The three pipeline configurations (unguided, guided, guided+feedback) constitute an ablation study that isolates each component's contribution, adequately supporting causal claims about iterative feedback and prompt engineering.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The conclusion claims the work 'pave[s] the way towards achieving automated program repair without any intervention from trained experts,' far exceeding the scope of 8 CWEs in C-language function-level snippets from a single dataset.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No alternative explanations are considered — e.g., whether CodeBLEU improvements reflect actual security fixes, or whether human evaluators were biased by knowing they were scoring LLM vs. ground-truth patches.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "CodeBLEU (similarity to ground truth) is the primary metric but the paper claims 'high accuracy' vulnerability repair; only one CVE is tested end-to-end, and the gap between similarity scores and actual vulnerability elimination is not adequately discussed.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations or threats-to-validity section exists; Section 7 (Discussion) briefly notes some constraints but does not constitute a structured limitations treatment.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity are articulated — e.g., CodeBLEU as a proxy, undisclosed participant count in human study, or inconsistent evaluation subsets across configurations.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Constraints (C language, 8 CWEs, function-level snippets under 500 tokens) are described as methodology but never framed as explicit boundaries on what the results do NOT generalize to.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding source is disclosed anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly stated: UC Irvine (EECS) and Siemens Technology (Princeton).",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Siemens Technology employees are co-authors on a vulnerability repair tool that directly aligns with Siemens' commercial interests in industrial/legacy system security; no independence statement is provided.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests declaration is present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms including CVE, CWE, LoRA, PEFT, CodeBLEU, and the three pipeline configurations are defined in the Background and Methodology sections.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Three explicit contributions are listed in the introduction: the automated iterative pipeline, the first iterative LLM correction process for vulnerabilities, and a multi-model evaluation study.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section 3 covers three related areas (classical repair, LLM code generation, LLM-guided repair) and situates LLM4CVE relative to VulRepair, VRepair, AutoSafeCoder, and InferFix with substantive comparison.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "The paper states 'we publish our testing apparatus, fine-tuned weights, and experimental data on our website' (Google Sites URL provided); this is a present-tense release claim, not a future promise.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "CVEFixes, the primary dataset, is a publicly available dataset with a published reference and SQL database.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Hardware is specified (Nvidia A100, 48 CPUs, 256GB RAM) but no software environment specification (requirements.txt, Dockerfile, or dependency list) is provided.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided in the paper; the website is referenced but no structured guide is included in the manuscript.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Figure 6 and Table 4 present point estimates only; no confidence intervals or error bars are reported for any result.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are applied despite multiple comparative claims across pipeline configurations and model types.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Percentage improvements in CodeBLEU scores are reported with baseline context ('+20.01%' for Llama 3 70B, '+8.24%' for GPT-4o in Figure 6).",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The 697 CVEs after filtering and 90/10 train/test split are stated without justification or power analysis.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or spread is reported for CodeBLEU or human quality scores across runs or examples.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Three pipeline configurations serve as ablation baselines: unguided (zero-shot), guided (one-shot), and guided+feedback (full pipeline) across all four models.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "Baselines are only ablations of the authors' own pipeline; no numerical comparison against contemporary external tools like VulRepair, AutoSafeCoder, or InferFix is performed despite their mention in related work.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "The three configurations (unguided, guided, guided+feedback) isolate contributions of CVE/CWE prompt context and the iterative feedback mechanism.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Four metrics are used: CodeBLEU scores, human quality scores (correctness + style), end-to-end compilation success, and engineering effort (time) comparison.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": true,
    206           "justification": "A human study with programmers evaluated vulnerability elimination correctness (scale 1-10) and code style for LLM-generated patches vs. ground truth; IRB exemption obtained.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "A 90/10 train/test split is used for LoRA evaluation; however, the guided+feedback configuration uses only 50% of the dataset while other configurations use 100%, creating an inconsistency.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Results are aggregated across all 8 CWEs; no per-CWE performance breakdown is provided in the main results (Table 1 shows dataset distribution only).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Only format failure rates (~5% malformed responses, <1% no code generated) are noted; no analysis of cases where the pipeline fails to actually fix the targeted vulnerability is presented.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": false,
    230           "justification": "No systematic negative results are reported; GPT-4o's inability to be fine-tuned is noted as a limitation but not framed as a finding.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Model names are provided (GPT-3.5-Turbo, GPT-4o, Llama 3 8B/70B) but no API snapshot dates or specific version identifiers are given for the GPT models.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "The content structure of guided vs. unguided prompts is described but the actual prompt text is not included in the paper.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No generation hyperparameters (temperature, top-p, max tokens) are reported, and LoRA training details (rank, learning rate, epochs) are absent.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The iterative pipeline is described in detail across Sections 4.4-4.6 and Figure 5, including the CodeBLEU-based divergence detection, 2-iteration limit, and candidate extraction logic.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 5.2 documents the CVEFixes preprocessing pipeline including language filtering, CWE exclusion criteria ('NVD-CWE-noinfo'), and 500-token truncation threshold.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "CVEFixes is publicly available and the authors state experimental data is published on their website.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 5.2 describes the extraction and filtering steps from the CVEFixes SQL database in sufficient procedural detail.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": true,
    281           "answer": false,
    282           "justification": "Human study participants are described only as having 'at least several years of experience in programming'; the number of participants, recruitment method, and compensation are not disclosed.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "The full pipeline from CVEFixes extraction through preprocessing, LLM inference, CodeBLEU evaluation, and human assessment is documented across Sections 5.1-5.8.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No training data cutoff is stated for any of the evaluated models (GPT-3.5, GPT-4o, Llama 3 8B/70B).",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether CVEFixes examples (from public open-source repositories) appeared in the training data of the evaluated LLMs.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "CVEFixes contains real-world CVEs from public repositories that may have been in GPT and Llama training corpora; this potential leakage is never addressed.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": true,
    315           "answer": false,
    316           "justification": "No pre-registration is mentioned for the human evaluation study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": true,
    321           "answer": true,
    322           "justification": "IRB exemption is explicitly stated in a footnote: 'We received prior approval to conduct this study from an institutional IRB through an exemption due to the strictly academic nature of our questionnaire.'",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": true,
    327           "answer": false,
    328           "justification": "Only 'at least several years of experience in programming' is stated; participant count, age, gender, and disciplinary background are not reported.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": true,
    333           "answer": false,
    334           "justification": "Only the vague criterion 'several years of experience in programming' is given; formal inclusion/exclusion criteria with operationalized definitions are absent.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": true,
    339           "answer": false,
    340           "justification": "No randomization of patch presentation order or participant assignment is described.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": true,
    345           "answer": false,
    346           "justification": "Evaluators were explicitly told they were scoring one ground-truth patch and two LLM-generated patches per example; only which specific LLM produced each patch was concealed — full blinding was not achieved.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": true,
    351           "answer": false,
    352           "justification": "No participant attrition or dropout is reported.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Table 5 reports execution latency for GPT models (5 minutes) and open-source LLMs+LoRAs (10 minutes) per vulnerability, providing practical latency context.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Hardware is described (Nvidia A100, 48 CPUs, 256GB RAM) but total GPU-hours or monetary compute cost for the full experimental run is not stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "The full LLM4CVE pipeline achieves a 20% improvement in CodeBLEU score for Llama 3 70B compared to zero-shot prompting",
    375       "evidence": "Figure 6 reports '+20.01%' for Llama 3 70B guided+feedback vs. unguided; however, the guided+feedback config uses only 50% of the dataset vs. 100% for unguided",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "LLM4CVE achieves a human-verified vulnerability elimination quality score of 8.51/10 for Llama 3 70B",
    380       "evidence": "Table 4 shows Llama 3 70B guided+feedback correctness of 8.51; the number of human evaluators is never disclosed, making this score uninterpretable statistically",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "LLM4CVE successfully repairs real-world CVEs end-to-end at the project compilation level",
    385       "evidence": "Section 6.3 demonstrates successful patching of a single CVE (CVE-2016-4303 in iperf3/cJSON); this is the only end-to-end compilation test performed",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "Llama 3 70B with LoRA fine-tuning consistently matches or outperforms GPT-4o in vulnerability repair",
    390       "evidence": "Figure 6 shows Llama 3 70B guided+feedback achieving higher CodeBLEU than GPT-4o; LoRA adaptation is only possible for open-source models, creating an asymmetric comparison",
    391       "supported": "moderate"
    392     },
    393     {
    394       "claim": "LLM-based repair reduces patching time from weeks (28-day human baseline) to minutes",
    395       "evidence": "Table 5 compares times; the 28-day human estimate is drawn from cited external literature [110], not measured in this study",
    396       "supported": "weak"
    397     },
    398     {
    399       "claim": "Iterative feedback consistently improves patch quality across all four evaluated LLMs",
    400       "evidence": "Figure 6 shows the guided+feedback configuration scoring above the guided configuration for all four models across both percentage metrics",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "LLM4CVE demonstrates that iterative feedback loops and structured prompt engineering improve LLM-generated vulnerability patches on the CVEFixes dataset, with Llama 3 70B achieving a 20% CodeBLEU improvement over zero-shot baselines and an 8.51/10 human correctness score. LoRA fine-tuning on CVEFixes makes open-source Llama 3 70B competitive with or superior to GPT-4o in this setting. However, the primary evaluation metric (CodeBLEU) measures similarity to ground-truth patches rather than actual vulnerability elimination, end-to-end validation covers only a single CVE, and no comparison against contemporary external vulnerability repair tools is provided. The human study's scores cannot be interpreted statistically because the number of participants is never disclosed.",
    409   "red_flags": [
    410     {
    411       "flag": "No external baseline comparison",
    412       "detail": "Results compare only ablations of the authors' own pipeline; no numerical comparison against state-of-the-art tools (VulRepair, AutoSafeCoder, InferFix) is provided despite their detailed discussion in related work."
    413     },
    414     {
    415       "flag": "Single end-to-end test case",
    416       "detail": "End-to-end compilation and actual vulnerability elimination are validated on exactly one CVE (CVE-2016-4303); all other evaluation relies on CodeBLEU as a proxy metric."
    417     },
    418     {
    419       "flag": "Human study: participant count never disclosed",
    420       "detail": "The number of human evaluators is never stated anywhere in the paper, making the 8.51/10 quality score statistically uninterpretable."
    421     },
    422     {
    423       "flag": "Inconsistent evaluation subsets",
    424       "detail": "The guided+feedback configuration uses only a 50% random sample of the dataset while unguided and guided use 100%, making direct CodeBLEU comparisons potentially confounded."
    425     },
    426     {
    427       "flag": "No statistical tests or error bars",
    428       "detail": "All CodeBLEU and human quality score comparisons are made without significance testing, confidence intervals, or variance measures across runs."
    429     },
    430     {
    431       "flag": "Contamination unaddressed",
    432       "detail": "CVEFixes draws from public open-source repositories; whether these CVEs and their fixes appeared in GPT or Llama training corpora is never discussed."
    433     },
    434     {
    435       "flag": "Conclusion overclaiming",
    436       "detail": "The conclusion claims to 'pave the way towards achieving automated program repair without any intervention from trained experts,' far exceeding the evidential scope of 8 CWEs in C function-level snippets."
    437     }
    438   ],
    439   "cited_papers": [
    440     {
    441       "title": "CVEFixes: Automated Collection of Vulnerabilities and Their Fixes from Open-Source Software",
    442       "relevance": "Primary training and evaluation dataset used throughout the paper"
    443     },
    444     {
    445       "title": "VulRepair: A T5-Based Automated Software Vulnerability Repair",
    446       "relevance": "State-of-the-art T5-based vulnerability repair baseline discussed in related work"
    447     },
    448     {
    449       "title": "Neural Transfer Learning for Repairing Security Vulnerabilities in C Code (VRepair)",
    450       "relevance": "Prior neural transfer learning approach to C vulnerability repair compared in related work"
    451     },
    452     {
    453       "title": "AutoSafeCoder: A Multi-Agent Framework for Securing LLM Code Generation through Static Analysis and Fuzz Testing",
    454       "relevance": "Contemporary multi-agent LLM security approach closely related to LLM4CVE"
    455     },
    456     {
    457       "title": "Examining Zero-Shot Vulnerability Repair with Large Language Models",
    458       "relevance": "Direct predecessor work on LLM zero-shot vulnerability repair that LLM4CVE builds upon"
    459     },
    460     {
    461       "title": "Conversational Automated Program Repair",
    462       "relevance": "Related iterative LLM repair approach using test-suite feedback; directly compared in motivation"
    463     },
    464     {
    465       "title": "CodeBLEU: A Method for Automatic Evaluation of Code Synthesis",
    466       "relevance": "Primary evaluation metric used throughout the paper for measuring patch quality"
    467     },
    468     {
    469       "title": "RepairLlama: Efficient Representations and Fine-Tuned Adapters for Program Repair",
    470       "relevance": "Related LoRA fine-tuning approach for program repair; directly cited as similar methodology"
    471     }
    472   ],
    473   "engagement_factors": {
    474     "practical_relevance": {
    475       "score": 2,
    476       "justification": "Addresses a real backlog of security vulnerabilities in legacy codebases; pipeline code and fine-tuned weights released for direct practitioner use."
    477     },
    478     "surprise_contrarian": {
    479       "score": 0,
    480       "justification": "Iterative prompting and LoRA fine-tuning improving LLM performance is expected; Llama 3 competing with GPT-4o is consistent with well-established 2024 trends."
    481     },
    482     "fear_safety": {
    483       "score": 2,
    484       "justification": "Addresses security vulnerabilities in critical infrastructure including IoT, autonomous vehicles, and the Linux kernel, with clear safety implications for real-world systems."
    485     },
    486     "drama_conflict": {
    487       "score": 0,
    488       "justification": "No controversy or conflict angle; straightforward systems engineering paper."
    489     },
    490     "demo_ability": {
    491       "score": 2,
    492       "justification": "Pipeline code, fine-tuned LoRA weights, and experimental data published on a publicly accessible website; practitioners can apply it to their own CVEs."
    493     },
    494     "brand_recognition": {
    495       "score": 1,
    496       "justification": "UC Irvine and Siemens Technology are recognizable but not top-tier AI research institutions."
    497     }
    498   },
    499   "hn_data": {
    500     "threads": [],
    501     "top_points": 0,
    502     "total_points": 0,
    503     "total_comments": 0
    504   }
    505 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs