scan-v5.json (18169B)
1 { 2 "scan_version": 5, 3 "paper_type": "survey", 4 "paper": { 5 "title": "LLMs in Software Security: A Survey of Vulnerability Detection Techniques and Insights", 6 "authors": [ 7 "Ze Sheng", 8 "Zhicheng Chen", 9 "Shuning Gu", 10 "Heqing Huang", 11 "Guofei Gu", 12 "Jeff Huang" 13 ], 14 "year": 2025, 15 "venue": "ACM Computing Surveys", 16 "arxiv_id": "2502.07049", 17 "doi": "10.1145/3769082" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "Abstract claims about LLM capabilities, traditional method limitations, and the survey's contributions are substantiated across all sections through literature synthesis and tabulated evidence.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": false, 30 "justification": "The paper makes causal-sounding claims like 'fine-tuning enhances detection' and 'CoT improves accuracy,' but these are inherited from primary studies without independent assessment of whether those studies' designs were adequate for causal inference.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": true, 36 "justification": "The survey explicitly bounds its scope to LLM-based vulnerability detection (2019–2024) in C/C++, Java, and Solidity at function/file level, and acknowledges gaps in repository-level and multi-language coverage.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "Trends such as decoder-only model dominance and C/C++ research concentration are presented as conclusions without considering confounding factors or alternative explanations (e.g., dataset availability driving language focus).", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper notes data leakage inflates benchmark metrics but does not systematically distinguish between controlled-dataset F1-scores and real-world vulnerability detection effectiveness across reviewed studies.", 49 "source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section 4 is a dedicated 'LIMITATIONS' section, though it is brief—one paragraph covering only two issues.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": false, 62 "justification": "The limitations mention ~60% preprint coverage and terminology variation, but do not quantify coverage gaps, address selection bias, discuss publication bias, or explain how these threats affect the survey's conclusions.", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "Section 2.1 explicitly excludes traditional ML approaches (CNN/RNN), malware analysis, and network intrusion detection, and restricts inclusion to LLM-based vulnerability detection in specific programming languages.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No acknowledgments section or funding disclosure is present anywhere in the paper.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "All six authors list institutional affiliations (Texas A&M University, City University of Hong Kong) in the author block.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": false, 87 "answer": false, 88 "justification": "No funding is disclosed, so independence of any funder cannot be assessed.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears in the paper.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 2.3 defines LLMs; Section 2.4 formally defines vulnerability detection as a binary classification problem with mathematical notation, and separately defines vulnerability classification and severity prediction.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "The abstract explicitly lists three contributions: systematic analysis of LLM applications, a unified framework examining patterns across studies, and identification of key challenges and research directions.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section 2.2 compares this survey against three closely related prior surveys (Yao et al., Xu et al., Zhou et al.) on specific dimensions including model recency, benchmark coverage, and detection-specific depth.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "survey": { 121 "search_and_selection": { 122 "search_strategy_reproducible": { 123 "applies": true, 124 "answer": false, 125 "justification": "The search is described narratively (top-tier conferences, keyword extraction, iterative searches every 3 weeks) but lacks specific query strings, exact date ranges, and database-level documentation needed for independent reproduction.", 126 "source": "haiku" 127 }, 128 "inclusion_exclusion_explicit": { 129 "applies": true, 130 "answer": true, 131 "justification": "Section 2.1 explicitly states exclusion of traditional ML methods (CNN/RNN), papers unrelated to vulnerability detection, and restricts inclusion to LLM-based studies covering specific programming languages.", 132 "source": "haiku" 133 }, 134 "prisma_or_structured_protocol": { 135 "applies": true, 136 "answer": false, 137 "justification": "No PRISMA flow diagram or mention of any structured systematic review protocol is present; the search process is described informally without stage-by-stage counts.", 138 "source": "haiku" 139 }, 140 "search_terms_provided": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 2.1 explicitly lists search terms: 'vulnerability detection,' 'LLM,' 'large language model,' and 'AI.'", 144 "source": "haiku" 145 }, 146 "databases_listed": { 147 "applies": true, 148 "answer": false, 149 "justification": "Only specific venues/conferences are listed (S&P, USENIX Security, CCS, IEEE TSE) but not the actual electronic databases searched (e.g., ACM Digital Library, IEEE Xplore, arXiv, Semantic Scholar).", 150 "source": "haiku" 151 }, 152 "screening_process_documented": { 153 "applies": true, 154 "answer": false, 155 "justification": "Only a single aggregate count is provided (~500–600 papers screened to 58 selected) with no stage-by-stage flow documenting how papers were excluded at title, abstract, or full-text screening stages.", 156 "source": "haiku" 157 }, 158 "review_scope_justified": { 159 "applies": true, 160 "answer": true, 161 "justification": "The scope is justified by the emergence of LLMs as a distinct paradigm (post-2019) and the absence of prior surveys specifically addressing LLM-based vulnerability detection methodology in depth.", 162 "source": "haiku" 163 } 164 }, 165 "synthesis_quality": { 166 "conflicting_findings_acknowledged": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper notes conflicting results such as CoT improving precision but having variable recall impact, and dramatically different F1-scores for the same model across datasets (e.g., CodeBERT: 0.099 on PrimeVul vs. 0.66 on Choi2017).", 170 "source": "haiku" 171 }, 172 "quality_assessment_of_sources": { 173 "applies": true, 174 "answer": false, 175 "justification": "No structured quality assessment, risk-of-bias evaluation, or quality rubric is applied to reviewed papers; preprints and peer-reviewed studies are synthesized without differentiation by methodological quality.", 176 "source": "haiku" 177 }, 178 "publication_bias_discussed": { 179 "applies": true, 180 "answer": false, 181 "justification": "Publication bias is never mentioned; the survey does not acknowledge that published papers skew toward positive results or discuss the impact of unpublished negative findings.", 182 "source": "haiku" 183 }, 184 "quantitative_synthesis_present": { 185 "applies": true, 186 "answer": false, 187 "justification": "The paper presents descriptive statistics (usage counts, percentage breakdowns, tabulated F1-scores) but performs no meta-analysis, vote counting with confidence intervals, or effect size aggregation across studies.", 188 "source": "haiku" 189 }, 190 "recommendations_supported_by_evidence": { 191 "applies": true, 192 "answer": true, 193 "justification": "Research direction recommendations (repository-level datasets, cross-file detection, robustness improvements) are directly tied to documented gaps and challenges identified across the reviewed literature.", 194 "source": "haiku" 195 } 196 } 197 } 198 }, 199 "claims": [ 200 { 201 "claim": "GPT-4 is the most commonly used LLM in vulnerability detection research, appearing in 29 of 58 reviewed studies.", 202 "evidence": "Table 2 and RQ1 analysis enumerating usage frequency of 33 distinct LLMs across 58 studies.", 203 "supported": "strong" 204 }, 205 { 206 "claim": "Decoder-only models account for 67.1% of fine-tuning experiments, marking a shift from encoder-only architectures.", 207 "evidence": "Architecture categorization across 58 reviewed studies: 24.2% encoder-only, 8.7% encoder-decoder, 67.1% decoder-only in fine-tuning.", 208 "supported": "strong" 209 }, 210 { 211 "claim": "C/C++ dominates vulnerability detection research at 50% of studies, followed by Java at 21.1%.", 212 "evidence": "Figure 5 analysis of target programming languages across 56 selected papers.", 213 "supported": "strong" 214 }, 215 { 216 "claim": "41.3% of studies employ code processing techniques (AST, RAG, program slicing) to address LLMs' limited context windows.", 217 "evidence": "Finding III based on categorizing preprocessing technique usage across 58 reviewed studies.", 218 "supported": "moderate" 219 }, 220 { 221 "claim": "Fine-tuning large models (>10B parameters) with PEFT achieves F1-scores near 0.9 for vulnerability detection.", 222 "evidence": "Table 5 shows Alam et al. (0.99), Guo et al. (0.97), Luo et al. (0.90), Ma et al. (0.91) across various datasets.", 223 "supported": "weak" 224 }, 225 { 226 "claim": "Data leakage is pervasive in vulnerability detection datasets because LLMs train on GitHub sources that overlap with test sets.", 227 "evidence": "Challenge 4 section citing Wu et al. [121] and multiple papers documenting mislabeling and leakage from LLM training corpora.", 228 "supported": "moderate" 229 } 230 ], 231 "methodology_tags": [ 232 "survey", 233 "qualitative" 234 ], 235 "key_findings": "LLM-based vulnerability detection has rapidly shifted toward large decoder-only architectures (67.1% of fine-tuning studies), with GPT-4 as the dominant model and C/C++ as the primary target language (50% of studies). Fine-tuning with PEFT methods achieves near 0.9 F1-scores on controlled benchmarks, but these results are undermined by pervasive data leakage and label quality problems in existing datasets. Four major challenges are identified: narrow research scope (83% of studies analyze isolated functions rather than real-world codebases), semantic complexity of cross-file vulnerabilities, intrinsic LLM limitations (inconsistent explanations, low robustness to perturbations), and lack of high-quality repository-level datasets. Research directions proposed include repository-level analysis, vulnerability reproduction pipelines, and vulnerability-specific fine-tuning.", 236 "red_flags": [ 237 { 238 "flag": "No PRISMA protocol", 239 "detail": "The survey uses informal keyword searches without a structured systematic review protocol, PRISMA flow diagram, or stage-by-stage screening counts, undermining reproducibility of the selection process." 240 }, 241 { 242 "flag": "No quality assessment of sources", 243 "detail": "Reviewed papers are synthesized without any quality rating or risk-of-bias assessment, meaning low-quality preprints (acknowledged as ~60% of the corpus) are treated on equal footing with peer-reviewed studies." 244 }, 245 { 246 "flag": "Publication bias unaddressed", 247 "detail": "The survey never acknowledges that published papers skew positive, which is especially problematic when aggregating F1-scores that may reflect best-case dataset conditions." 248 }, 249 { 250 "flag": "No funding disclosure", 251 "detail": "No acknowledgments or funding section is present, making it impossible to assess potential conflicts of interest." 252 }, 253 { 254 "flag": "Databases not listed", 255 "detail": "The search describes venue-level sources (conferences, one journal) but not the actual electronic databases searched, leaving the search strategy incompletely reproducible." 256 }, 257 { 258 "flag": "Thin limitations section", 259 "detail": "Section 4 is a single short paragraph mentioning only preprint prevalence and terminology variation, omitting fundamental threats such as selection bias, publication bias, and the impact of evaluating primarily benchmark rather than real-world performance." 260 } 261 ], 262 "cited_papers": [ 263 { 264 "title": "Vulnerability Detection with Code Language Models: How Far Are We?", 265 "relevance": "Key empirical paper showing low F1-scores (0.21) even with fine-tuning on PrimeVul; central evidence for the gap between benchmark and real-world performance." 266 }, 267 { 268 "title": "DiverseVul: A New Vulnerable Source Code Dataset for Deep Learning Based Vulnerability Detection", 269 "relevance": "Major dataset providing diverse C/C++ vulnerability samples; cited extensively in the dataset and fine-tuning sections." 270 }, 271 { 272 "title": "CVEfixes: automated collection of vulnerabilities and their fixes from open-source software", 273 "relevance": "Widely used commit-level vulnerability dataset referenced throughout the benchmark and fine-tuning discussion." 274 }, 275 { 276 "title": "LLM4Vuln: A unified evaluation framework for decoupling and enhancing LLMs' vulnerability reasoning", 277 "relevance": "Framework for evaluating LLMs in vulnerability detection; cited for RAG-based knowledge base and CoT analysis." 278 }, 279 { 280 "title": "How far have we gone in vulnerability detection using large language models", 281 "relevance": "Directly relevant benchmarking study on LLM vulnerability detection capabilities; core reference for RQ3 findings." 282 }, 283 { 284 "title": "Large Language Model for Vulnerability Detection and Repair: Literature Review and Roadmap", 285 "relevance": "Closely related prior survey explicitly differentiated from in Section 2.2 on three specific dimensions." 286 }, 287 { 288 "title": "LLM-Assisted Static Analysis for Detecting Security Vulnerabilities", 289 "relevance": "Demonstrates LLM integration with static analysis for repository-level detection; source of CWE-Bench-Java dataset." 290 }, 291 { 292 "title": "How Effective Are Neural Networks for Fixing Security Vulnerabilities", 293 "relevance": "Cited for evidence of data leakage problems in vulnerability datasets, supporting Challenge 4." 294 } 295 ], 296 "engagement_factors": { 297 "practical_relevance": { 298 "score": 3, 299 "justification": "Directly actionable for security practitioners choosing LLMs and techniques for vulnerability detection pipelines, with comprehensive dataset and benchmark tables." 300 }, 301 "surprise_contrarian": { 302 "score": 1, 303 "justification": "Findings largely confirm expected trends; the data leakage problem is notable but not surprising to the security ML community." 304 }, 305 "fear_safety": { 306 "score": 2, 307 "justification": "Highlights that 83% of research operates on isolated code snippets far from real-world scenarios, and that benchmark F1-scores are inflated by data leakage—raising concern about false security from deployed LLM tools." 308 }, 309 "drama_conflict": { 310 "score": 1, 311 "justification": "No significant controversy; the survey takes a neutral synthesis stance without challenging prominent community positions." 312 }, 313 "demo_ability": { 314 "score": 2, 315 "justification": "Links to a maintained GitHub repository of findings, and many reviewed tools (GPT-4, CodeBERT, CodeLlama) are publicly accessible for practitioners to try." 316 }, 317 "brand_recognition": { 318 "score": 2, 319 "justification": "Published in ACM Computing Surveys (high-prestige venue) by Texas A&M authors; features prominent models including GPT-4, Claude 3.5, and DARPA's AIxCC competition." 320 } 321 }, 322 "hn_data": { 323 "threads": [ 324 { 325 "hn_id": "43042753", 326 "title": "LM2: Large Memory Models", 327 "points": 110, 328 "comments": 30, 329 "url": "https://news.ycombinator.com/item?id=43042753", 330 "created_at": "2025-02-13T23:21:21Z" 331 } 332 ], 333 "top_points": 110, 334 "total_points": 110, 335 "total_comments": 30 336 } 337 }