scan.json (31245B)
1 { 2 "paper": { 3 "title": "LLM-based Vulnerability Detection at Project Scale: An Empirical Study", 4 "authors": ["Fengjie Li", "Jiajun Jiang", "Dongchi Chen", "Yingfei Xiong"], 5 "year": 2026, 6 "venue": "arXiv.org", 7 "arxiv_id": "2601.19239", 8 "doi": "10.48550/arXiv.2601.19239" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "LLM-based vulnerability detectors achieve low recall (21.09% for C/C++, 33.82% for Java) on 222 known vulnerabilities but uncover more unique vulnerabilities than traditional static analyzers. Both LLM-based and traditional tools suffer from very high false discovery rates (best-performing tool averages 85.3% SFDR) when applied to 24 real-world open-source projects. Manual analysis of 385 sampled reports reveals shallow interprocedural reasoning (37.5%) and imprecise source/sink identification (19.0%) as the dominant false positive causes. LLM-based detection is computationally expensive, consuming up to hundreds of millions of tokens and requiring multi-day runtimes per project.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states: 'we release all experimental artifacts, including evaluation scripts, prompts, taxonomy labels, and detailed statistics, on our project homepage: https://github.com/Feng-Jay/LLM4Security' (Section I, final contribution bullet)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The in-house benchmark is drawn from three publicly available sources (ReposVul, CWE-Bench-Java, JLeaks), and the authors release taxonomy labels and detailed statistics on their homepage. The real-world project commits are specified in Table III." 
25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "Section IV specifies hardware (dual Intel Xeon 6388 CPUs, 512 GB RAM, four NVIDIA A800 GPUs, Ubuntu 20.04.6 LTS) but provides no software dependency specifications such as library versions, requirements.txt, or Docker configuration." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": true, 34 "justification": "The paper releases evaluation scripts on the project homepage and provides detailed experimental setup in Section IV, including how to configure each tool. Specific commit hashes for all evaluated projects are listed in Table III." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Tables IV-VI report recall percentages and SFDR as point estimates only. No confidence intervals or error bars are provided for any results, despite the LLM-based tools having stochastic outputs." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes comparative claims (e.g., 'LLM-based methods achieve an average recall of 21.09%' vs traditional tools achieving near zero) based solely on comparing raw numbers without any statistical significance tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "While no formal effect sizes (Cohen's d, etc.) are reported, the paper provides detailed absolute recall and SFDR values with per-CWE breakdowns (Tables IV-VI) and per-tool comparisons that convey the magnitude of differences. E.g., 'RepoAudit 55.00% vs KNighter 0% on CWE-401.'" 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification is provided for why 222 vulnerabilities, 24 projects, or 385 sampled reports constitute adequate sample sizes. No power analysis is discussed. 
The JLeaks sample of 50 cases is described as 'randomly sampled' but the number is not justified." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "All results appear to be from single runs. No variance, standard deviation, or spread measures are reported across experimental runs, despite LLM-based tools using non-deterministic API calls." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares 5 LLM-based vulnerability detectors against 2 widely-used traditional static analyzers (CodeQL and Semgrep) as baselines across all experiments." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "All evaluated tools are recent: RepoAudit (ICML 2025), KNighter (SOSP 2025), IRIS (ICLR 2025), LLMDFA (NeurIPS 2024), INFERROI (ICSE 2025). CodeQL and Semgrep are actively maintained industry-standard tools." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "No ablation study is performed. The paper could have controlled tool parameters (e.g., varying RepoAudit's call-graph exploration depth, or testing tools with/without validation steps) to isolate which design factors most affect performance, but does not do so." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper uses multiple metrics: Recall on the in-house dataset (Section V-A), SFDR on real-world projects (Section V-B), qualitative FP taxonomy (Section V-C), and token/time overhead measurements (Section V-D)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Two authors with security and software engineering backgrounds independently examined 385 sampled warnings, classifying each as true positive or false positive, requiring more than 150 human hours. Disagreements were resolved through discussion (Section III-D)." 
89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper explicitly constructs a real-world dataset separate from the in-house benchmark 'to complement the in-house dataset and avoid data leakage' (Section III-C). The real-world projects use the latest commits not present in the tools' original evaluations." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Tables IV-VI provide per-CWE-type breakdowns for each tool. Figure 5 shows per-FP-reason distributions. Tables V-VI break down results per project within each CWE type." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Extensive failure analysis in Section V-C with a taxonomy of FP causes (Figure 4), quantified distribution across tools (Figure 5), and detailed illustrative examples (Figures 2, 6-13, Appendix A)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper is primarily about negative findings: low recall, high false discovery rates, and computational infeasibility. All four key findings (Findings 1-4) highlight limitations of current tools." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims about low recall, high false discovery rates, shallow interprocedural reasoning as root cause, and substantial computational costs are all supported by detailed results in Sections V-A through V-D and corresponding tables." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Root cause claims (e.g., 'shallow dataflow reasoning and misidentified source/sink pairs as primary failure causes') are justified through systematic manual analysis of 385 sampled reports using inductive open-coding with double coding by two independent experts (Section V-C)." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section VI-B (External Validity) explicitly bounds findings to the 5 LLM-based and 2 traditional tools evaluated, C/C++ and Java only, and specific CWE types. The paper notes results 'may not generalize uniformly to systems written in Rust, Go, TypeScript, or large multi-language projects.'" 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section VI-B discusses internal validity threats (labeling bias from personal experience, code misinterpretation) and external validity threats (tool selection limitations, language coverage). Section V-C systematically considers multiple root causes for failures rather than attributing to a single factor." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper's measurements (recall, SFDR, token usage, time) directly measure what is claimed. Recall measures detection capability; SFDR measures false alarm burden; overhead measures computational cost. The paper does not frame these as proxies for broader claims beyond what was measured." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Table I lists LLM backbones as 'Claude 3.5 Sonnet', 'O3-mini', and 'GPT-4' without API version identifiers or snapshot dates. 'GPT-4' could refer to multiple versions with different capabilities. No API version strings are provided." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "The paper states that 'prompts' are released as part of experimental artifacts on the project homepage (Section I). Additionally, Figure 9 shows an actual prompt used by IRIS, and the tools' released replication packages include their prompts." 
148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Section IV states they 'follow the backbone models and hyperparameters (e.g. function call-chain exploration depth, temperature, and top-p) recommended in the original papers' but does not report the actual values. The reader must consult 5 separate original papers to determine settings." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section III-B describes each tool's workflow: RepoAudit (multi-agent path exploration with LLM validation), KNighter (LLM-generated CSA checkers), IRIS (LLM-inferred source/sink labels embedded in CodeQL templates), LLMDFA (agent-centric full path exploration), INFERROI (LLM intent inference with CFG exploration). Table I summarizes workflows." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section III-C documents preprocessing: ReposVul filtered to Linux kernel post-2019 for buildability, CWE-Bench-Java filtered to Maven projects, 50 cases randomly sampled from JLeaks, all manually verified. Real-world project selection criteria documented in Section III-C. FP coding procedure described in Section V-C." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section VI-B 'Threats to Validity' contains substantive discussion of internal validity (labeling subjectivity, code misinterpretation) and external validity (tool representation, language coverage, evolving models)." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section VI-B discusses specific threats: classification accuracy influenced by personal experience, selected tools 'may not fully represent all possible architectures, languages, or prompting paradigms,' and 'may not generalize uniformly to systems written in Rust, Go, TypeScript.' Mitigation measures are described (double coding, released artifacts)." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper clearly states what it does NOT cover: other programming languages (Rust, Go, TypeScript), non-open-source tools, CWE types not in the study, and future models. External validity section explicitly lists what may not generalize." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper releases 'all experimental artifacts, including evaluation scripts, prompts, taxonomy labels, and detailed statistics' on their project homepage. The in-house dataset sources (ReposVul, CWE-Bench-Java, JLeaks) are publicly available benchmarks." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section III-C details data collection: three source benchmarks with specific filtering criteria (post-2019 Linux kernel for C/C++, Maven projects for CWE-Bench-Java, random sample of 50 from JLeaks), manual verification of correctness, validity, and compilation success." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants; the annotators are the authors themselves. Data sources are published benchmarks and publicly available open-source projects with documented selection criteria." 
197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from source benchmarks through filtering criteria to final 222 vulnerabilities is documented (Section III-C). For FP analysis: tool outputs → sampling up to 10 per tool per project → independent double labeling → disagreement resolution → final labels (Section III-D, V-C)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding, grants, or acknowledgments section is present in the paper. Chinese academic papers typically have NSFC or other funding sources, making the absence notable." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: Tianjin University (College of Intelligence and Computing), International Joint Institute of Tianjin University, and Peking University (Key Laboratory of High Confidence Software Technologies). None are affiliated with the companies whose tools are evaluated." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "Since no funding is disclosed, independence cannot be assessed. The authors are academic researchers from Chinese universities evaluating third-party tools, suggesting no obvious financial conflict, but the absence of a funding statement prevents verification." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper does not state training data cutoff dates for any of the LLMs used (GPT-4, Claude 3.5 Sonnet, O3-mini). 
The in-house benchmark contains known CVEs from published databases that the LLMs may have encountered during training." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "The paper mentions constructing a real-world dataset 'to avoid data leakage' (Section III-C) but does not discuss whether the LLMs used by the tools may have been trained on the specific CVEs in the in-house benchmark (sourced from public databases like ReposVul)." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "The in-house benchmark uses CVEs from published databases (ReposVul, CWE-Bench-Java, JLeaks) that were likely publicly available before the training cutoffs of GPT-4, Claude 3.5 Sonnet, and O3-mini (the paper does not state these cutoffs, so the overlap cannot be ruled out). This contamination risk is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. The two annotators labeling false positives are the paper's own authors, not study participants." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study evaluates software tools on open-source codebases." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. The annotators are described only as 'two authors with a background in security and software engineering.'" 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants. Inclusion/exclusion criteria for projects and vulnerabilities are documented separately under data_integrity." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants or experimental conditions requiring randomization." 
268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants. The annotators necessarily knew which tool produced each warning during FP labeling." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table VII reports min/max/avg input tokens, output tokens, and runtime (minutes) per tool. Tables X-XI provide per-project breakdowns. E.g., RepoAudit consumes up to 225M input tokens and requires up to 2,450 minutes per project." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Section IV specifies hardware (dual Intel Xeon 6388, 512GB RAM, four NVIDIA A800 GPUs). Section V-D and Tables VII/X/XI quantify total token consumption and wall-clock time for all experiments." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No multiple-seed results are reported. LLM-based tools use stochastic APIs (temperature/top-p settings) but results are reported from what appears to be a single run per tool per project." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper does not state how many times each experiment was run. Given the stochastic nature of LLM outputs, this omission is significant." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. The paper uses default settings from original tools ('follow the backbone models and hyperparameters recommended in the original papers') but does not quantify this as a search budget." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Section IV states they use released replication packages with recommended hyperparameters from the original papers, providing a clear and principled rationale for configuration selection rather than cherry-picking the best result." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper makes many comparative claims across 7 tools × 8 CWE types × multiple metrics without any statistical tests, let alone corrections for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The paper does not discuss potential bias from how they configured or ran the evaluated tools. While they use released packages (mitigating re-implementation bias), they do not acknowledge or discuss this as a consideration per Lucic et al. (2018)." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Section V-D explicitly analyzes the tradeoff between computational cost and detection effectiveness. Table VII and the discussion note that tools with highest token consumption (RepoAudit, LLMDFA) do not necessarily achieve the best detection rates." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether its in-house benchmark of 222 vulnerabilities across 8 CWE types adequately represents 'real-world vulnerability detection capability.' The representativeness and coverage of the selected CWE types relative to the broader vulnerability landscape is not analyzed." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "Each tool is evaluated as a complete bundled product with its own scaffold and LLM backbone. 
The paper compares tools as integrated systems (Table I), not isolated models, so the scaffold IS the thing being tested." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The real-world dataset uses 'latest versions' to reduce leakage, but the in-house benchmark contains published CVEs from databases like ReposVul that may have been available before the LLMs' training cutoffs. No temporal analysis of when benchmark data was published relative to model training is provided." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. For example, tools are provided with source-sink pairs for the in-house dataset (Section IV), which could leak information about vulnerability locations." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of potential overlap or non-independence between the three in-house benchmark sources (ReposVul, CWE-Bench-Java, JLeaks) or between the in-house and real-world datasets." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is used. The paper mentions constructing a real-world dataset 'to avoid data leakage' but does not employ any systematic leakage detection (canary strings, membership inference, etc.)." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "LLM-based methods exhibit low average recall on known vulnerabilities: 21.09% for C/C++ and 33.82% for Java, but still uncover more unique vulnerabilities than traditional tools.", 365 "evidence": "Table IV shows per-CWE recall for all tools. Figure 3 shows unique vulnerability detections per tool, with LLM-based tools contributing substantially more unique detections (e.g., IRIS found 23 unique CWE-722 cases). 
Section V-A.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Both LLM-based and traditional tools exhibit very high false discovery rates in real-world projects, with even the best-performing tool averaging 85.3% SFDR.", 370 "evidence": "Tables V-VI report SFDR per tool per project. RepoAudit averages 97.0% SFDR on C/C++, IRIS averages 94.4% on Java. Based on manual examination of 385 sampled warnings requiring >150 human hours. Section V-B.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Shallow interprocedural reasoning (37.5%) and imprecise source/sink identification (19.0%) are the dominant root causes of false positives across all tools.", 375 "evidence": "Figure 5 quantifies FP reason distribution: A1 (shallow interprocedural) accounts for 136/363 FPs, B1 (incorrect source/sink) accounts for 69/363. Detailed examples in Figures 6-13 and Appendix A. Section V-C.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Project-scale LLM-based detection requires up to hundreds of millions of tokens and multi-day runtimes, making scalability a critical bottleneck.", 380 "evidence": "Table VII shows RepoAudit consuming up to 225M input tokens and 2,450 minutes; LLMDFA up to 38M input tokens and 4,638 minutes. Per-project details in Tables X-XI. Section V-D.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Most missed detections in traditional tools and KNighter stem from source-sink specification mismatches with project-specific APIs.", 385 "evidence": "Figure 1 shows 62/64, 64/64, and 64/64 missed C/C++ vulnerabilities for CodeQL, Semgrep, and KNighter respectively are due to source-sink mismatch. Figure 2 provides a detailed illustrative example. 
Section V-A.", 386 "supported": "strong" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "No error bars or variance on stochastic results", 392 "detail": "LLM-based tools use non-deterministic API calls (temperature, sampling), yet all results are reported from apparently single runs without any uncertainty quantification, variance, or multi-run averaging." 393 }, 394 { 395 "flag": "Small in-house benchmark without size justification", 396 "detail": "The in-house benchmark contains only 222 vulnerabilities across 8 CWE types (some CWEs have only 10-14 cases), with no power analysis or justification for why this sample size is sufficient for the comparative claims made." 397 }, 398 { 399 "flag": "Author-annotated FP labels with potential bias", 400 "detail": "False positive labeling was performed by two of the paper's own authors. While they used double coding with disagreement resolution, independent external annotators would strengthen the validity of the FP labels and taxonomy, since the reported SFDR values depend on this manual review (>150 human hours)." 401 }, 402 { 403 "flag": "Contamination risk for in-house benchmark", 404 "detail": "The in-house dataset uses known CVEs from published databases (ReposVul, CWE-Bench-Java). The LLMs powering the evaluated tools (GPT-4, Claude 3.5 Sonnet) may have been trained on information about these CVEs, potentially inflating or deflating recall in unpredictable ways." 405 }, 406 { 407 "flag": "No funding disclosure", 408 "detail": "The paper contains no funding acknowledgments or competing interests statement, which is unusual for an academic paper from major Chinese universities." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "GPTScan: Detecting Logic Vulnerabilities in Smart Contracts by Combining GPT with Program Analysis", 414 "authors": ["Y. Sun", "D. Wu", "Y. Xue"], 415 "year": 2024, 416 "relevance": "Combines LLM reasoning with program analysis for vulnerability detection, a key approach in the LLM-augmented code analysis space." 
417 }, 418 { 419 "title": "LLM-Assisted Static Analysis for Detecting Security Vulnerabilities", 420 "authors": ["Z. Li", "S. Dutta", "M. Naik"], 421 "year": 2025, 422 "relevance": "Proposes IRIS, one of the five evaluated LLM-based vulnerability detectors, demonstrating LLM-CodeQL integration for project-scale analysis." 423 }, 424 { 425 "title": "RepoAudit: An Autonomous LLM-Agent for Repository-Level Code Auditing", 426 "authors": ["J. Guo", "C. Wang", "X. Xu"], 427 "year": 2025, 428 "relevance": "Multi-agent LLM framework for repository-level vulnerability detection, one of the primary tools evaluated in this study." 429 }, 430 { 431 "title": "KNighter: Transforming Static Analysis with LLM-Synthesized Checkers", 432 "authors": ["C. Yang", "Z. Zhao", "Z. Xie"], 433 "year": 2025, 434 "relevance": "Uses LLMs to automatically generate static analysis checkers, demonstrating LLM-augmented tool synthesis for vulnerability detection." 435 }, 436 { 437 "title": "LLMDFA: Analyzing Dataflow in Code with Large Language Models", 438 "authors": ["C. Wang", "W. Zhang", "Z. Su"], 439 "year": 2024, 440 "relevance": "Agent-centric LLM approach for dataflow analysis in vulnerability detection, evaluated in this study." 441 }, 442 { 443 "title": "Exploring ChatGPT's Capabilities on Vulnerability Management", 444 "authors": ["P. Liu", "J. Liu", "L. Fu"], 445 "year": 2024, 446 "relevance": "Evaluates general-purpose LLM capabilities for vulnerability detection, providing context on LLM hallucination and reasoning limitations." 447 }, 448 { 449 "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?)", 450 "authors": ["S. Ullah", "M. Han", "S. Pujar"], 451 "year": 2024, 452 "relevance": "Comprehensive evaluation of LLM vulnerability detection capabilities showing fundamental reasoning limitations." 453 }, 454 { 455 "title": "VulEval: Towards Repository-Level Evaluation of Software Vulnerability Detection", 456 "authors": ["X.-C. Wen", "X. Wang", "Y. 
Chen"], 457 "year": 2024, 458 "relevance": "Repository-level evaluation framework for LLM-based vulnerability detection, directly relevant to project-scale assessment." 459 }, 460 { 461 "title": "Top Score on the Wrong Exam: On Benchmarking in Machine Learning for Vulnerability Detection", 462 "authors": ["N. Risse", "J. Liu", "M. Böhme"], 463 "year": 2025, 464 "relevance": "Critiques benchmarking practices in ML vulnerability detection, addressing construct validity concerns relevant to this study's evaluation methodology." 465 }, 466 { 467 "title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation", 468 "authors": ["Q. Wu", "G. Bansal", "J. Zhang"], 469 "year": 2024, 470 "relevance": "Multi-agent LLM framework foundational to agentic vulnerability detection workflows like RepoAudit." 471 }, 472 { 473 "title": "An Empirical Study of Static Analysis Tools for Secure Code Review", 474 "authors": ["W. Charoenwet", "P. Thongtanunam", "V.-T. Pham"], 475 "year": 2024, 476 "relevance": "Prior empirical study of static analysis tools for security, providing methodological precedent for this study's approach." 477 }, 478 { 479 "title": "GPT-4 Technical Report", 480 "authors": ["OpenAI"], 481 "year": 2023, 482 "arxiv_id": "2303.08774", 483 "relevance": "Technical report for GPT-4, the backbone LLM used by three of the five evaluated vulnerability detectors." 484 } 485 ] 486 }