scan.json (32517B)
1 { 2 "paper": { 3 "title": "Specification-Guided Vulnerability Detection with Large Language Models", 4 "authors": [ 5 "Hao Zhu", 6 "Jia Li", 7 "Cuiyun Gao", 8 "Jiaru Qian", 9 "Yihong Dong", 10 "Huanyu Liu", 11 "Lecheng Wang", 12 "Ziliang Wang", 13 "Xiaolong Hu", 14 "Ge Li" 15 ], 16 "year": 2025, 17 "venue": "arXiv.org", 18 "arxiv_id": "2511.04014", 19 "doi": "10.48550/arXiv.2511.04014" 20 }, 21 "scan_version": 2, 22 "active_modules": ["experimental_rigor", "data_leakage"], 23 "methodology_tags": ["benchmark-eval", "case-study"], 24 "key_findings": "VulInstruct achieves 45.0% F1-score and 37.7% recall on PrimeVul under the strict CORRECT evaluation framework, representing 32.7% and 50.8% relative improvements over the strongest baselines respectively. The approach uniquely detects 24.3% of all identified vulnerabilities (2.4x more than any baseline) through specification-guided reasoning that extracts reusable security specifications from historical vulnerabilities. Ablation shows both general and domain-specific specifications contribute complementarily, with an inverted-U relationship between knowledge quality and coverage. A real-world case study discovered CVE-2025-56538 in the Cyrus IMAP server by recognizing specification violations.", 25 "checklist": { 26 "artifacts": { 27 "code_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The abstract states: 'All code and supplementary materials are available at https://github.com/zhuhaopku/VulInstruct-temp.' A working URL is provided." 31 }, 32 "data_released": { 33 "applies": true, 34 "answer": true, 35 "justification": "The evaluation uses publicly available datasets: PrimeVul (Section 5.1) and CORRECT. The domain evidence base is constructed from publicly available NVD data (Section 4.1.2). Supplementary materials are available at the GitHub URL." 
36 }, 37 "environment_specified": { 38 "applies": true, 39 "answer": false, 40 "justification": "Section 5.4 lists models used (Qwen3-Embedding-0.6B, DeepSeek-V3, GPT-5) and retrieval parameters (top-k=10, threshold ≥6), but provides no requirements.txt, Dockerfile, or detailed environment specifications with library versions." 41 }, 42 "reproduction_instructions": { 43 "applies": true, 44 "answer": false, 45 "justification": "No step-by-step reproduction instructions are described in the paper. The GitHub link is provided but no README contents, commands to run, or 'Reproducing Results' section is included in the paper itself." 46 } 47 }, 48 "statistical_methodology": { 49 "confidence_intervals_or_error_bars": { 50 "applies": true, 51 "answer": false, 52 "justification": "Tables 1, 2, and 3 report only point estimates (e.g., '45.0% F1', '37.7% recall') with no confidence intervals, error bars, or ± notation anywhere in the paper." 53 }, 54 "significance_tests": { 55 "applies": true, 56 "answer": false, 57 "justification": "The paper claims improvements like '32.7% relative improvement over the strongest baseline' based solely on comparing point estimates in Table 1. No statistical tests (p-values, t-tests, bootstrap, etc.) are used anywhere." 58 }, 59 "effect_sizes_reported": { 60 "applies": true, 61 "answer": true, 62 "justification": "Relative improvements are consistently reported with baseline context: '45.0% F1-score (32.7% improvement)', '37.7% recall (50.8% improvement)' in Section 6.1. Table 1 shows absolute values for all methods, providing context for magnitude of differences." 63 }, 64 "sample_size_justified": { 65 "applies": true, 66 "answer": false, 67 "justification": "The PrimeVul test set is used as-is with no justification for its size. For the threshold analysis, the authors 'randomly sample 100 out of 435 vulnerability–patch pairs' (Section 6.2) without justifying why 100 is sufficient." 
68 }, 69 "variance_reported": { 70 "applies": true, 71 "answer": false, 72 "justification": "No standard deviations, variance, or spread measures are reported for any experimental results. All tables (1, 2, 3) show single-point results with no indication of run-to-run variation." 73 } 74 }, 75 "evaluation_design": { 76 "baselines_included": { 77 "applies": true, 78 "answer": true, 79 "justification": "Table 1 compares against 5 baselines across 4 paradigms: prompting (CoT), fine-tuning (ReVD, VulTrial), agent-based (GPTLens, VulTrial without fine-tuning), and retrieval-augmented (Vul-RAG)." 80 }, 81 "baselines_contemporary": { 82 "applies": true, 83 "answer": true, 84 "justification": "The baselines span 2023-2025: ReVD (Wen et al. 2025), VulTrial (Widyasari et al. 2025), Vul-RAG (Du et al. 2024), and GPTLens (Hu et al. 2023, still recent for this domain). These represent the current state of the art." 85 }, 86 "ablation_study": { 87 "applies": true, 88 "answer": true, 89 "justification": "Table 2 presents comprehensive ablation: removing domain-specific specifications (F1 drops to 40.4%), removing general specifications (F1 drops to 41.0%), and knowledge utilization analysis showing four mutually exclusive categories. Figure 4 shows threshold ablation." 90 }, 91 "multiple_metrics": { 92 "applies": true, 93 "answer": true, 94 "justification": "The paper uses 7 metrics: Accuracy, Precision, Recall, F1-Score, P-C (pairwise accuracy), VP-S (pairwise discrimination), and MATCH rate (Section 5.3, Figures 5a-5b)." 95 }, 96 "human_evaluation": { 97 "applies": true, 98 "answer": false, 99 "justification": "The formal evaluation uses LLM-as-Judge (GPT-5) for reasoning correctness under the CORRECT framework (Section 5.4). The manual case analysis in Appendix D (20 samples) is supplementary qualitative analysis, not a formal human evaluation with structured ratings or inter-rater reliability."
100 }, 101 "held_out_test_set": { 102 "applies": true, 103 "answer": true, 104 "justification": "PrimeVul uses temporal data splitting: 'training data comes from vulnerabilities before a cutoff date, while test data comes from after' (Section 5.1). The knowledge base is also temporally filtered to predate the test set cutoff." 105 }, 106 "per_category_breakdown": { 107 "applies": true, 108 "answer": true, 109 "justification": "Figure 5a provides a radar chart comparing MATCH rates across 8 CWE categories. Figure 5b shows head vs. tail CWE performance. Table 2 breaks down performance by knowledge source type." 110 }, 111 "failure_cases_discussed": { 112 "applies": true, 113 "answer": true, 114 "justification": "Appendix C provides detailed failure analysis of Vul-RAG (premature secure classification, Table 5a) and ReVD (categorized failure types including zero reasoning, wrong type, over-generalization, Table 5b). Section 6.2 discusses the 0.7% of cases with no matched knowledge yielding 0% F1." 115 }, 116 "negative_results_reported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The threshold analysis (Figure 4) shows that both too-lenient (3,3,3) and too-strict (9,9,9) thresholds hurt performance significantly. VulInstruct's precision (55.8%) is lower than fine-tuned VulTrial's (59.9%) in Table 1. Ablation shows domain-specific-only knowledge yields 0% F1." 120 } 121 }, 122 "claims_and_evidence": { 123 "abstract_claims_supported": { 124 "applies": true, 125 "answer": true, 126 "justification": "Abstract claims of 45.0% F1 (32.7% improvement), 37.7% recall (50.8% improvement), 24.3% unique detections (2.4x), and 32.3% pairwise improvement all match Table 1. The CVE-2025-56538 discovery is detailed in Appendix E." 
127 }, 128 "causal_claims_justified": { 129 "applies": true, 130 "answer": true, 131 "justification": "The primary causal claims ('specifications improve detection') are supported by controlled ablation studies in Table 2: removing domain-specific specifications lowers F1 by 4.6 percentage points (45.0% to 40.4%), and removing general specifications lowers it by 4.0 percentage points (45.0% to 41.0%). These are single-variable manipulations adequate for the causal claims." 132 }, 133 "generalization_bounded": { 134 "applies": true, 135 "answer": false, 136 "justification": "The title 'Specification-Guided Vulnerability Detection with Large Language Models' and claims like 'establishes new state-of-the-art' are broad, but results are exclusively on PrimeVul (C/C++ function-level detection). The paper does not explicitly bound claims to C/C++ or function-level analysis." 137 }, 138 "alternative_explanations_discussed": { 139 "applies": true, 140 "answer": false, 141 "justification": "Section 8 discusses implementation validity, experimental fairness, temporal validity, and LLM-as-Judge reliability, but does not consider alternative explanations for VulInstruct's improvements (e.g., whether providing any additional context rather than specifically specifications would help equally, or whether the retrieval mechanism matters more than specification quality)." 142 }, 143 "proxy_outcome_distinction": { 144 "applies": true, 145 "answer": true, 146 "justification": "The paper explicitly argues that binary classification is an inadequate proxy for vulnerability understanding (Section 2.1, 5.3) and adopts CORRECT evaluation requiring correct reasoning, not just correct labels. Claims are made at the granularity of the metrics (F1 under CORRECT, MATCH rate)." 147 } 148 }, 149 "setup_transparency": { 150 "model_versions_specified": { 151 "applies": true, 152 "answer": false, 153 "justification": "Section 5.4 and Tables 1/3 list models as 'DeepSeek-V3', 'GPT-5', 'GPT-OSS-120B', 'Claude-Sonnet-4', 'DeepSeek-R1', 'Qwen3-Embedding-0.6B'. 
None include snapshot dates or API versions. Per the schema, marketing names without snapshot dates do not count." 154 }, 155 "prompts_provided": { 156 "applies": true, 157 "answer": true, 158 "justification": "Appendix B (Section B) provides the full text of all key prompts: general specification extraction, detailed vulnerability case generation, knowledge scoring, domain-specific specification extraction, and vulnerability detection prompts, including format specifications and one-shot examples." 159 }, 160 "hyperparameters_reported": { 161 "applies": true, 162 "answer": false, 163 "justification": "Section 5.4 reports retrieval parameters (top-k=10, threshold ≥6) and embedding model, but does not report LLM generation parameters (temperature, top-p, max tokens) for any of the models used in the pipeline." 164 }, 165 "scaffolding_described": { 166 "applies": true, 167 "answer": true, 168 "justification": "The multi-stage pipeline is described in detail in Section 4 with Figure 3: offline knowledge construction, code context extraction (Eq. 2), dual-path retrieval, knowledge scoring, domain-specific specification generation, and structured reasoning. Appendix F describes the automatic context extraction tool." 169 }, 170 "data_preprocessing_documented": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 4.1.2 documents NVD crawling (173,070 valid cases, 15,391 rejected, 2002-2024). Section 5.1 documents temporal filtering (1,338 pairs from CORRECT predating PrimeVul test cutoff). Section 4.1.1 describes the context extraction pipeline." 174 } 175 }, 176 "limitations_and_scope": { 177 "limitations_section_present": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 8 'Threats to Validity' provides substantive discussion across four areas: implementation validity, experimental fairness, temporal validity, and LLM-as-Judge reliability. Section 7 'Discussion' also discusses directions for improvement." 
181 }, 182 "threats_to_validity_specific": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 8 discusses study-specific threats: baseline failure analysis (Vul-RAG premature classification, ReVD reasoning failures), the choice to exclude CWE information from all methods for fairness, exclusion of SVEN dataset due to lacking CVE metadata, and the specific bias risk of using GPT-5 as judge." 186 }, 187 "scope_boundaries_stated": { 188 "applies": true, 189 "answer": false, 190 "justification": "The paper does not explicitly state what results do NOT show. It doesn't bound findings to C/C++ function-level detection, doesn't discuss limitations to the PrimeVul vulnerability types, and doesn't acknowledge that real-world deployment differs from benchmark evaluation. Section 7 discusses future work but not scope boundaries." 191 } 192 }, 193 "data_integrity": { 194 "raw_data_available": { 195 "applies": true, 196 "answer": false, 197 "justification": "While the GitHub URL is provided and input datasets are public (PrimeVul, NVD), the paper does not explicitly state that raw experimental outputs (per-example predictions, LLM responses, scoring details) are available for independent verification." 198 }, 199 "data_collection_described": { 200 "applies": true, 201 "answer": true, 202 "justification": "Section 4.1.2 describes NVD data collection (crawling, normalization, temporal range 2002-2024, fields extracted). Section 5.1 describes CORRECT and PrimeVul dataset roles. Section 4.1.1 describes context extraction with Eq. 2." 203 }, 204 "recruitment_methods_described": { 205 "applies": false, 206 "answer": false, 207 "justification": "No human participants. Data sources are standard benchmarks (PrimeVul, CORRECT) and public databases (NVD)." 
208 }, 209 "data_pipeline_documented": { 210 "applies": true, 211 "answer": true, 212 "justification": "The full pipeline is documented: NVD crawling → normalization → temporal filtering (Section 4.1.2), CORRECT dataset → context extraction → specification extraction (Section 4.1.1), PrimeVul test set with temporal cutoff (Section 5.1). Figure 3 provides a visual overview." 213 } 214 }, 215 "conflicts_of_interest": { 216 "funding_disclosed": { 217 "applies": true, 218 "answer": false, 219 "justification": "No funding sources, grants, or acknowledgments section is present anywhere in the paper." 220 }, 221 "affiliations_disclosed": { 222 "applies": true, 223 "answer": true, 224 "justification": "All author affiliations are listed on the first page: Peking University, Tsinghua University, Harbin Institute of Technology, and New H3C Technologies Co., Ltd." 225 }, 226 "funder_independent_of_outcome": { 227 "applies": true, 228 "answer": false, 229 "justification": "No funding is disclosed, so independence cannot be assessed. One author (Xiaolong Hu) is affiliated with New H3C Technologies Co., Ltd, a commercial entity, but no funding relationship is stated." 230 }, 231 "financial_interests_declared": { 232 "applies": true, 233 "answer": false, 234 "justification": "No competing interests or financial interests statement is present in the paper." 235 } 236 }, 237 "contamination": { 238 "training_cutoff_stated": { 239 "applies": true, 240 "answer": false, 241 "justification": "The paper does not state the training data cutoff dates for any of the LLMs used (DeepSeek-V3, GPT-5, DeepSeek-R1, Claude-Sonnet-4). PrimeVul's temporal split is discussed but this concerns the knowledge base, not the LLM pre-training data." 242 }, 243 "train_test_overlap_discussed": { 244 "applies": true, 245 "answer": false, 246 "justification": "The paper discusses temporal filtering for the knowledge base (Section 5.1, Section 8) but does not discuss whether the LLMs (DeepSeek-V3, GPT-5, etc.) 
may have been pre-trained on PrimeVul test data or the CORRECT vulnerability descriptions used in evaluation." 247 }, 248 "benchmark_contamination_addressed": { 249 "applies": true, 250 "answer": false, 251 "justification": "PrimeVul was published in 2024. The LLMs used (DeepSeek-V3, GPT-5) could have been trained on PrimeVul data or the underlying CVE descriptions. The paper relies on PrimeVul's temporal split but does not address LLM pre-training contamination with the benchmark itself." 252 } 253 }, 254 "human_studies": { 255 "pre_registered": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "irb_or_ethics_approval": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "demographics_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "inclusion_exclusion_criteria": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "randomization_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "blinding_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 }, 285 "attrition_reported": { 286 "applies": false, 287 "answer": false, 288 "justification": "No human participants in this study." 289 } 290 }, 291 "cost_and_practicality": { 292 "inference_cost_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "VulInstruct involves multiple LLM calls per example (query generation, knowledge scoring for up to 20 items, domain-specific specification generation, and final detection), but no inference cost, latency, or tokens consumed are reported." 
296 }, 297 "compute_budget_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "No total computational budget is stated. No API costs, GPU hours, or total processing time reported for the 435 test pairs or knowledge base construction." 301 } 302 }, 303 "experimental_rigor": { 304 "seed_sensitivity_reported": { 305 "applies": true, 306 "answer": false, 307 "justification": "No mention of multiple random seeds. All results in Tables 1, 2, and 3 appear to be from single experimental runs with no seed sensitivity analysis." 308 }, 309 "number_of_runs_stated": { 310 "applies": true, 311 "answer": false, 312 "justification": "The paper never explicitly states how many experimental runs produced the reported results. No 'averaged over K runs' or equivalent statement." 313 }, 314 "hyperparameter_search_budget": { 315 "applies": true, 316 "answer": false, 317 "justification": "Figure 4 shows a threshold analysis with 4 settings, but the total search budget for other hyperparameters (top-k, embedding model choice, scoring rubric design) is not reported. The threshold analysis was conducted on 100 test samples, not a validation set." 318 }, 319 "best_config_selection_justified": { 320 "applies": true, 321 "answer": false, 322 "justification": "The threshold analysis (Figure 4) was performed on 100 randomly sampled test pairs. Tuning hyperparameters on test data compromises the evaluation. Additionally, Figure 4 shows (5,5,5) as optimal but Section 5.4 uses (6,6,6) without explanation for the discrepancy." 323 }, 324 "multiple_comparison_correction": { 325 "applies": false, 326 "answer": false, 327 "justification": "No statistical tests are performed anywhere in the paper, so multiple comparison correction is not applicable." 
328 }, 329 "self_comparison_bias_addressed": { 330 "applies": true, 331 "answer": false, 332 "justification": "The authors implement and evaluate their own system against baselines without acknowledging potential bias from evaluating their own system. While they analyze baseline failures (Appendix C), they don't acknowledge that their implementations or configurations of baselines might underperform." 333 }, 334 "compute_budget_vs_performance": { 335 "applies": true, 336 "answer": false, 337 "justification": "VulInstruct requires multiple LLM calls per example (retrieval queries, scoring each of ~20 retrieved items, specification generation, detection), while CoT baselines use a single call. This significant compute difference is never discussed or compared against performance gains." 338 }, 339 "benchmark_construct_validity": { 340 "applies": true, 341 "answer": true, 342 "justification": "The paper extensively discusses benchmark validity: Section 2.1 argues that binary classification is insufficient, Section 5.3 explains why CORRECT evaluation is needed to assess genuine vulnerability understanding, and Section 8 discusses why SVEN was excluded. The adoption of reasoning-aware evaluation directly addresses construct validity." 343 }, 344 "scaffold_confound_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "Table 1 compares VulInstruct (DeepSeek-V3) against ReVD (fine-tuned Qwen2.5-Coder), VulTrial (fine-tuned GPT-4o), and GPTLens (DeepSeek-V3), conflating model and method effects. While Table 3 partially addresses this by testing VulInstruct across multiple models, the main comparison does not control for the scaffolding confound." 348 } 349 }, 350 "data_leakage": { 351 "temporal_leakage_addressed": { 352 "applies": true, 353 "answer": true, 354 "justification": "Section 5.1 enforces strict temporal filtering: 'We use PrimeVul's test set cutoff date as our boundary. 
Only vulnerabilities published before this date are included in our knowledge base.' Section 8 further confirms CVE retrieval uses temporal and ID-ordering constraints." 355 }, 356 "feature_leakage_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "While temporal filtering prevents using future data, the paper does not discuss whether retrieved specifications or CVE cases could encode information about the specific test vulnerability (e.g., if a specification was extracted from a closely related vulnerability in the same project)." 360 }, 361 "non_independence_addressed": { 362 "applies": true, 363 "answer": false, 364 "justification": "The paper does not discuss whether the knowledge base (from CORRECT) and test set (from PrimeVul) share vulnerabilities from the same projects, codebases, or authors, which could create non-independence between training and test conditions." 365 }, 366 "leakage_detection_method": { 367 "applies": true, 368 "answer": true, 369 "justification": "Section 5.1 applies temporal filtering as a concrete prevention method: knowledge base entries must predate PrimeVul's test cutoff. Section 4.2 additionally filters domain-specific CVEs by publication date and CVE ID ordering relative to the target." 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "VulInstruct achieves 45.0% F1-score, a 32.7% relative improvement over the strongest baseline (GPTLens at 33.9%) on PrimeVul.", 376 "evidence": "Table 1 shows F1 scores for all methods. GPTLens achieves 33.9%, VulInstruct 45.0%. Section 6.1 discusses the results.", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "VulInstruct achieves 37.7% recall, a 50.8% improvement over GPTLens (25.0%), detecting substantially more vulnerabilities.", 381 "evidence": "Table 1 shows recall values. VulInstruct's recall is highest at 37.7% vs GPTLens at 25.0%. 
Section 6.1.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "VulInstruct uniquely detects 24.3% of all identified vulnerabilities, 2.4x more than any baseline.", 386 "evidence": "Table 1 'Unique' column shows VulInstruct at 24.3% vs next best VulTrial at 10.2%. Section 6.1 defines Unique as percentage of vulnerabilities exclusively detected by one method among all 226 detected CVEs.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Both general and domain-specific specifications are complementary and contribute to detection performance.", 391 "evidence": "Table 2 ablation: removing domain-specific specs drops F1 from 45.0% to 40.4% (-4.6%); removing general specs drops to 41.0% (-4.0%). Utilization analysis shows 73.5% of samples use both sources, achieving 48.3% F1.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "VulInstruct generalizes effectively across CWE types and different base LLMs.", 396 "evidence": "Figure 5a shows consistently higher MATCH rates across 8 CWE categories. Figure 5b shows improvements on both head (38.4%) and tail (28.8%) CWEs. Table 3 shows improvements on GPT-OSS-120B, Claude-Sonnet-4, and DeepSeek-R1.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "An optimal threshold exists for balancing knowledge quality and coverage (inverted-U relationship).", 401 "evidence": "Figure 4 shows F1 peaks at (5,5,5) threshold (43.4%) and drops at both lenient (3,3,3: 40.2%) and strict (9,9,9: 31.1%) settings. Section 6.2.", 402 "supported": "moderate" 403 }, 404 { 405 "claim": "VulInstruct discovered a previously unknown high-severity vulnerability (CVE-2025-56538) in the Cyrus IMAP server.", 406 "evidence": "Section 7 and Appendix E detail the case study: a privilege bypass in mboxlist_renamemailbox via a goto statement bypassing ACL checks. 
The vulnerability was confirmed by developers, reproduced via integration tests, and patched.", 407 "supported": "strong" 408 } 409 ], 410 "red_flags": [ 411 { 412 "flag": "No uncertainty quantification", 413 "detail": "All results across Tables 1, 2, and 3 are single-point estimates with no confidence intervals, error bars, variance, or statistical tests. For LLM-based systems with inherent stochasticity, this is a significant omission that makes it impossible to assess result stability." 414 }, 415 { 416 "flag": "LLM-as-Judge circular dependency", 417 "detail": "The CORRECT evaluation framework uses GPT-5 to judge reasoning correctness. Using an LLM to evaluate other LLMs' vulnerability reasoning introduces potential bias. The paper acknowledges this in Section 8 but only notes they use a different model from those evaluated." 418 }, 419 { 420 "flag": "Threshold tuned on test data", 421 "detail": "The threshold analysis (Figure 4) randomly samples 100 of 435 test pairs to select the optimal threshold. Tuning hyperparameters on test data compromises evaluation integrity. Additionally, the analysis finds (5,5,5) optimal but Section 5.4 uses (6,6,6) without explaining the discrepancy." 422 }, 423 { 424 "flag": "Undisclosed compute costs", 425 "detail": "VulInstruct requires multiple LLM calls per example (query generation, scoring ~20 retrieved items, specification generation, detection), making it substantially more expensive than single-call baselines. This cost is never quantified or discussed." 426 }, 427 { 428 "flag": "LLM pre-training contamination unaddressed", 429 "detail": "While the knowledge base is temporally filtered, the underlying LLMs (DeepSeek-V3, GPT-5, etc.) may have been pre-trained on PrimeVul data, CVE descriptions, or patches used in evaluation. This contamination vector is not discussed." 
430 }, 431 { 432 "flag": "Single real-world case study presented as evidence of practical utility", 433 "detail": "The discovery of CVE-2025-56538 (Appendix E) is compelling but represents a single case using a different LLM (Gemini-2.5-Pro) than the main experiments. Generalizing from one case to 'practical value for real-world vulnerability discovery' overstates the evidence." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "Vulnerability Detection with Code Language Models: How Far Are We?", 439 "authors": ["Yangruibo Ding", "Yanjun Fu", "Omniyyah Ibrahim"], 440 "year": 2024, 441 "arxiv_id": "2403.18624", 442 "relevance": "Introduces the PrimeVul benchmark used for evaluation and documents that LLMs achieve <12% accuracy distinguishing vulnerable from patched code." 443 }, 444 { 445 "title": "Everything You Wanted to Know About LLM-based Vulnerability Detection But Were Afraid to Ask", 446 "authors": ["Yue Li", "Xiao Li", "Hao Wu"], 447 "year": 2025, 448 "arxiv_id": "2504.13474", 449 "relevance": "Proposes the CORRECT evaluation framework requiring both correct predictions and valid reasoning, adopted as the primary evaluation methodology." 450 }, 451 { 452 "title": "Large Language Model-Powered Smart Contract Vulnerability Detection: New Perspectives", 453 "authors": ["Sihao Hu", "Tiansheng Huang", "Fatih İlhan"], 454 "year": 2023, 455 "relevance": "Proposes GPTLens multi-agent framework for vulnerability detection, used as a key baseline achieving the second-best F1 in comparisons." 456 }, 457 { 458 "title": "Let the Trial Begin: A Mock-Court Approach to Vulnerability Detection using LLM-Based Agents", 459 "authors": ["Ratnadira Widyasari", "Martin Weyssow"], 460 "year": 2025, 461 "arxiv_id": "2505.10961", 462 "relevance": "Introduces VulTrial multi-agent courtroom-inspired vulnerability detection framework, a key baseline with 102.39% improvement over single-agent approaches." 
463 }, 464 { 465 "title": "Boosting Vulnerability Detection of LLMs via Curriculum Preference Optimization with Synthetic Reasoning Data", 466 "authors": ["Xin-Cheng Wen", "Yijun Yang", "Cuiyun Gao"], 467 "year": 2025, 468 "doi": "10.18653/v1/2025.findings-acl.467", 469 "relevance": "Proposes ReVD, which trains LLMs for vulnerability detection via curriculum preference optimization on synthetic reasoning data; used as a fine-tuning baseline." 470 }, 471 { 472 "title": "Vul-RAG: Enhancing LLM-based Vulnerability Detection via Knowledge-Level RAG", 473 "authors": ["Xueying Du", "Geng Zheng", "Kaixin Wang"], 474 "year": 2024, 475 "relevance": "RAG-based vulnerability detection approach used as a baseline; its retrieval-based decision mechanism is analyzed in the failure study." 476 }, 477 { 478 "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?)", 479 "authors": ["Saad Ullah", "Mingji Han", "Saurabh Pujar"], 480 "year": 2024, 481 "arxiv_id": "2312.12575", 482 "relevance": "Comprehensive evaluation showing fundamental limitations of LLMs in vulnerability reasoning, motivating the specification-guided approach." 483 }, 484 { 485 "title": "Top Score on the Wrong Exam: On Benchmarking in Machine Learning for Vulnerability Detection", 486 "authors": ["Niklas Risse", "Jing Liu", "Marcel Böhme"], 487 "year": 2025, 488 "relevance": "Reports that most functions in vulnerability benchmarks cannot be classified without knowing invocation context, motivating richer context extraction." 489 }, 490 { 491 "title": "Harnessing Large Language Models for Software Vulnerability Detection: A Comprehensive Benchmarking Study", 492 "authors": ["Karl Tamberg", "Hayretdin Bahsi"], 493 "year": 2025, 494 "doi": "10.1109/access.2025.3541146", 495 "relevance": "Comprehensive benchmarking study of LLMs for vulnerability detection providing context for the state of the field."
496 }, 497 { 498 "title": "R2Vul: Learning to Reason about Software Vulnerabilities with Reinforcement Learning and Structured Reasoning Distillation", 499 "authors": ["Martin Weyssow", "Chengran Yang"], 500 "year": 2025, 501 "arxiv_id": "2504.04699", 502 "relevance": "Combines reinforcement learning with structured reasoning distillation for vulnerability detection with explanations." 503 }, 504 { 505 "title": "Benchmarking LLMs and LLM-based Agents in Practical Vulnerability Detection for Code Repositories", 506 "authors": ["Alperen Yildiz", "Sin G Teo"], 507 "year": 2025, 508 "arxiv_id": "2503.03586", 509 "relevance": "Benchmarks LLM-based agents for repository-level vulnerability detection, exploring ReAct agents for recursive callee retrieval." 510 }, 511 { 512 "title": "BugScope: Learn to Find Bugs Like Human", 513 "authors": ["Jinyao Guo", "Chengpeng Wang"], 514 "year": 2025, 515 "relevance": "Uses LLMs to summarize vulnerability cases and expand code patterns for vulnerability types, a related knowledge augmentation approach." 516 } 517 ] 518 }