scan.json (28877B)
1 { 2 "paper": { 3 "title": "LLM-BSCVM: An LLM-Based Blockchain Smart Contract Vulnerability Management Framework", 4 "authors": [ 5 "Yanli Jin", 6 "Chunpei Li", 7 "Peng Fan", 8 "Peng Liu", 9 "Xianxian Li", 10 "Chen Liu", 11 "Wangjie Qiu" 12 ], 13 "year": 2025, 14 "venue": "arXiv.org", 15 "arxiv_id": "2505.17416", 16 "doi": "10.48550/arXiv.2505.17416" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval", "case-study"], 21 "key_findings": "LLM-BSCVM proposes a six-agent pipeline for smart contract vulnerability management combining fine-tuned CodeLlama, static analysis, and RAG-based retrieval. The weighted fusion variant achieves 91.1% accuracy and 91.0% F1 on a benchmark dataset, with a 5.1% false positive rate (vs 7.2% for TrustLLM). However, only 21% of repaired contracts pass GPT-4 verification, indicating the repair capability remains limited.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper provides a GitHub link (https://github.com/sosol717/LLM-BSCVM) in both the abstract and Section I, stating the code is open-source." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The evaluation dataset uses publicly available sources: TrustLLM dataset from Solodit (263 audit reports) and Dappscan (1,199 open-source audit reports). These are existing public datasets." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No requirements.txt, Dockerfile, or environment specification is mentioned in the paper. No library versions or dependencies are listed." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No step-by-step reproduction instructions are provided in the paper. The paper does not describe how to set up and run the framework to replicate results." 
43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results in Tables I–IV are point estimates only (e.g., '91.11% accuracy'). No confidence intervals, error bars, or ± notation are provided." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims LLM-BSCVM outperforms baselines and that FPR is reduced, but no statistical significance tests (p-values, t-tests, etc.) are used to support these claims." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "The paper reports improvements with baseline context: FPR reduction from 7.2% to 5.1% (2.1pp), accuracy surpassing CodeLlama 13B by ~48 percentage points, and Table II provides full baseline results for context." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "No justification for the evaluation dataset size is provided. The paper states the data sources (263 + 1,199 audit reports) but does not justify why this is sufficient or discuss statistical power." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No standard deviations, variance, or multi-run results are reported. All results appear to be from single experimental runs." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Tables II and III compare against multiple baselines: CodeLlama 7B/13B, CodeBERT, CodeT5, Llama 8B (both zero-shot and LoRA fine-tuned), and TrustLLM." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Baselines include TrustLLM (2024), CodeLlama (2023), and CodeT5 (2021). The primary comparison target, TrustLLM, is contemporary."
82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Table IV presents an ablation study removing the static analysis module (w/o Static) and the RAG module (w/o RAG), showing their individual contributions to overall performance." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "The paper reports F1, Recall, Precision, Accuracy, and False Positive Rate across experiments." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "No human evaluation is performed. Patch verification uses GPT-4 as an automated verifier, not human experts. Vulnerability detection is evaluated purely via automated metrics." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": false, 101 "justification": "No explicit description of train/test splitting. The paper states it uses 'the same dataset as TrustLLM' but does not describe how the data was split between fine-tuning and evaluation, or whether a held-out test set was used." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": false, 106 "justification": "No per-vulnerability-type breakdown is provided. All detection results are aggregate metrics across all vulnerability types." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section IV.B discusses LLM-BSCVM(E)'s performance decline due to excessive contextual information, and acknowledges that only 21% of repaired contracts pass verification, with analysis of why repairs fail." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "LLM-BSCVM(E) shows worse performance (80.4% accuracy) when incorporating retrieved context directly into detection, contradicting the expected benefit. The 21% repair success rate is also candidly reported." 
117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims of 91% F1 and accuracy match Table I (91.04% F1, 91.11% accuracy). FPR reduction from 7.2% to 5.1% is confirmed in Section IV.B. Claim of being 'comparable to SOTA' is supported by Table III comparison with TrustLLM." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The ablation study (Table IV) provides controlled single-variable manipulation evidence for claims about component contributions. The paper states 'removal of the static analysis module led to a significant decline' with ablation evidence." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper claims to provide 'comprehensive vulnerability management services' for 'the Web 3.0 ecosystem' (abstract and conclusion) but only tests on Solidity smart contracts from specific audit databases. The title claims 'Blockchain Smart Contract Vulnerability Management' broadly despite testing on a single platform." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "No alternative explanations for the observed results are discussed. The paper does not consider confounds such as dataset-specific patterns, selection bias in the audit report sources, or whether improvements come from ensembling rather than the proposed method." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper measures binary vulnerability detection accuracy and treats GPT-4 verification as ground truth for repair success, but does not acknowledge that automated detection accuracy is a proxy for actual security improvement, or that GPT-4 verification is a proxy for formal correctness verification." 
144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "The paper mentions 'CodeLlama,' 'CodeLlama-13B,' 'CodeBERT,' 'CodeT5,' 'Llama,' and 'GPT-4' without specific version strings, snapshot dates, or API versions." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": false, 155 "justification": "Figure 4 shows a prompt template with placeholders like '{code}' and vague references to '<Audited Smart Contracts>', '<Vulnerability Description>', '<Best practices>'. The actual fill values and complete prompt text are not provided." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": false, 160 "justification": "Only the retrieval parameter k=5 and detection weight distribution (70%/10%/20%) are reported. No LLM inference hyperparameters (temperature, top-p, max tokens) or LoRA fine-tuning hyperparameters (learning rate, epochs, rank) are stated." 161 }, 162 "scaffolding_described": { 163 "applies": true, 164 "answer": true, 165 "justification": "The six-agent pipeline is described in detail in Section III.B with specific roles (Detector, Advisor, Assessor, Fixer, Verifier, Reporter), information flow between agents, and the RAG retrieval mechanism. Figures 1-3 illustrate the architecture." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": false, 170 "justification": "The knowledge base construction (TF-IDF vectorization, chunking, embedding) is described, but the evaluation data preprocessing is not documented. How the 263 TrustLLM and 1,199 Dappscan reports were processed into evaluation examples, or how labels were assigned, is not described." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "No dedicated limitations section exists. 
The conclusion mentions future work directions (symbolic execution integration, human-computer interaction) but does not substantively discuss the current approach's limitations." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No threats to validity are discussed anywhere in the paper." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "No explicit scope boundaries are stated. The paper does not describe what it did NOT test (e.g., non-Solidity contracts, real-world deployment, cross-chain vulnerabilities) or what claims it is NOT making." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "While the source datasets (TrustLLM, Dappscan) are publicly available, the specific processed evaluation dataset used in experiments is not released. The paper does not provide the exact set of contracts used for evaluation." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section IV.A describes the two data sources: TrustLLM from Solodit (263 smart contract audit reports) and Dappscan (1,199 open-source audit reports from 29 security teams)." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data sources are publicly available smart contract audit databases." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": false, 209 "justification": "The paper does not document how the raw audit reports were transformed into the evaluation dataset. The number of contracts actually used for evaluation, filtering criteria, and label assignment process are not described." 
210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding source or acknowledgments section is present in the paper." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are listed: Guangxi Normal University, Zhongguancun Laboratory, and Beihang University. These are academic institutions not directly connected to the evaluated tools." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "No funding is disclosed, making it impossible to assess funder independence." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests or financial interests statement is present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "No training data cutoff date is stated for CodeLlama or any other model used. The fine-tuning data's temporal scope is also not specified." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of potential train/test overlap. Critically, the RAG knowledge base is built from the same TrustLLM/Solodit data used for evaluation, creating a potential data leakage issue that is not addressed." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "No discussion of whether the benchmark contracts could have appeared in CodeLlama's pretraining data. The audit reports from Solodit and Dappscan are publicly available and could be in the training corpus." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 
256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No inference cost, latency, or token consumption is reported despite the framework invoking multiple LLM calls across six agents plus RAG retrieval and GPT-4 verification." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No GPU hours, training time for LoRA fine-tuning, total API spend, or hardware specifications are stated." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of experimental runs is never stated. It is unclear whether results are from one run or averaged." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search budget is reported. 
The weight distribution (70%/10%/20%) and k=5 appear to have been chosen without a documented search process." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": false, 319 "justification": "The selection of the weighted fusion approach LLM-BSCVM(W) as the best configuration is presented post-hoc from Table I results, with no description of how the 70/10/20 weights were selected or validated." 320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "No acknowledgment that the authors are evaluating their own system. The baselines use the authors' implementations without discussion of potential bias." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "LLM-BSCVM uses fine-tuned CodeLlama + static analysis + RAG + GPT-4 verification, far more compute than bare baseline models, but no compute-matched comparison is provided or discussed." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "No discussion of whether the benchmark (binary safe/vulnerable classification of audit report contracts) actually measures vulnerability management effectiveness, or whether audit-report-derived labels represent real-world vulnerability prevalence." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": true, 343 "answer": false, 344 "justification": "LLM-BSCVM combines fine-tuning + RAG + static analysis + multi-agent scaffolding but is compared against bare models without any scaffolding. The improvement could come from the scaffolding ensemble rather than the proposed method's novelty. This confound is not addressed."
345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "No discussion of temporal leakage. The audit reports used for evaluation may predate CodeLlama's training data cutoff, meaning solutions could be in the pretraining corpus." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "The RAG retrieval corpus is sourced from the same TrustLLM/Solodit data used for evaluation. Retrieved similar contracts during detection may contain the evaluation target's ground truth label, constituting feature leakage. This is not addressed." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No discussion of whether fine-tuning data and evaluation data share contracts from the same audit reports, projects, or authors, which would violate independence assumptions." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No leakage detection or prevention methods are applied. No decontamination, temporal splits, or membership inference tests are used." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "LLM-BSCVM(W) achieves 91.11% detection accuracy and 91.04% F1 score on the benchmark dataset.", 373 "evidence": "Table I shows LLM-BSCVM(W) with F1=0.9104, Recall=0.8743, Precision=0.9506, Accuracy=0.9111.", 374 "supported": "moderate" 375 }, 376 { 377 "claim": "LLM-BSCVM reduces the false positive rate from 7.2% (TrustLLM) to 5.1%.", 378 "evidence": "Section IV.B states the false positive rate comparison. No statistical test confirms the difference is significant.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "LLM-BSCVM outperforms all zero-shot baseline models, surpassing CodeLlama 13B by approximately 48 percentage points in accuracy.", 383 "evidence": "Table II shows CodeLlama 13B at 42.55% accuracy vs LLM-BSCVM(W) at 91.11%.
However, the comparison is between a scaffolded, fine-tuned system and bare zero-shot models.", 384 "supported": "weak" 385 }, 386 { 387 "claim": "Approximately 21% of repaired contracts successfully passed independent GPT-4 validation.", 388 "evidence": "Section IV.B ('Vulnerability repair') states this result with acknowledgment that further optimization is needed.", 389 "supported": "weak" 390 }, 391 { 392 "claim": "Excessive contextual information in LLM-BSCVM(E) distracts the model and reduces detection accuracy.", 393 "evidence": "Table I shows LLM-BSCVM(E) at 80.42% accuracy vs 91.11% for LLM-BSCVM(W). The paper speculates this is due to attention distraction from excessive context.", 394 "supported": "moderate" 395 }, 396 { 397 "claim": "LLM-BSCVM is the first end-to-end smart contract vulnerability management framework.", 398 "evidence": "Stated as a contribution in Section I, but no systematic literature search or evidence is provided to confirm no prior end-to-end framework exists.", 399 "supported": "unsupported" 400 } 401 ], 402 "red_flags": [ 403 { 404 "flag": "Unfair baseline comparison", 405 "detail": "Table II compares LLM-BSCVM (fine-tuned CodeLlama + RAG + static analysis + multi-agent pipeline) against bare zero-shot models with no scaffolding, fine-tuning, or retrieval. The 48pp accuracy gap largely reflects the unfairness of the comparison rather than methodological novelty." 406 }, 407 { 408 "flag": "Potential data leakage between RAG corpus and evaluation set", 409 "detail": "The smart contract corpus used for RAG retrieval is sourced from the same TrustLLM/Solodit data used for evaluation. Retrieved similar contracts may carry ground truth labels or vulnerability information that leaks into the detection decision. This is never acknowledged." 410 }, 411 { 412 "flag": "No uncertainty quantification", 413 "detail": "All results are single-run point estimates with no error bars, confidence intervals, or multi-seed analysis. 
The 0.17pp difference between LLM-BSCVM(W) and TrustLLM in F1 (0.9104 vs 0.9121) could easily be within noise." 414 }, 415 { 416 "flag": "Low repair success rate underemphasized", 417 "detail": "Only 21% of repaired contracts pass GPT-4 verification, meaning 79% of repairs fail. This is buried in Section IV.B and not mentioned in the abstract, which focuses only on detection results. The abstract claim of 'enhancing the reliability of vulnerability management' is misleading given this repair failure rate." 418 }, 419 { 420 "flag": "GPT-4 as sole evaluator for repair quality", 421 "detail": "Patch correctness is evaluated solely by GPT-4, with no human expert review, formal verification, or test suite execution. GPT-4's own limitations in vulnerability assessment are not discussed." 422 }, 423 { 424 "flag": "Claims significantly outrun evidence", 425 "detail": "The paper claims to provide 'comprehensive vulnerability management services' for 'the Web 3.0 ecosystem' based on binary detection evaluation on Solidity contracts from two specific audit databases, with no cross-chain, cross-language, or real-world deployment evaluation." 426 }, 427 { 428 "flag": "Missing train/test split documentation", 429 "detail": "The paper does not describe how the dataset was split between fine-tuning and evaluation. Whether the LoRA fine-tuning data overlaps with test data is unknown." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "GPTScan: Detecting Logic Vulnerabilities in Smart Contracts by Combining GPT with Program Analysis", 435 "authors": ["Y. Sun", "D. Wu", "Y. Xue", "H. Liu", "H. Wang", "Z. Xu", "X. Xie", "Y. Liu"], 436 "year": 2024, 437 "relevance": "LLM-based smart contract vulnerability detection combining GPT with static analysis, directly comparable approach." 438 }, 439 { 440 "title": "Combining Fine-Tuning and LLM-Based Agents for Intuitive Smart Contract Auditing with Justifications", 441 "authors": ["W. Ma", "D. Wu", "Y. Sun", "T. Wang", "S. Liu", "J. 
Zhang", "Y. Xue", "Y. Liu"], 442 "year": 2024, 443 "arxiv_id": "2403.16073", 444 "relevance": "TrustLLM framework: primary comparison baseline and data source for LLM-BSCVM evaluation." 445 }, 446 { 447 "title": "Large Language Model-Powered Smart Contract Vulnerability Detection: New Perspectives", 448 "authors": ["S. Hu", "T. Huang", "F. Ilhan", "S. F. Tekin", "L. Liu"], 449 "year": 2023, 450 "relevance": "GPTLENS: adversarial framework using GPT-4 for smart contract vulnerability mining." 451 }, 452 { 453 "title": "LLM4Vuln: A Unified Evaluation Framework for Decoupling and Enhancing LLMs' Vulnerability Reasoning", 454 "authors": ["Y. Sun", "D. Wu", "Y. Xue"], 455 "year": 2024, 456 "arxiv_id": "2401.16185", 457 "relevance": "Framework for evaluating LLM vulnerability detection performance with knowledge enhancement." 458 }, 459 { 460 "title": "LLMSmartSec: Smart Contract Security Auditing with LLM and Annotated Control Flow Graph", 461 "authors": ["V. Mothukuri", "R. M. Parizi", "J. L. Massa"], 462 "year": 2024, 463 "relevance": "LLM-based smart contract security auditing using GPT-4 and graph analysis." 464 }, 465 { 466 "title": "Exploring ChatGPT's Capabilities on Vulnerability Management", 467 "authors": ["P. Liu", "J. Liu", "L. Fu", "K. Lu", "Y. Xia", "X. Zhang", "W. Chen", "H. Weng", "S. Ji", "W. Wang"], 468 "year": 2024, 469 "relevance": "USENIX Security paper evaluating ChatGPT for vulnerability management tasks — direct precursor to LLM-BSCVM's scope." 470 }, 471 { 472 "title": "ContractTinker: LLM-Empowered Vulnerability Repair for Real-World Smart Contracts", 473 "authors": ["C. Wang", "J. Zhang", "J. Gao", "L. Xia", "Z. Guan", "Z. Chen"], 474 "year": 2024, 475 "relevance": "LLM-based smart contract vulnerability repair, directly related to LLM-BSCVM's repair agent design." 476 }, 477 { 478 "title": "AgentVerse: Facilitating Multi-Agent Collaboration and Exploring Emergent Behaviors in Agents", 479 "authors": ["W. Chen", "Y. Su", "J. Zuo", "C. 
Yang"], 480 "year": 2023, 481 "arxiv_id": "2308.10848", 482 "relevance": "Multi-agent collaboration framework that inspired LLM-BSCVM's task decomposition approach." 483 }, 484 { 485 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 486 "authors": ["P. Lewis", "E. Perez", "A. Piktus"], 487 "year": 2020, 488 "relevance": "Foundational RAG paper; the retrieval-augmented generation technique is a core component of LLM-BSCVM." 489 }, 490 { 491 "title": "Code Llama: Open Foundation Models for Code", 492 "authors": ["B. Roziere", "J. Gehring", "F. Gloeckle"], 493 "year": 2023, 494 "arxiv_id": "2308.12950", 495 "relevance": "Base model used for LLM-BSCVM's fine-tuned vulnerability detection and multi-agent inference." 496 }, 497 { 498 "title": "DiverseVul: A New Vulnerable Source Code Dataset for Deep Learning Based Vulnerability Detection", 499 "authors": ["Y. Chen", "Z. Ding", "L. Alowain", "X. Chen", "D. Wagner"], 500 "year": 2023, 501 "relevance": "Vulnerability dataset for deep learning detection; representative of the deep learning approaches LLM-BSCVM compares against." 502 } 503 ], 504 "engagement_factors": { 505 "practical_relevance": { 506 "score": 2, 507 "justification": "Open-source framework for smart contract auditing that blockchain developers could potentially use, though 21% repair success rate limits practical utility." 508 }, 509 "surprise_contrarian": { 510 "score": 0, 511 "justification": "Confirms the expected direction that LLMs combined with RAG and multi-agent systems can improve vulnerability detection." 512 }, 513 "fear_safety": { 514 "score": 1, 515 "justification": "Addresses smart contract security which has caused significant financial losses, but the approach is defensive rather than demonstrating novel attacks." 516 }, 517 "drama_conflict": { 518 "score": 0, 519 "justification": "No controversy or provocative claims; standard incremental improvement narrative." 
520 }, 521 "demo_ability": { 522 "score": 2, 523 "justification": "Code is open-sourced on GitHub, though no live demo or pip-installable package is provided." 524 }, 525 "brand_recognition": { 526 "score": 0, 527 "justification": "From Guangxi Normal University, Zhongguancun Laboratory, and Beihang University — not widely recognized in the Western AI research community." 528 } 529 } 530 }