scan.json (21953B)
1 { 2 "paper": { 3 "title": "Multi-Agent Collaborative Fuzzing with Continuous Reflection for Smart Contracts Vulnerability Detection", 4 "authors": ["Jie Chen", "Liangmin Wang"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2511.12164", 8 "doi": "10.48550/arXiv.2511.12164" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "SmartFuzz, an LLM-driven multi-agent collaborative fuzzer for smart contracts, detects 5.8%-74.7% more vulnerabilities than existing tools within 30 minutes. The continuous reflection process is critical, with a 90.3% performance drop when disabled. Code-specialized LLMs (CodeGemma, CodeQwen, CodeLlama) outperform general models by 10.4%-15.6%. On real-world contracts, SmartFuzz detects 97.2% of true vulnerabilities with only 3 false negatives across 108 contracts.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The paper states 'The source code of the SmartFuzz is available at' but the URL appears to be missing/redacted from the paper text. No working repository link is provided." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "D1 is derived from a labeled dataset [22] and D2 from MuFuzz [15], both publicly available. The 34 DApp projects are collected from GitHub and Etherscan with sources listed in Table 3." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "Hardware is described (Intel Xeon E5-2678 v3, 128GB RAM, four 2080Ti GPUs) and Ollama and CrewAI are mentioned, but no requirements.txt, library versions, or dependency specifications are provided." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The paper describes the system architecture but lacks a README or commands to replicate experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results in Tables 2 and 3 report only point estimates (TP, FN counts). No confidence intervals or error bars are provided." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims SmartFuzz outperforms baselines but provides no statistical significance tests. Comparisons are based solely on raw count differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports percentage improvements with baseline context, e.g., '74.7% more vulnerabilities than Mythril' with the calculation shown (150-35)/154, and '80% reduction in false negatives' ((5-1)/5)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification for why 85 contracts in D1 or 108 in D2 were selected beyond tool compatibility. No power analysis." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported across runs. Results appear to be from single runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Multiple baselines are included: Mythril, SmarTest, Smartian, ILF, RLF for RQ1; Oyente, sFuzz, ConFuzzius, MuFuzz for RQ3." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Baselines include recent tools like MuFuzz (2024), RLF (2022), and ConFuzzius (2021), which represent state-of-the-art in smart contract fuzzing." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "RQ2 ablates the continuous reflection process (SmartFuzzwor with max_reflection_round=0) and tests multiple LLM engines (CodeLlama, LLaMA3, CodeQwen, CodeGemma)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper reports true positives (TP), false negatives (FN), timeout/error cases (TE), and detection time, across multiple vulnerability categories." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "For DApp projects, the authors state 'we implement SmartFuzz to analyze more than 100 DApps in total and then conduct a manual audit for each project.'" 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "2-fold cross-validation is used for D1. For RQ3, D2 contracts are 'treated as unseen contracts' separate from the 10 D1 examples used for learning." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 2 breaks down by EL and SC vulnerability types. Table 3 provides breakdown across 6 vulnerability types (BD, UE, UD, EF, RE, TO)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The DApp projects table shows failure counts (#Fail column) and the paper notes 'an average of 12.36% of the analyzed contract files failed, with most failures resulting from exceeding the time limit.'" 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that SmartFuzzlm (LLaMA3) performs 10.4-15.6% worse than code-specialized models, and that hallucinations increase beyond 5 reflection rounds for some models." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims of '5.8%-74.7% more vulnerabilities within 30 minutes' and 'reduces false negatives by up to 80%' are supported by Tables 2 and 3." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The ablation study (SmartFuzzwor) provides controlled single-variable manipulation to justify the causal claim that the reflection process drives performance. The 90.3% drop with reflection disabled supports this." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title says 'Smart Contracts Vulnerability Detection' generally, but results are limited to Ethereum/Solidity contracts with specific vulnerability types. No discussion of whether results extend to other blockchain platforms." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for performance gains. Could the improvement come from simply using LLMs rather than the specific multi-agent architecture? No confound analysis." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures true positive vulnerability counts against labeled ground truth, which directly matches the claimed outcome of vulnerability detection. No proxy gap exists." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are listed as 'CodeGemma', 'CodeLlama', 'LLaMA3', 'CodeQwen', 'DeepSeek-R1' with rough parameter sizes (7b, 8b, 16b) but no specific version strings or snapshot dates." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper describes agent actions and roles but does not provide the actual prompt text used. Actions are described in natural language (e.g., 'findFuncs', 'pickVulFuncs') without showing the prompts." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Max reflection rounds (10) and timeout (30 minutes) are stated, but no LLM hyperparameters (temperature, top-p, max tokens) are reported." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The multi-agent architecture is described in detail: 6 agents (TxSeqDrafter, TxSeqRefiner, FunChecker, ArgChecker, SNDChecker, AMTChecker), their roles, permission-aware actions (Table 6), the RCC workflow, and feedback mechanisms." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "D1 filtering is described: 'we select only the 85 vulnerable contracts that can be successfully analyzed by all the evaluated tools.' D2 source and selection from MuFuzz is documented. DApp collection sources are listed." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations or threats-to-validity section in the paper." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed anywhere in the paper." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings, platforms, or vulnerability types are excluded from its claims." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (execution logs, per-contract results) is made available for independent verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "D1 source is described as derived from labeled dataset [22], D2 from MuFuzz [15], and DApp projects collected from GitHub and Etherscan platforms." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard benchmarks and public repositories." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from dataset selection (85 from D1 filtered by tool compatibility, 108 from D2, 34 DApps from public platforms) through execution and oracle-based verification is documented." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are listed: Southeast University and Engineering Research Center of BASAM of Ministry of Education." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding information disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial disclosure statement is present." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper tests a fuzzing tool's ability to generate attack sequences, not a pre-trained model's knowledge of benchmark answers. The LLMs are used as reasoning engines, not evaluated on benchmark knowledge." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Not applicable — the paper evaluates a fuzzing tool, not model knowledge on benchmarks." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Not applicable — the evaluation measures whether generated transaction sequences trigger known vulnerabilities, not whether the model has memorized solutions." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in the study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Wall-clock time is reported: 30-minute timeout for main experiments, and per-contract analysis times for DApps (ranging from 1m 48s to 5m 7s). Figure 4 shows vulnerability detection over time." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Hardware is specified: two Intel Xeon E5-2678 v3 CPUs (24 cores, 48 threads), 128GB RAM, four 2080Ti GPUs. Local Ollama deployment is described." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No reporting of results across multiple random seeds. The 2-fold cross-validation splits the dataset but does not assess seed sensitivity of the LLM-based fuzzing process." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs per configuration is not stated. It is unclear if results are from single runs or averaged." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. The max reflection round of 10 and RLF reward of 0.7 appear chosen without documented search." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "DeepSeek-R1 is used as the 'default' LLM engine without justification for why it was selected over the other tested models." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across many tool×vulnerability-type comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors compare their system against baselines without acknowledging potential bias from implementing/configuring the evaluation themselves." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "SmartFuzz uses LLM inference (multiple agents, multiple reflection rounds) which is far more compute-intensive than traditional fuzzers, but this compute difference is never discussed or controlled for." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the labeled vulnerability datasets adequately represent real-world vulnerability detection needs, or whether TP/FN on labeled data measures practical security impact." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "SmartFuzz bundles a specific multi-agent scaffold with specific LLMs. When comparing against non-LLM baselines, the scaffold effect is completely confounded with the LLM effect." 337 } 338 } 339 }, 340 "claims": [ 341 { 342 "claim": "SmartFuzz detects 5.8%-74.7% more vulnerabilities than existing state-of-the-art tools within 30 minutes", 343 "evidence": "Table 2 shows SmartFuzz finds 150/154 true vulnerabilities vs Mythril (35), SmarTest (103), Smartian (43), ILF (129), RLF (141). Section 4.2.", 344 "supported": "moderate" 345 }, 346 { 347 "claim": "SmartFuzz reduces false negatives by up to 80%", 348 "evidence": "Table 3 shows BD false negatives reduced from 5 (MuFuzz) to 1 (SmartFuzz), i.e., 80% reduction. Section 4.4.", 349 "supported": "moderate" 350 }, 351 { 352 "claim": "The continuous reflection process is critical, with 90.3% performance drop when disabled", 353 "evidence": "Figure 5 shows SmartFuzzwor (no reflection) detects only 11 vulnerabilities vs 150 with reflection. Section 4.3.", 354 "supported": "moderate" 355 }, 356 { 357 "claim": "Code-specialized LLMs outperform general models by 10.4%-15.6% in vulnerability detection", 358 "evidence": "Figure 6 shows CodeLlama (134/154), CodeQwen (137/154), CodeGemma (129/154) vs LLaMA3 (113/154). Section 4.3.", 359 "supported": "moderate" 360 }, 361 { 362 "claim": "SmartFuzz detects 97.2% of true vulnerabilities on real-world contracts", 363 "evidence": "Table 3 shows 105/108 true positives across 6 vulnerability categories on D2. Section 4.4.", 364 "supported": "moderate" 365 } 366 ], 367 "red_flags": [ 368 { 369 "flag": "No statistical tests", 370 "detail": "All performance comparisons are based on raw counts with no statistical significance testing, despite the stochastic nature of LLM-based fuzzing." 371 }, 372 { 373 "flag": "No variance or multiple runs reported", 374 "detail": "LLM-based generation is inherently stochastic, but results appear to be from single runs with no variance reporting. The 2-fold CV does not address run-to-run variability." 375 }, 376 { 377 "flag": "No limitations section", 378 "detail": "The paper has no discussion of limitations, threats to validity, or scope boundaries." 379 }, 380 { 381 "flag": "Unfair compute comparison", 382 "detail": "SmartFuzz uses multiple LLM agents with multiple reflection rounds (substantial GPU compute) while baselines use traditional algorithms. The 30-minute wall-clock comparison masks vastly different compute costs." 383 }, 384 { 385 "flag": "Missing code repository", 386 "detail": "The paper appears to reference a code repository but the URL is missing/redacted, preventing verification of claims." 387 }, 388 { 389 "flag": "Selection bias in D1 dataset", 390 "detail": "D1 filters to only 85 contracts 'that can be successfully analyzed by all the evaluated tools,' potentially excluding contracts where SmartFuzz might struggle." 391 } 392 ], 393 "cited_papers": [ 394 { 395 "title": "Why do multi-agent llm systems fail?", 396 "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"], 397 "year": 2025, 398 "arxiv_id": "2503.13657", 399 "relevance": "Directly studies failure modes in multi-agent LLM systems, relevant to understanding agentic AI reliability." 400 }, 401 { 402 "title": "FuzzGPT: Large language models are edge-case generators: Crafting unusual programs for fuzzing deep learning libraries", 403 "authors": ["Y. Deng", "C. S. Xia", "C. Yang"], 404 "year": 2024, 405 "relevance": "Uses LLMs for fuzzing deep learning libraries, directly relevant to LLM-driven testing and code generation." 406 }, 407 { 408 "title": "Fuzzing javascript interpreters with coverage-guided reinforcement learning for LLM-based mutation", 409 "authors": ["J. Eom", "S. Jeong", "T. Kwon"], 410 "year": 2024, 411 "relevance": "Combines LLMs with reinforcement learning for fuzzing, relevant to LLM-augmented software testing." 412 }, 413 { 414 "title": "On the reliability of coverage-based fuzzer benchmarking", 415 "authors": ["M. Böhme", "L. Szekeres", "J. Metzman"], 416 "year": 2022, 417 "relevance": "Meta-research on benchmarking methodology for fuzzers, relevant to evaluation rigor in software testing research." 418 }, 419 { 420 "title": "Mufuzz: Sequence-aware mutation and seed mask guidance for blockchain smart contract fuzzing", 421 "authors": ["P. Qian", "H. Wu", "Z. Du"], 422 "year": 2024, 423 "relevance": "State-of-the-art smart contract fuzzer used as primary baseline, relevant to AI-augmented security testing." 424 }, 425 { 426 "title": "Smartian: Enhancing smart contract fuzzing with static and dynamic data-flow analyses", 427 "authors": ["J. Choi", "D. Kim", "S. Kim"], 428 "year": 2021, 429 "relevance": "Smart contract fuzzing tool combining static and dynamic analysis, relevant to automated software testing." 430 }, 431 { 432 "title": "Effectively generating vulnerable transaction sequences in smart contracts with reinforcement learning-guided fuzzing", 433 "authors": ["J. Su", "H. Dai", "L. Zhao"], 434 "year": 2022, 435 "relevance": "RL-guided fuzzing for smart contracts, directly relevant to AI-driven security testing." 436 } 437 ] 438 }