scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21953B)
      1 {
      2   "paper": {
      3     "title": "Multi-Agent Collaborative Fuzzing with Continuous Reflection for Smart Contracts Vulnerability Detection",
      4     "authors": ["Jie Chen", "Liangmin Wang"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2511.12164",
      8     "doi": "10.48550/arXiv.2511.12164"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "SmartFuzz, an LLM-driven multi-agent collaborative fuzzer for smart contracts, detects 5.8%-74.7% more vulnerabilities than existing tools within 30 minutes. The continuous reflection process is critical, with a 90.3% performance drop when disabled. Code-specialized LLMs (CodeGemma, CodeQwen, CodeLlama) outperform general models by 10.4%-15.6%. On real-world contracts, SmartFuzz detects 97.2% of true vulnerabilities with only 3 false negatives across 108 contracts.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The paper states 'The source code of the SmartFuzz is available at' but the URL appears to be missing/redacted from the paper text. No working repository link is provided."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "D1 is derived from a labeled dataset [22] and D2 from MuFuzz [15], both publicly available. The 34 DApp projects are collected from GitHub and Etherscan with sources listed in Table 3."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "Hardware is described (Intel Xeon E5-2678 v3, 128GB RAM, four 2080Ti GPUs) and Ollama and CrewAI are mentioned, but no requirements.txt, library versions, or dependency specifications are provided."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided. The paper describes the system architecture but lacks a README or commands to replicate experiments."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results in Tables 2 and 3 report only point estimates (TP, FN counts). No confidence intervals or error bars are provided."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims SmartFuzz outperforms baselines but provides no statistical significance tests. Comparisons are based solely on raw count differences."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports percentage improvements with baseline context, e.g., '74.7% more vulnerabilities than Mythril' with the calculation shown (150-35)/154, and '80% reduction in false negatives' ((5-1)/5)."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for why 85 contracts in D1 or 108 in D2 were selected beyond tool compatibility. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported across runs. Results appear to be from single runs."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Multiple baselines are included: Mythril, SmarTest, Smartian, ILF, RLF for RQ1; Oyente, sFuzz, ConFuzzius, MuFuzz for RQ3."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include recent tools like MuFuzz (2024), RLF (2022), and ConFuzzius (2021), which represent state-of-the-art in smart contract fuzzing."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "RQ2 ablates the continuous reflection process (SmartFuzzwor with max_reflection_round=0) and tests multiple LLM engines (CodeLlama, LLaMA3, CodeQwen, CodeGemma)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper reports true positives (TP), false negatives (FN), timeout/error cases (TE), and detection time, across multiple vulnerability categories."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "For DApp projects, the authors state 'we implement SmartFuzz to analyze more than 100 DApps in total and then conduct a manual audit for each project.'"
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "2-fold cross-validation is used for D1. For RQ3, D2 contracts are 'treated as unseen contracts' separate from the 10 D1 examples used for learning."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 breaks down by EL and SC vulnerability types. Table 3 provides breakdown across 6 vulnerability types (BD, UE, UD, EF, RE, TO)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The DApp projects table shows failure counts (#Fail column) and the paper notes 'an average of 12.36% of the analyzed contract files failed, with most failures resulting from exceeding the time limit.'"
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that SmartFuzzlm (LLaMA3) performs 10.4%-15.6% worse than code-specialized models, and that hallucinations increase beyond 5 reflection rounds for some models."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of '5.8%-74.7% more vulnerabilities within 30 minutes' and 'reduces false negatives by up to 80%' are supported by Tables 2 and 3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The ablation study (SmartFuzzwor) provides controlled single-variable manipulation to justify the causal claim that the reflection process drives performance. The 90.3% drop with reflection disabled supports this."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title says 'Smart Contracts Vulnerability Detection' generally, but results are limited to Ethereum/Solidity contracts with specific vulnerability types. No discussion of whether results extend to other blockchain platforms."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for performance gains. Could the improvement come from simply using LLMs rather than the specific multi-agent architecture? No confound analysis."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures true positive vulnerability counts against labeled ground truth, which directly matches the claimed outcome of vulnerability detection. No proxy gap exists."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are listed as 'CodeGemma', 'CodeLlama', 'LLaMA3', 'CodeQwen', 'DeepSeek-R1' with rough parameter sizes (7b, 8b, 16b) but no specific version strings or snapshot dates."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper describes agent actions and roles but does not provide the actual prompt text used. Actions are described in natural language (e.g., 'findFuncs', 'pickVulFuncs') without showing the prompts."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Max reflection rounds (10) and timeout (30 minutes) are stated, but no LLM hyperparameters (temperature, top-p, max tokens) are reported."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "The multi-agent architecture is described in detail: 6 agents (TxSeqDrafter, TxSeqRefiner, FunChecker, ArgChecker, SNDChecker, AMTChecker), their roles, permission-aware actions (Table 6), the RCC workflow, and feedback mechanisms."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "D1 filtering is described: 'we select only the 85 vulnerable contracts that can be successfully analyzed by all the evaluated tools.' D2 source and selection from MuFuzz is documented. DApp collection sources are listed."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is no dedicated limitations or threats-to-validity section in the paper."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed anywhere in the paper."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "No explicit scope boundaries are stated. The paper does not discuss what settings, platforms, or vulnerability types are excluded from its claims."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw experimental data (execution logs, per-contract results) is made available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "D1 source is described as derived from labeled dataset [22], D2 from MuFuzz [15], and DApp projects collected from GitHub and Etherscan platforms."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data sources are standard benchmarks and public repositories."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline from dataset selection (85 from D1 filtered by tool compatibility, 108 from D2, 34 DApps from public platforms) through execution and oracle-based verification is documented."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Southeast University and Engineering Research Center of BASAM of Ministry of Education."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial disclosure statement is present."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper tests a fuzzing tool's ability to generate attack sequences, not a pre-trained model's knowledge of benchmark answers. The LLMs are used as reasoning engines, not evaluated on benchmark knowledge."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Not applicable — the paper evaluates a fuzzing tool, not model knowledge on benchmarks."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Not applicable — the evaluation measures whether generated transaction sequences trigger known vulnerabilities, not whether the model has memorized solutions."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in the study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in the study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in the study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in the study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in the study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in the study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in the study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Wall-clock time is reported: 30-minute timeout for main experiments, and per-contract analysis times for DApps (ranging from 1m 48s to 5m 7s). Figure 4 shows vulnerability detection over time."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": true,
    289         "justification": "Hardware is specified: two Intel Xeon E5-2678 v3 CPUs (24 cores, 48 threads), 128GB RAM, four 2080Ti GPUs. Local Ollama deployment is described."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No reporting of results across multiple random seeds. The 2-fold cross-validation splits the dataset but does not assess seed sensitivity of the LLM-based fuzzing process."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs per configuration is not stated. It is unclear if results are from single runs or averaged."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. The max reflection round of 10 and RLF reward of 0.7 appear chosen without documented search."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "DeepSeek-R1 is used as the 'default' LLM engine without justification for why it was selected over the other tested models."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across many tool×vulnerability-type comparisons."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors compare their system against baselines without acknowledging potential bias from implementing/configuring the evaluation themselves."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "SmartFuzz uses LLM inference (multiple agents, multiple reflection rounds) which is far more compute-intensive than traditional fuzzers, but this compute difference is never discussed or controlled for."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the labeled vulnerability datasets adequately represent real-world vulnerability detection needs, or whether TP/FN on labeled data measures practical security impact."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "SmartFuzz bundles a specific multi-agent scaffold with specific LLMs. When comparing against non-LLM baselines, the scaffold effect is completely confounded with the LLM effect."
    337       }
    338     }
    339   },
    340   "claims": [
    341     {
    342       "claim": "SmartFuzz detects 5.8%-74.7% more vulnerabilities than existing state-of-the-art tools within 30 minutes",
    343       "evidence": "Table 2 shows SmartFuzz finds 150/154 true vulnerabilities vs Mythril (35), SmarTest (103), Smartian (43), ILF (129), RLF (141). Section 4.2.",
    344       "supported": "moderate"
    345     },
    346     {
    347       "claim": "SmartFuzz reduces false negatives by up to 80%",
    348       "evidence": "Table 3 shows BD false negatives reduced from 5 (MuFuzz) to 1 (SmartFuzz), i.e., 80% reduction. Section 4.4.",
    349       "supported": "moderate"
    350     },
    351     {
    352       "claim": "The continuous reflection process is critical, with 90.3% performance drop when disabled",
    353       "evidence": "Figure 5 shows SmartFuzzwor (no reflection) detects only 11 vulnerabilities vs 150 with reflection. Section 4.3.",
    354       "supported": "moderate"
    355     },
    356     {
    357       "claim": "Code-specialized LLMs outperform general models by 10.4%-15.6% in vulnerability detection",
    358       "evidence": "Figure 6 shows CodeLlama (134/154), CodeQwen (137/154), CodeGemma (129/154) vs LLaMA3 (113/154). Section 4.3.",
    359       "supported": "moderate"
    360     },
    361     {
    362       "claim": "SmartFuzz detects 97.2% of true vulnerabilities on real-world contracts",
    363       "evidence": "Table 3 shows 105/108 true positives across 6 vulnerability categories on D2. Section 4.4.",
    364       "supported": "moderate"
    365     }
    366   ],
    367   "red_flags": [
    368     {
    369       "flag": "No statistical tests",
    370       "detail": "All performance comparisons are based on raw counts with no statistical significance testing, despite the stochastic nature of LLM-based fuzzing."
    371     },
    372     {
    373       "flag": "No variance or multiple runs reported",
    374       "detail": "LLM-based generation is inherently stochastic, but results appear to be from single runs with no variance reporting. The 2-fold CV does not address run-to-run variability."
    375     },
    376     {
    377       "flag": "No limitations section",
    378       "detail": "The paper has no discussion of limitations, threats to validity, or scope boundaries."
    379     },
    380     {
    381       "flag": "Unfair compute comparison",
    382       "detail": "SmartFuzz uses multiple LLM agents with multiple reflection rounds (substantial GPU compute) while baselines use traditional algorithms. The 30-minute wall-clock comparison masks vastly different compute costs."
    383     },
    384     {
    385       "flag": "Missing code repository",
    386       "detail": "The paper appears to reference a code repository but the URL is missing/redacted, preventing verification of claims."
    387     },
    388     {
    389       "flag": "Selection bias in D1 dataset",
    390       "detail": "D1 filters to only 85 contracts 'that can be successfully analyzed by all the evaluated tools,' potentially excluding contracts where SmartFuzz might struggle."
    391     }
    392   ],
    393   "cited_papers": [
    394     {
    395       "title": "Why do multi-agent llm systems fail?",
    396       "authors": ["M. Cemri", "M. Z. Pan", "S. Yang"],
    397       "year": 2025,
    398       "arxiv_id": "2503.13657",
    399       "relevance": "Directly studies failure modes in multi-agent LLM systems, relevant to understanding agentic AI reliability."
    400     },
    401     {
    402       "title": "FuzzGPT: Large language models are edge-case generators: Crafting unusual programs for fuzzing deep learning libraries",
    403       "authors": ["Y. Deng", "C. S. Xia", "C. Yang"],
    404       "year": 2024,
    405       "relevance": "Uses LLMs for fuzzing deep learning libraries, directly relevant to LLM-driven testing and code generation."
    406     },
    407     {
    408       "title": "Fuzzing javascript interpreters with coverage-guided reinforcement learning for LLM-based mutation",
    409       "authors": ["J. Eom", "S. Jeong", "T. Kwon"],
    410       "year": 2024,
    411       "relevance": "Combines LLMs with reinforcement learning for fuzzing, relevant to LLM-augmented software testing."
    412     },
    413     {
    414       "title": "On the reliability of coverage-based fuzzer benchmarking",
    415       "authors": ["M. Böhme", "L. Szekeres", "J. Metzman"],
    416       "year": 2022,
    417       "relevance": "Meta-research on benchmarking methodology for fuzzers, relevant to evaluation rigor in software testing research."
    418     },
    419     {
    420       "title": "Mufuzz: Sequence-aware mutation and seed mask guidance for blockchain smart contract fuzzing",
    421       "authors": ["P. Qian", "H. Wu", "Z. Du"],
    422       "year": 2024,
    423       "relevance": "State-of-the-art smart contract fuzzer used as primary baseline, relevant to AI-augmented security testing."
    424     },
    425     {
    426       "title": "Smartian: Enhancing smart contract fuzzing with static and dynamic data-flow analyses",
    427       "authors": ["J. Choi", "D. Kim", "S. Kim"],
    428       "year": 2021,
    429       "relevance": "Smart contract fuzzing tool combining static and dynamic analysis, relevant to automated software testing."
    430     },
    431     {
    432       "title": "Effectively generating vulnerable transaction sequences in smart contracts with reinforcement learning-guided fuzzing",
    433       "authors": ["J. Su", "H. Dai", "L. Zhao"],
    434       "year": 2022,
    435       "relevance": "RL-guided fuzzing for smart contracts, directly relevant to AI-driven security testing."
    436     }
    437   ]
    438 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs