scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25243B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "LLM-BSCVM: An LLM-Based Blockchain Smart Contract Vulnerability Management Framework",
      6     "authors": [
      7       "Yanli Jin",
      8       "Chunpei Li",
      9       "Peng Fan",
     10       "Peng Liu",
     11       "Xianxian Li"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv.org",
     15     "arxiv_id": "2505.17416",
     16     "doi": "10.48550/arXiv.2505.17416"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The abstract's claims of >91% F1/accuracy and FPR reduction from 7.2% to 5.1% are supported by Tables I–III. The 'comparable to SOTA' framing is accurate.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about RAG and static analysis improving performance are supported by ablation study in Table IV removing each component independently.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper evaluates only on Ethereum/Solidity contracts from two specific datasets but makes broad claims about 'Web 3.0 ecosystem' security without bounding scope to these settings.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper does not discuss whether fine-tuning alone explains most of the improvement, or whether overlap between the RAG knowledge base and test contracts inflates results.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Vulnerability repair is evaluated by GPT-4 as a verifier—an LLM validating another LLM's output—without acknowledging this is a proxy rather than ground-truth security verification.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion mentions future work but does not formally enumerate limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No threats to validity are discussed, including potential data leakage between the RAG knowledge base and test contracts, or limited dataset diversity.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The paper does not explicitly state that results are limited to Ethereum Solidity contracts or to the specific vulnerability types present in the test datasets.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment section is present anywhere in the paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations are clearly disclosed: Guangxi Normal University, Zhongguancun Laboratory, and Beihang University.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funding is disclosed, so funder independence is not applicable.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests or financial interests declaration is present in the paper.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms such as 'vulnerability management,' 'RAG,' and 'multi-agent collaboration' are explained adequately in Sections II and III, though informally.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper explicitly lists three contributions: the LLM-BSCVM framework, the Decompose-Retrieve-Generate method, and experimental validation.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II covers traditional methods, deep learning, and LLM-based approaches, clearly positioning LLM-BSCVM as addressing gaps (lack of repair, limited explainability) in prior work.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Source code is released at https://github.com/sosol717/LLM-BSCVM, mentioned both in abstract and conclusion.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "The paper uses two existing published datasets (TrustLLM/Solodit and DappScan/IEEE TSE) that are publicly available or citable.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements file, Dockerfile, or formal environment specification is mentioned; only model names without versioning or dependency details.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "No step-by-step reproduction instructions are provided in the paper; code is released but without documented setup or run procedures in the paper itself.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results in Tables I–IV are single point estimates with no confidence intervals or error bars reported.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used for any comparative claims between LLM-BSCVM and baselines.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute metric differences are reported (e.g., FPR 5.1% vs 7.2%, F1 0.9104 vs 0.8918) providing meaningful effect size context relative to baseline.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "The experiment uses 263 TrustLLM audit reports but no justification or power analysis for this sample size is provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "No variance, standard deviation, or multi-run results are reported; all metrics are single-run point estimates.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Multiple baselines are included: Codellama 7B/13B, CodeBERT, CodeT5, Llama 8B in both zero-shot and LoRA fine-tuned variants, plus TrustLLM.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "TrustLLM (2024) and GPTScan (2024) are contemporary LLM-based baselines; CodeLlama and CodeBERT are standard code models.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Table IV presents ablation removing static analysis (W/o Static) and RAG (W/o RAG) components, quantifying each contribution.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Four metrics are reported throughout: F1-score, recall, precision, and accuracy.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "No human evaluation of system outputs is conducted; repair quality is assessed against a single ground-truth example or via GPT-4 as verifier.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": false,
    212           "justification": "The paper does not describe a train/test split for the LoRA fine-tuned CodeLlama; it is unclear whether test contracts were seen during fine-tuning.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "No breakdown by vulnerability type (reentrancy, overflow, access control, etc.) is provided; only aggregate metrics are reported.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "The paper reports that only 21% of repaired contracts passed validation and discusses LLM-BSCVM(E) underperforming due to context overload.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "The 21% repair success rate and LLM-BSCVM(E)'s degraded performance are both reported and briefly analyzed.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Model sizes are stated (Codellama 7B, 13B, Llama 8B) and GPT-4 is mentioned, but no snapshot versions or release dates are provided.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Figure 4 shows the complete Detector prompt template with role, task description, expected output, and knowledge context sections.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "LoRA fine-tuning is mentioned but no hyperparameters (rank, learning rate, epochs, batch size, temperature, top-p) are reported.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "Section III describes the six-agent pipeline, each agent's role, interaction flow, and the three-stage Decompose-Retrieve-Generate method in detail.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "TF-IDF and vector embedding for knowledge bases are mentioned with a cosine similarity formula, but chunking parameters, preprocessing steps, and train/test split procedures are undocumented.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "The paper uses third-party datasets (TrustLLM, DappScan); no raw experimental results or processed evaluation data are released by the authors.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section IV.A describes the two data sources: 263 TrustLLM audit reports from Solodit and 1,199 DappScan reports from 29 security teams, with citations.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; standard benchmark datasets are used.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "Knowledge base construction is described conceptually but lacks detail on filtering, train/test splitting, and data cleaning steps.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "CodeLlama and GPT-4 training cutoffs are not stated; test contracts sourced from Solodit (a public website) may have appeared in pre-training data.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether test contracts from TrustLLM or DappScan appeared in CodeLlama's pre-training corpus.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
     308           "justification": "The TrustLLM dataset is sourced from Solodit (a public website that likely predates CodeLlama's training cutoff), and no contamination analysis is conducted.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants in this study.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants in this study.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants in this study.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants in this study.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants in this study.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants in this study.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost, latency, or API cost information is reported despite using GPT-4 as a verifier component.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "No compute budget, GPU specifications, or training time information is provided for the LoRA fine-tuning or evaluation experiments.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "LLM-BSCVM achieves vulnerability detection accuracy and F1 score exceeding 91% on benchmark datasets",
    375       "evidence": "Table I shows LLM-BSCVM(W): F1=0.9104, accuracy=0.9111; Table III confirms parity with TrustLLM (F1=0.9121)",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "LLM-BSCVM reduces false positive rate from 7.2% (TrustLLM) to 5.1%",
    380       "evidence": "Reported in text and supported by Table III comparison, but no statistical significance test accompanies this 2.1pp difference on a single run",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "Ablation confirms both static analysis and RAG components contribute to detection performance",
    385       "evidence": "Table IV: W/o Static F1=0.8848, W/o RAG F1=0.8440, vs full LLM-BSCVM(W) F1=0.9104",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "LLM-BSCVM is the first end-to-end smart contract vulnerability management framework",
    390       "evidence": "Claimed in abstract and contributions list without a systematic literature search; the prior work surveyed covers only detection tools",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "Vulnerability repair succeeds for approximately 21% of contracts",
    395       "evidence": "Section IV.B: 'approximately 21% of the contracts successfully passed the validation' from independent LLM (GPT-4) verifier",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "LLM-BSCVM substantially outperforms zero-shot base models (e.g., ~49pp accuracy over Codellama 13B)",
    400       "evidence": "Table II: Codellama 13B zero-shot accuracy 0.4255 vs LLM-BSCVM 0.9111; consistent with known fine-tuning effects",
    401       "supported": "strong"
    402     }
    403   ],
    404   "methodology_tags": [
    405     "benchmark-eval",
    406     "case-study"
    407   ],
    408   "key_findings": "LLM-BSCVM achieves 91% F1 and accuracy on smart contract vulnerability detection by combining LoRA fine-tuned CodeLlama, static analysis pattern matching, and RAG over a vulnerability knowledge base. The framework matches TrustLLM on F1 while reducing false positive rate from 7.2% to 5.1%. However, automated vulnerability repair succeeds only ~21% of the time when validated by an independent LLM, revealing a large gap between detection and repair capability. Ablation confirms both static analysis and RAG contribute meaningfully, while excessive context injection (LLM-BSCVM(E)) degrades performance.",
    409   "red_flags": [
    410     {
    411       "flag": "No train/test split described",
    412       "detail": "The LoRA fine-tuned CodeLlama is evaluated on the TrustLLM dataset but the paper does not describe how training and test data were separated, raising serious risk of data leakage inflating reported metrics."
    413     },
    414     {
    415       "flag": "LLM evaluating LLM repair",
    416       "detail": "Vulnerability repair quality is assessed using GPT-4 as an independent verifier — a circular evaluation where one LLM judges another's output without any ground-truth human validation."
    417     },
    418     {
    419       "flag": "No variance across runs",
    420       "detail": "All experimental results are single point estimates; no standard deviation or confidence intervals reported for any metric."
    421     },
    422     {
    423       "flag": "Benchmark contamination unaddressed",
    424       "detail": "Test contracts are sourced from Solodit (a public website), which likely predates CodeLlama's training cutoff; no overlap analysis is conducted."
    425     },
    426     {
    427       "flag": "21% repair rate understated",
    428       "detail": "The low repair success rate (21%) is mentioned briefly but is absent from the abstract and conclusions, creating a misleading impression of end-to-end capability."
    429     },
    430     {
    431       "flag": "'First framework' claim unsupported",
    432       "detail": "The paper claims to be 'the first' end-to-end smart contract vulnerability management framework without a systematic search of prior work to substantiate this."
    433     }
    434   ],
    435   "cited_papers": [
    436     {
    437       "title": "GPTScan: Detecting logic vulnerabilities in smart contracts by combining GPT with program analysis",
    438       "relevance": "Key baseline combining LLMs with static analysis for smart contract vulnerability detection"
    439     },
    440     {
    441       "title": "Combining fine-tuning and LLM-based agents for intuitive smart contract auditing with justifications (TrustLLM)",
    442       "relevance": "Primary baseline and source of the evaluation dataset; directly compared in Tables II–III"
    443     },
    444     {
    445       "title": "Large language model-powered smart contract vulnerability detection: New perspectives (GPTLENS)",
    446       "relevance": "Two-stage adversarial LLM framework for smart contract vulnerability mining"
    447     },
    448     {
    449       "title": "DappScan: Building large-scale datasets for smart contract weaknesses in DApp projects",
    450       "relevance": "One of two datasets used for evaluation experiments (1,199 audit reports)"
    451     },
    452     {
    453       "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks",
    454       "relevance": "Foundational RAG paper that LLM-BSCVM's knowledge retrieval stage is built upon"
    455     },
    456     {
    457       "title": "Code Llama: Open foundation models for code",
    458       "relevance": "Primary model used throughout the pipeline, fine-tuned with LoRA for vulnerability detection"
    459     },
    460     {
    461       "title": "LLM4Vuln: A unified evaluation framework for decoupling and enhancing LLMs' vulnerability reasoning",
    462       "relevance": "Related LLM vulnerability evaluation framework and baseline comparison"
    463     },
    464     {
    465       "title": "AgentVerse: Facilitating multi-agent collaboration and exploring emergent behaviors in agents",
    466       "relevance": "Conceptual basis for the multi-agent collaboration architecture used in LLM-BSCVM"
    467     }
    468   ],
    469   "engagement_factors": {
    470     "practical_relevance": {
    471       "score": 2,
    472       "justification": "Smart contract security is directly applicable to Web3 developers and the code is open-sourced, but 21% repair success rate severely limits immediate practical utility."
    473     },
    474     "surprise_contrarian": {
    475       "score": 1,
    476       "justification": "The 21% repair success rate is a surprising failure given the high detection claims, but detection results are incremental improvements over existing work."
    477     },
    478     "fear_safety": {
    479       "score": 2,
    480       "justification": "The paper cites $20B+ in cumulative smart contract losses and addresses a real, ongoing threat to decentralized finance infrastructure."
    481     },
    482     "drama_conflict": {
    483       "score": 1,
    484       "justification": "No notable controversy; framework competes with TrustLLM but no public dispute or contested methodology."
    485     },
    486     "demo_ability": {
    487       "score": 2,
    488       "justification": "Code is open-sourced on GitHub, allowing practitioners to run the framework on their own Solidity contracts."
    489     },
    490     "brand_recognition": {
    491       "score": 0,
    492       "justification": "Authors are from Guangxi Normal University, Zhongguancun Laboratory, and Beihang University — no famous AI lab affiliation."
    493     }
    494   },
    495   "hn_data": {
    496     "threads": [
    497       {
    498         "hn_id": "23345379",
    499         "title": "GPT-3: Language Models Are Few-Shot Learners",
    500         "points": 431,
    501         "comments": 201,
    502         "url": "https://news.ycombinator.com/item?id=23345379"
    503       },
    504       {
    505         "hn_id": "43636596",
    506         "title": "ProtoGS: Efficient and High-Quality Rendering with 3D Gaussian Prototypes",
    507         "points": 22,
    508         "comments": 0,
    509         "url": "https://news.ycombinator.com/item?id=43636596"
    510       },
    511       {
    512         "hn_id": "41491707",
    513         "title": "Show HN: Turn Any ArXiv Paper into a 200-Page Prerequisite Reading Book",
    514         "points": 6,
    515         "comments": 3,
    516         "url": "https://news.ycombinator.com/item?id=41491707"
    517       },
    518       {
    519         "hn_id": "44510299",
    520         "title": "The Cost of an Image: The Energy Consumption of AI Image Generation",
    521         "points": 6,
    522         "comments": 0,
    523         "url": "https://news.ycombinator.com/item?id=44510299"
    524       },
    525       {
    526         "hn_id": "44091127",
    527         "title": "Interactive Post-Training for Vision-Language-Action Models",
    528         "points": 4,
    529         "comments": 0,
    530         "url": "https://news.ycombinator.com/item?id=44091127"
    531       },
    532       {
    533         "hn_id": "40541123",
    534         "title": "Compressed-Language Models for Understanding Compressed File Formats: JPEG",
    535         "points": 3,
    536         "comments": 0,
    537         "url": "https://news.ycombinator.com/item?id=40541123"
    538       },
    539       {
    540         "hn_id": "44430055",
    541         "title": "Survey on Evaluation of LLM-Based Agents",
    542         "points": 2,
    543         "comments": 0,
    544         "url": "https://news.ycombinator.com/item?id=44430055"
    545       },
    546       {
    547         "hn_id": "42952269",
    548         "title": "Fast Processing-Using-DRAM via Dynamic Precision Bit-Serial Arithmetic",
    549         "points": 2,
    550         "comments": 0,
    551         "url": "https://news.ycombinator.com/item?id=42952269"
    552       },
    553       {
    554         "hn_id": "42867278",
    555         "title": "Large Language Model Training Using FP4 Quantization",
    556         "points": 2,
    557         "comments": 0,
    558         "url": "https://news.ycombinator.com/item?id=42867278"
    559       },
    560       {
    561         "hn_id": "40622708",
    562         "title": "Language Models Are Few-Shot Learners",
    563         "points": 2,
    564         "comments": 0,
    565         "url": "https://news.ycombinator.com/item?id=40622708"
    566       }
    567     ],
    568     "top_points": 431,
    569     "total_points": 480,
    570     "total_comments": 204
    571   }
    572 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs