scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24197B)
      1 {
      2   "paper": {
      3     "title": "LLMs in Software Security: A Survey of Vulnerability Detection Techniques and Insights",
      4     "authors": [
      5       "Ze Sheng",
      6       "Zhicheng Chen",
      7       "Shuning Gu",
      8       "Heqing Huang",
      9       "Guofei Gu",
     10       "Jeff Huang"
     11     ],
     12     "year": 2025,
     13     "venue": "ACM Computing Surveys",
     14     "arxiv_id": "2502.07049",
     15     "doi": "10.1145/3769082"
     16   },
     17   "scan_version": 2,
     18   "active_modules": ["survey_methodology"],
     19   "methodology_tags": ["meta-analysis"],
     20   "key_findings": "This survey of 58 papers on LLM-based vulnerability detection finds that decoder-only models (especially GPT-4 and GPT-3.5) account for 67.1% of model usage, C/C++ dominates as the target language at 50% of studies, and 41.3% of studies use code preprocessing techniques (AST, RAG, program slicing). Key gaps identified include narrow focus on function-level binary classification, lack of repository-level datasets, dataset quality/leakage issues, and limited cross-language and cross-file detection capabilities. Fine-tuned large models can achieve near 0.9 F1-scores, but complex real-world vulnerability detection remains challenging.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "The paper provides a GitHub repository URL: https://github.com/OwenSanzas/LLM-For-Vulnerability-Detection, stated in the abstract as maintaining 'the latest findings.'"
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "While the GitHub repo is mentioned, the paper does not describe releasing a structured dataset of its survey extraction results (e.g., the coded features of all 58 papers, screening decisions, or the full list of ~500 screened papers)."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No environment or dependency specifications are provided. As a survey, analysis scripts could have been released with environment details, but none are mentioned."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step instructions for reproducing the survey analysis are provided. The paper describes the search process at a high level but not in enough detail for the search to be fully replicated."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": false,
     47         "answer": false,
     48         "justification": "Survey paper with no experiments; only descriptive statistics (counts, percentages) are reported."
     49       },
     50       "significance_tests": {
     51         "applies": false,
     52         "answer": false,
     53         "justification": "Survey paper with no comparative experiments requiring significance tests."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": false,
     57         "answer": false,
     58         "justification": "Survey paper with no experiments; reports descriptive summaries of others' results."
     59       },
     60       "sample_size_justified": {
     61         "applies": false,
     62         "answer": false,
     63         "justification": "Survey paper with no experiments or power analysis context."
     64       },
     65       "variance_reported": {
     66         "applies": false,
     67         "answer": false,
     68         "justification": "Survey paper with no experimental runs to report variance over."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Section 2.2 compares this survey against prior related reviews: Yao et al. [129], Xu et al. [124], Zhou et al. [140], and several others, explicitly noting how this survey differs."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "The compared prior surveys (Zhou et al. 2024, Xu et al. 2024, Yao et al. 2024) are contemporary, published in 2024."
     81       },
     82       "ablation_study": {
     83         "applies": false,
     84         "answer": false,
     85         "justification": "Survey paper with no system or components to ablate."
     86       },
     87       "multiple_metrics": {
     88         "applies": false,
     89         "answer": false,
     90         "justification": "Survey paper; no evaluation metrics applied to the authors' own analysis."
     91       },
     92       "human_evaluation": {
     93         "applies": false,
     94         "answer": false,
     95         "justification": "Human evaluation is not relevant to a literature survey's claims."
     96       },
     97       "held_out_test_set": {
     98         "applies": false,
     99         "answer": false,
    100         "justification": "Survey paper with no experimental evaluation requiring train/test splits."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper provides extensive breakdowns: by programming language (Figure 5, Finding I), by model architecture (Tables 1-2, RQ1), by technique type (RQ3), by dataset scope (Table 4), and by challenge category (RQ4)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 3.5 (RQ4) extensively discusses challenges and failure modes: limited repository-level coverage, complexity of vulnerability semantics, intrinsic LLM limitations (inconsistent explanations, lack of robustness), and dataset quality issues."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports several negative findings: Ding et al. achieving only 0.21 F1-score, Guo et al. achieving 0.099 F1 on PrimeVul, LLMs lacking robustness against data perturbations (Yin et al.), and inconsistent vulnerability explanations (Haurogne et al., Du et al.)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims (1) systematic analysis, (2) unified framework, and (3) identification of challenges are all substantiated by the paper's four research questions and corresponding sections."
    123       },
    124       "causal_claims_justified": {
    125         "applies": false,
    126         "answer": false,
    127         "justification": "The paper is a survey that synthesizes findings from other papers. It does not make independent causal claims from its own analysis; it reports what other papers found."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "Section 2.1 clearly bounds the scope: 'This survey focuses exclusively on the application of LLMs in vulnerability detection' with explicit exclusion criteria (no traditional ML, no malware/intrusion detection, 2019-2024 timeframe)."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": false,
    136         "answer": false,
    137         "justification": "Pure survey/taxonomy paper; the paper synthesizes existing literature rather than presenting empirical results that require alternative explanations."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Survey paper with no measurements of its own; it reports proxies and outcomes from other papers."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": false,
    148         "answer": false,
    149         "justification": "Survey paper; the authors do not use any models themselves."
    150       },
    151       "prompts_provided": {
    152         "applies": false,
    153         "answer": false,
    154         "justification": "Survey paper; no prompting is used by the authors."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": false,
    158         "answer": false,
    159         "justification": "Survey paper with no experiments requiring hyperparameter reporting."
    160       },
    161       "scaffolding_described": {
    162         "applies": false,
    163         "answer": false,
    164         "justification": "Survey paper; no agentic scaffolding is used."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "Section 2.1 describes a search starting from top-tier conferences, keyword extraction, and iterative refinement from ~500-600 papers to 58. However, intermediate filtering stages with counts at each step are not provided, and the specific criteria for advancing papers between stages are vaguely described ('ensuring recency and relevance')."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 4 is a dedicated 'LIMITATIONS' section discussing factors that may affect the survey's comprehensiveness."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The limitations section identifies specific threats: ~60% of research appears as preprints on arXiv (not peer-reviewed), and terminology variations ('LLM', 'vulnerability detection') may cause search oversights. These are specific to this study's methodology."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 2.1 explicitly states what is excluded: 'traditional machine learning approaches (conventional CNN, RNN and LSTM),' 'malware analysis or network intrusion detection,' and papers outside the 2019-2024 window."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "While a GitHub repo is provided for 'latest findings,' the paper does not describe releasing the raw survey data: the full list of ~500 screened papers, screening decisions, or the complete coded extraction data for each paper."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 2.1 describes the data collection: starting from top-tier security conferences (S&P, USENIX Security, CCS), extracting key terms, conducting iterative searches every three weeks over two months, screening ~500-600 papers to select 58."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Section 2.1 describes how papers were found: starting from top-tier security conferences and journals, keyword-based iterative search, with explicit inclusion/exclusion criteria (LLMs in vulnerability detection, 2019-2024, excluding traditional ML methods)."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "The paper describes going from ~500-600 papers to 58 'highly relevant studies' but does not document intermediate filtering stages with counts at each step. The jump from ~500-600 to 58 is not explained with per-stage filtering decisions."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding or acknowledgments section is present in the paper. There is no disclosure of funding sources."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Texas A&M University (5 authors) and City University of Hong Kong (1 author), with full email addresses."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No funding information is disclosed, so independence of funders cannot be assessed."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement appears in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "Survey paper; the authors do not evaluate any pre-trained model on a benchmark."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "Survey paper; no model evaluation is performed."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Survey paper; no model evaluation is performed. The paper does discuss contamination as a challenge facing the field (Challenge 4, dataset leakage), but this is about surveyed papers, not the survey's own methodology."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this survey paper."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this survey paper."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this survey paper."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this survey paper."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this survey paper."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this survey paper."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this survey paper."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": false,
    290         "answer": false,
    291         "justification": "Survey paper; no method or system of its own to report costs for."
    292       },
    293       "compute_budget_stated": {
    294         "applies": false,
    295         "answer": false,
    296         "justification": "Survey paper; no computational experiments performed."
    297       }
    298     },
    299     "survey_methodology": {
    300       "prisma_or_structured_protocol": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No PRISMA flow diagram, no registered protocol, and no reproducible search queries (exact database queries, date ranges per database). The search is described narratively in Section 2.1 with keywords and venue names, but falls short of a structured protocol. Figure 2 shows the survey structure but is not a PRISMA diagram."
    304       },
    305       "quality_assessment_of_sources": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The survey does not assess the methodological quality of the 58 included papers. All papers are treated equally regardless of venue, study design rigor, or potential biases. Performance numbers from fine-tuning papers (Table 5) are reported without noting differences in evaluation rigor across studies."
    309       },
    310       "publication_bias_discussed": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The limitations section notes that ~60% of included research appears as preprints on arXiv, but this is framed as a comprehensiveness issue rather than a discussion of publication bias (i.e., whether the literature skews toward positive results, whether negative findings are underrepresented)."
    314       }
    315     }
    316   },
    317   "claims": [
    318     {
    319       "claim": "Decoder-only models account for 67.1% of total LLM usage in vulnerability detection, with GPT-4 being the most frequently used model (29 instances across 58 studies).",
    320       "evidence": "Section 3.2 (RQ1) reports statistics from analyzing 58 studies: 33 distinct LLMs identified, GPT-4 in 29 instances, GPT-3.5 in 25, encoder-only at 24.2%, encoder-decoder at 8.7%, decoder-only at 67.1%.",
    321       "supported": "moderate"
    322     },
    323     {
    324       "claim": "C/C++ dominates as the target language in LLM-based vulnerability detection research at 50% of studies, followed by Java at 21.1% and Solidity at 11.8%.",
    325       "evidence": "Finding I, Section 3.3, Figure 5 present the distribution across 56 selected papers analyzing target programming languages.",
    326       "supported": "moderate"
    327     },
    328     {
    329       "claim": "41.3% of studies employ code processing techniques (graph representations, RAG, code slicing) to enhance LLM context utilization.",
    330       "evidence": "Finding III in Section 3.4.1 states this percentage, though the calculation method is not detailed.",
    331       "supported": "moderate"
    332     },
    333     {
    334       "claim": "Fine-tuned large models (>10B parameters with PEFT) achieve F1 scores near 0.9 for vulnerability detection.",
    335       "evidence": "Finding V and Table 5 report various F1 scores: Alam et al. GPT-4o-mini at 0.99, Guo et al. CodeLlama-7B at 0.97, Ma et al. CodeLlama-13B at 0.91. However, these are reported from individual papers with heterogeneous datasets and evaluation setups.",
    336       "supported": "weak"
    337     },
    338     {
    339       "claim": "Approximately 83% of current studies focus on isolated code snippets rather than repository-level analysis.",
    340       "evidence": "Challenge 1 in Section 3.5 states 'approximately 40 studies (83%) concentrate on the analysis of isolated code snippets.'",
    341       "supported": "moderate"
    342     },
    343     {
    344       "claim": "Current datasets face critical limitations: C/C++ language imbalance (~60% coverage) and significant shortage of repository-level datasets.",
    345       "evidence": "Finding II in Section 3.3 and Table 4 showing dataset distribution. Table 4 lists 42 datasets, with the vast majority at function or file level, and only 2-3 repository-level datasets.",
    346       "supported": "strong"
    347     },
    348     {
    349       "claim": "Chain-of-Thought prompting is the dominant approach for large models (>10B parameters), with 100% of recent studies adopting CoT.",
    350       "evidence": "Finding IV in Section 3.4.2 makes this claim, but the denominator and time frame for '100% of recent studies' are not specified.",
    351       "supported": "weak"
    352     }
    353   ],
    354   "red_flags": [
    355     {
    356       "flag": "No quality assessment of surveyed papers",
    357       "detail": "The survey treats all 58 included papers equally without assessing their methodological quality. F1 scores from different papers using different datasets and evaluation protocols are presented side-by-side (Table 5) without noting that these numbers are not directly comparable. This risks laundering weak results alongside strong ones."
    358     },
    359     {
    360       "flag": "Vague and unreproducible selection process",
    361       "detail": "The paper selection goes from ~500-600 to 58 papers without documenting intermediate screening stages, exact search queries, or specific databases searched. The process is described as 'iterative searches every three weeks' with 'key terms' but exact queries and per-database results are not provided."
    362     },
    363     {
    364       "flag": "Uncritical aggregation of incomparable results",
    365       "detail": "Performance numbers from different studies using different datasets, different versions of models, different evaluation splits, and different preprocessing are aggregated into findings (e.g., Finding V claims near-0.9 F1). Alam et al.'s 0.99 F1 on their own VulSmart dataset is presented alongside Ding et al.'s 0.21 F1 on PrimeVul without discussing that dataset difficulty and evaluation rigor drive these differences."
    366     },
    367     {
    368       "flag": "Imprecise quantitative claims",
    369       "detail": "Several findings use precise-sounding percentages (41.3%, 67.1%, 24.2%) derived from a small sample of 58 papers, but the denominator and calculation methodology are often unstated. Finding IV claims '100% of recent studies adopting CoT' without defining 'recent' or the sample size."
    370     }
    371   ],
    372   "cited_papers": [
    373     {
    374       "title": "Vulnerability Detection with Code Language Models: How Far Are We?",
    375       "authors": ["Yangruibo Ding", "Yanjun Fu", "Omniyyah Ibrahim"],
    376       "year": 2025,
    377       "relevance": "ICSE paper systematically evaluating code LLMs for vulnerability detection, finding low F1-scores even with fine-tuning on PrimeVul."
    378     },
    379     {
    380       "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?): A Comprehensive Evaluation, Framework, and Benchmarks",
    381       "authors": ["Saad Ullah", "Mingji Han", "Saurabh Pujar"],
    382       "year": 2024,
    383       "relevance": "IEEE S&P paper providing a comprehensive evaluation framework demonstrating LLM limitations in security vulnerability identification."
    384     },
    385     {
    386       "title": "Evaluating large language models trained on code",
    387       "authors": ["Mark Chen", "Jerry Tworek"],
    388       "year": 2021,
    389       "arxiv_id": "2107.03374",
    390       "relevance": "Seminal Codex paper evaluating LLMs trained on code, establishing foundational evaluation methodology for code generation."
    391     },
    392     {
    393       "title": "Large Language Models for Software Engineering: A Systematic Literature Review",
    394       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"],
    395       "year": 2024,
    396       "relevance": "Comprehensive systematic review of LLMs in software engineering, covering a broader scope than this vulnerability-focused survey."
    397     },
    398     {
    399       "title": "Large Language Model for Vulnerability Detection and Repair: Literature Review and Roadmap",
    400       "authors": ["Xin Zhou", "Sicong Cao", "Xiaobing Sun", "David Lo"],
    401       "year": 2024,
    402       "arxiv_id": "2404.02525",
    403       "relevance": "Closely related survey on LLMs for vulnerability detection and repair; this paper positions itself as an update and extension."
    404     },
    405     {
    406       "title": "LLM agents can autonomously exploit one-day vulnerabilities",
    407       "authors": ["Richard Fang", "Rohan Bindu", "Akul Gupta", "Daniel Kang"],
    408       "year": 2024,
    409       "arxiv_id": "2404.08144",
    410       "relevance": "Demonstrates LLM agents' capability to autonomously exploit real-world vulnerabilities, relevant to agentic AI safety assessment."
    411     },
    412     {
    413       "title": "A survey on large language model (LLM) security and privacy: The Good, The Bad, and The Ugly",
    414       "authors": ["Yifan Yao", "Jinhao Duan", "Kaidi Xu"],
    415       "year": 2024,
    416       "relevance": "Broad survey of LLM security and privacy impacts, providing context for this survey's narrower vulnerability detection focus."
    417     },
    418     {
    419       "title": "Large Language Models for Code: Security Hardening and Adversarial Testing",
    420       "authors": ["Jingxuan He", "Martin Vechev"],
    421       "year": 2023,
    422       "relevance": "CCS paper on LLM code security hardening and adversarial testing, directly related to LLM safety in code generation."
    423     },
    424     {
    425       "title": "LLM4Vuln: A unified evaluation framework for decoupling and enhancing LLMs' vulnerability reasoning",
    426       "authors": ["Yuqiang Sun", "Daoyuan Wu", "Yue Xue"],
    427       "year": 2024,
    428       "arxiv_id": "2401.16185",
    429       "relevance": "Proposes a unified evaluation framework for LLM vulnerability reasoning, including RAG-based enhancement approaches."
    430     },
    431     {
    432       "title": "How far have we gone in vulnerability detection using large language models",
    433       "authors": ["Zeyu Gao", "Hao Wang", "Yuchen Zhou"],
    434       "year": 2023,
    435       "arxiv_id": "2311.12420",
    436       "relevance": "Evaluates LLM capability boundaries in vulnerability detection, finding significant performance gaps in complex scenarios."
    437     },
    438     {
    439       "title": "Multitask-Based Evaluation of Open-Source LLM on Software Vulnerability",
    440       "authors": ["Xin Yin", "Chao Ni", "Shaohua Wang"],
    441       "year": 2024,
    442       "relevance": "IEEE TSE paper evaluating open-source LLMs on multiple vulnerability-related tasks, finding lack of robustness."
    443     },
    444     {
    445       "title": "Scaling laws for neural language models",
    446       "authors": ["Jared Kaplan", "Sam McCandlish", "Tom Henighan"],
    447       "year": 2020,
    448       "arxiv_id": "2001.08361",
    449       "relevance": "Foundational scaling laws paper cited to support claims about larger models improving vulnerability detection capabilities."
    450     }
    451   ]
    452 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs