ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (18169B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "survey",
      4   "paper": {
      5     "title": "LLMs in Software Security: A Survey of Vulnerability Detection Techniques and Insights",
      6     "authors": [
      7       "Ze Sheng",
      8       "Zhicheng Chen",
      9       "Shuning Gu",
     10       "Heqing Huang",
     11       "Guofei Gu",
     12       "Jeff Huang"
     13     ],
     14     "year": 2025,
     15     "venue": "ACM Computing Surveys",
     16     "arxiv_id": "2502.07049",
     17     "doi": "10.1145/3769082"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Abstract claims about LLM capabilities, traditional method limitations, and the survey's contributions are substantiated across all sections through literature synthesis and tabulated evidence.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The paper makes causal-sounding claims like 'fine-tuning enhances detection' and 'CoT improves accuracy,' but these are inherited from primary studies without independent assessment of whether those studies' designs were adequate for causal inference.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The survey explicitly bounds its scope to LLM-based vulnerability detection (2019–2024) in C/C++, Java, and Solidity at function/file level, and acknowledges gaps in repository-level and multi-language coverage.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "Trends such as decoder-only model dominance and C/C++ research concentration are presented as conclusions without considering confounding factors or alternative explanations (e.g., dataset availability driving language focus).",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "The paper notes data leakage inflates benchmark metrics but does not systematically distinguish between controlled-dataset F1-scores and real-world vulnerability detection effectiveness across reviewed studies.",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 4 is a dedicated 'LIMITATIONS' section, though it is brief—one paragraph covering only two issues.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The limitations mention ~60% preprint coverage and terminology variation, but do not quantify coverage gaps, address selection bias, discuss publication bias, or explain how these threats affect the survey's conclusions.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 2.1 explicitly excludes traditional ML approaches (CNN/RNN), malware analysis, and network intrusion detection, and restricts inclusion to LLM-based vulnerability detection in specific programming languages.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "No acknowledgments section or funding disclosure is present anywhere in the paper.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All six authors list institutional affiliations (Texas A&M University, City University of Hong Kong) in the author block.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "No funding is disclosed, so independence of any funder cannot be assessed.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) appears in the paper.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 2.3 defines LLMs; Section 2.4 formally defines vulnerability detection as a binary classification problem with mathematical notation, and separately defines vulnerability classification and severity prediction.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The abstract explicitly lists three contributions: systematic analysis of LLM applications, a unified framework examining patterns across studies, and identification of key challenges and research directions.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 2.2 compares this survey against three closely related prior surveys (Yao et al., Xu et al., Zhou et al.) on specific dimensions including model recency, benchmark coverage, and detection-specific depth.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "survey": {
    121       "search_and_selection": {
    122         "search_strategy_reproducible": {
    123           "applies": true,
    124           "answer": false,
    125           "justification": "The search is described narratively (top-tier conferences, keyword extraction, iterative searches every 3 weeks) but lacks specific query strings, exact date ranges, and database-level documentation needed for independent reproduction.",
    126           "source": "haiku"
    127         },
    128         "inclusion_exclusion_explicit": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Section 2.1 explicitly states exclusion of traditional ML methods (CNN/RNN), papers unrelated to vulnerability detection, and restricts inclusion to LLM-based studies covering specific programming languages.",
    132           "source": "haiku"
    133         },
    134         "prisma_or_structured_protocol": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "No PRISMA flow diagram or mention of any structured systematic review protocol is present; the search process is described informally without stage-by-stage counts.",
    138           "source": "haiku"
    139         },
    140         "search_terms_provided": {
    141           "applies": true,
    142           "answer": true,
    143           "justification": "Section 2.1 explicitly lists search terms: 'vulnerability detection,' 'LLM,' 'large language model,' and 'AI.'",
    144           "source": "haiku"
    145         },
    146         "databases_listed": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Only specific venues/conferences are listed (S&P, USENIX Security, CCS, IEEE TSE) but not the actual electronic databases searched (e.g., ACM Digital Library, IEEE Xplore, arXiv, Semantic Scholar).",
    150           "source": "haiku"
    151         },
    152         "screening_process_documented": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "Only a single aggregate count is provided (~500–600 papers screened to 58 selected) with no stage-by-stage flow documenting how papers were excluded at title, abstract, or full-text screening stages.",
    156           "source": "haiku"
    157         },
    158         "review_scope_justified": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "The scope is justified by the emergence of LLMs as a distinct paradigm (post-2019) and the absence of prior surveys specifically addressing LLM-based vulnerability detection methodology in depth.",
    162           "source": "haiku"
    163         }
    164       },
    165       "synthesis_quality": {
    166         "conflicting_findings_acknowledged": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "The paper notes conflicting results such as CoT improving precision but having variable recall impact, and dramatically different F1-scores for the same model across datasets (e.g., CodeBERT: 0.099 on PrimeVul vs. 0.66 on Choi2017).",
    170           "source": "haiku"
    171         },
    172         "quality_assessment_of_sources": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No structured quality assessment, risk-of-bias evaluation, or quality rubric is applied to reviewed papers; preprints and peer-reviewed studies are synthesized without differentiation by methodological quality.",
    176           "source": "haiku"
    177         },
    178         "publication_bias_discussed": {
    179           "applies": true,
    180           "answer": false,
    181           "justification": "Publication bias is never mentioned; the survey does not acknowledge that published papers skew toward positive results or discuss the impact of unpublished negative findings.",
    182           "source": "haiku"
    183         },
    184         "quantitative_synthesis_present": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "The paper presents descriptive statistics (usage counts, percentage breakdowns, tabulated F1-scores) but performs no meta-analysis, vote counting with confidence intervals, or effect size aggregation across studies.",
    188           "source": "haiku"
    189         },
    190         "recommendations_supported_by_evidence": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "Research direction recommendations (repository-level datasets, cross-file detection, robustness improvements) are directly tied to documented gaps and challenges identified across the reviewed literature.",
    194           "source": "haiku"
    195         }
    196       }
    197     }
    198   },
    199   "claims": [
    200     {
    201       "claim": "GPT-4 is the most commonly used LLM in vulnerability detection research, appearing in 29 of 58 reviewed studies.",
    202       "evidence": "Table 2 and RQ1 analysis enumerating usage frequency of 33 distinct LLMs across 58 studies.",
    203       "supported": "strong"
    204     },
    205     {
    206       "claim": "Decoder-only models account for 67.1% of fine-tuning experiments, marking a shift from encoder-only architectures.",
    207       "evidence": "Architecture categorization across 58 reviewed studies: 24.2% encoder-only, 8.7% encoder-decoder, 67.1% decoder-only in fine-tuning.",
    208       "supported": "strong"
    209     },
    210     {
    211       "claim": "C/C++ dominates vulnerability detection research at 50% of studies, followed by Java at 21.1%.",
    212       "evidence": "Figure 5 analysis of target programming languages across 56 selected papers.",
    213       "supported": "strong"
    214     },
    215     {
    216       "claim": "41.3% of studies employ code processing techniques (AST, RAG, program slicing) to address LLMs' limited context windows.",
    217       "evidence": "Finding III based on categorizing preprocessing technique usage across 58 reviewed studies.",
    218       "supported": "moderate"
    219     },
    220     {
    221       "claim": "Fine-tuning large models (>10B parameters) with PEFT achieves F1-scores near 0.9 for vulnerability detection.",
    222       "evidence": "Table 5 shows Alam et al. (0.99), Guo et al. (0.97), Luo et al. (0.90), Ma et al. (0.91) across various datasets.",
    223       "supported": "weak"
    224     },
    225     {
    226       "claim": "Data leakage is pervasive in vulnerability detection datasets because LLMs train on GitHub sources that overlap with test sets.",
    227       "evidence": "Challenge 4 section citing Wu et al. [121] and multiple papers documenting mislabeling and leakage from LLM training corpora.",
    228       "supported": "moderate"
    229     }
    230   ],
    231   "methodology_tags": [
    232     "survey",
    233     "qualitative"
    234   ],
    235   "key_findings": "LLM-based vulnerability detection has rapidly shifted toward large decoder-only architectures (67.1% of fine-tuning studies), with GPT-4 as the dominant model and C/C++ as the primary target language (50% of studies). Fine-tuning with PEFT methods achieves near 0.9 F1-scores on controlled benchmarks, but these results are undermined by pervasive data leakage and label quality problems in existing datasets. Four major challenges are identified: narrow research scope (83% of studies analyze isolated functions rather than real-world codebases), semantic complexity of cross-file vulnerabilities, intrinsic LLM limitations (inconsistent explanations, low robustness to perturbations), and lack of high-quality repository-level datasets. Research directions proposed include repository-level analysis, vulnerability reproduction pipelines, and vulnerability-specific fine-tuning.",
    236   "red_flags": [
    237     {
    238       "flag": "No PRISMA protocol",
    239       "detail": "The survey uses informal keyword searches without a structured systematic review protocol, PRISMA flow diagram, or stage-by-stage screening counts, undermining reproducibility of the selection process."
    240     },
    241     {
    242       "flag": "No quality assessment of sources",
    243       "detail": "Reviewed papers are synthesized without any quality rating or risk-of-bias assessment, meaning preprints that have not been peer reviewed (acknowledged as ~60% of the corpus) are treated on equal footing with peer-reviewed studies regardless of methodological quality."
    244     },
    245     {
    246       "flag": "Publication bias unaddressed",
    247       "detail": "The survey never acknowledges that published papers skew positive, which is especially problematic when aggregating F1-scores that may reflect best-case dataset conditions."
    248     },
    249     {
    250       "flag": "No funding disclosure",
    251       "detail": "No acknowledgments or funding section is present, making it impossible to assess potential conflicts of interest."
    252     },
    253     {
    254       "flag": "Databases not listed",
    255       "detail": "The search description names venue-level sources (conferences, one journal) but not the actual electronic databases searched, leaving the search strategy incompletely reproducible."
    256     },
    257     {
    258       "flag": "Thin limitations section",
    259       "detail": "Section 4 is a single short paragraph mentioning only preprint prevalence and terminology variation, omitting fundamental threats such as selection bias, publication bias, and the impact of evaluating primarily benchmark rather than real-world performance."
    260     }
    261   ],
    262   "cited_papers": [
    263     {
    264       "title": "Vulnerability Detection with Code Language Models: How Far Are We?",
    265       "relevance": "Key empirical paper showing low F1-scores (0.21) even with fine-tuning on PrimeVul; central evidence for the gap between benchmark and real-world performance."
    266     },
    267     {
    268       "title": "DiverseVul: A New Vulnerable Source Code Dataset for Deep Learning Based Vulnerability Detection",
    269       "relevance": "Major dataset providing diverse C/C++ vulnerability samples; cited extensively in the dataset and fine-tuning sections."
    270     },
    271     {
    272       "title": "CVEfixes: automated collection of vulnerabilities and their fixes from open-source software",
    273       "relevance": "Widely used commit-level vulnerability dataset referenced throughout the benchmark and fine-tuning discussion."
    274     },
    275     {
    276       "title": "LLM4Vuln: A unified evaluation framework for decoupling and enhancing LLMs' vulnerability reasoning",
    277       "relevance": "Framework for evaluating LLMs in vulnerability detection; cited for RAG-based knowledge base and CoT analysis."
    278     },
    279     {
    280       "title": "How far have we gone in vulnerability detection using large language models",
    281       "relevance": "Directly relevant benchmarking study on LLM vulnerability detection capabilities; core reference for RQ3 findings."
    282     },
    283     {
    284       "title": "Large Language Model for Vulnerability Detection and Repair: Literature Review and Roadmap",
    285       "relevance": "Closely related prior survey; Section 2.2 explicitly differentiates this survey from it on three specific dimensions."
    286     },
    287     {
    288       "title": "LLM-Assisted Static Analysis for Detecting Security Vulnerabilities",
    289       "relevance": "Demonstrates LLM integration with static analysis for repository-level detection; source of CWE-Bench-Java dataset."
    290     },
    291     {
    292       "title": "How Effective Are Neural Networks for Fixing Security Vulnerabilities",
    293       "relevance": "Cited for evidence of data leakage problems in vulnerability datasets, supporting Challenge 4."
    294     }
    295   ],
    296   "engagement_factors": {
    297     "practical_relevance": {
    298       "score": 3,
    299       "justification": "Directly actionable for security practitioners choosing LLMs and techniques for vulnerability detection pipelines, with comprehensive dataset and benchmark tables."
    300     },
    301     "surprise_contrarian": {
    302       "score": 1,
    303       "justification": "Findings largely confirm expected trends; the data leakage problem is notable but not surprising to the security ML community."
    304     },
    305     "fear_safety": {
    306       "score": 2,
    307       "justification": "Highlights that 83% of research operates on isolated code snippets far from real-world scenarios, and that benchmark F1-scores are inflated by data leakage—raising concern about false security from deployed LLM tools."
    308     },
    309     "drama_conflict": {
    310       "score": 1,
    311       "justification": "No significant controversy; the survey takes a neutral synthesis stance without challenging prominent community positions."
    312     },
    313     "demo_ability": {
    314       "score": 2,
    315       "justification": "Links to a maintained GitHub repository of findings, and many reviewed tools (GPT-4, CodeBERT, CodeLlama) are publicly accessible for practitioners to try."
    316     },
    317     "brand_recognition": {
    318       "score": 2,
    319       "justification": "Published in ACM Computing Surveys (high-prestige venue) by Texas A&M authors; features prominent models such as GPT-4 and Claude 3.5, as well as DARPA's AIxCC competition."
    320     }
    321   },
    322   "hn_data": {
    323     "threads": [
    324       {
    325         "hn_id": "43042753",
    326         "title": "LM2: Large Memory Models",
    327         "points": 110,
    328         "comments": 30,
    329         "url": "https://news.ycombinator.com/item?id=43042753",
    330         "created_at": "2025-02-13T23:21:21Z"
    331       }
    332     ],
    333     "top_points": 110,
    334     "total_points": 110,
    335     "total_comments": 30
    336   }
    337 }

Impressum · Datenschutz