scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21816B)
      1 {
      2   "paper": {
      3     "title": "From Vulnerabilities to Remediation: A Systematic Literature Review of LLMs in Code Security",
      4     "authors": ["Enna Basic", "Alberto Giaretta"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2412.15004",
      8     "doi": "10.1145/nnnnnnn.nnnnnnn"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["survey_methodology"],
     12   "methodology_tags": ["meta-analysis"],
     13   "key_findings": "This SLR identifies 10 categories of vulnerabilities in LLM-generated code, with injection vulnerabilities most prevalent (16/20 studies). LLMs generally outperform traditional static analysis tools in vulnerability detection rates but suffer from high false positive rates (up to 97%). Chain-of-thought prompting consistently outperforms zero-shot and few-shot approaches for vulnerability detection and fixing. Poisoning less than 3% of training data can cause models to generate insecure code, and no research yet addresses poisoning's impact on vulnerability detection/fixing tasks.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No repository URL or code archive is provided. The paper is a survey but could have released analysis scripts or extracted data."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset of extracted findings, classification spreadsheets, or supplementary data is released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No environment or tooling details are provided. A survey could specify tools used for search, screening, or analysis."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No reproduction instructions are provided for replicating the literature search or analysis."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": false,
     40         "answer": false,
     41         "justification": "This is a survey paper that does not run its own experiments or perform statistical aggregation."
     42       },
     43       "significance_tests": {
     44         "applies": false,
     45         "answer": false,
     46         "justification": "Survey paper with no original experiments or meta-analytic statistical tests."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": false,
     50         "answer": false,
     51         "justification": "Survey paper; no original effect sizes computed."
     52       },
     53       "sample_size_justified": {
     54         "applies": false,
     55         "answer": false,
     56         "justification": "Survey paper; no experimental sample size to justify."
     57       },
     58       "variance_reported": {
     59         "applies": false,
     60         "answer": false,
     61         "justification": "Survey paper with no original experiments."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 2 (Related Work) explicitly compares this survey against prior surveys (Negri et al., Yao et al., Zhou et al., Chen et al.) and identifies what each covers and what this survey adds."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The compared prior surveys are from 2024, contemporary with this work."
     74       },
     75       "ablation_study": {
     76         "applies": false,
     77         "answer": false,
     78         "justification": "Survey paper with no system components to ablate."
     79       },
     80       "multiple_metrics": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Survey paper; no experimental metrics."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Survey paper; no system outputs to evaluate."
     89       },
     90       "held_out_test_set": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "Survey paper; no test set."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Tables 2-7 and Figure 2 provide per-category breakdowns of vulnerability types, detection challenges, and prompting techniques across the reviewed studies."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 5.2 discusses challenges and limitations of LLMs for vulnerability detection (false positives, context awareness, complex scenarios). Section 10.2 discusses open challenges."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports negative findings: LLMs produce high false positive rates (up to 97%), struggle with complex real-world scenarios, and fine-tuned models are limited to trained-for tasks. Table 6 lists studies where traditional approaches outperformed LLMs."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims about vulnerability introduction, detection/fixing capabilities, prompting impact, and data poisoning are all supported by the detailed sections (4-9) of the paper."
    116       },
    117       "causal_claims_justified": {
    118         "applies": false,
    119         "answer": false,
    120         "justification": "The paper is a survey that reports findings from other studies. It does not make independent causal claims — it summarizes causal findings from reviewed papers."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The title claims broad coverage of 'LLMs in Code Security', and Section 10.3 acknowledges that the rapid evolution of LLMs threatens validity. However, the paper does not explicitly bound which LLM generations, time periods, or programming languages its conclusions apply to."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for the patterns observed across studies (e.g., whether false positive rates could be an artifact of benchmark design, or whether vulnerability introduction rates depend on prompt quality rather than model capability)."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper aggregates results from studies using different metrics (F1, accuracy, precision, recall, detection rates) without discussing whether these proxy metrics adequately measure 'security' or 'vulnerability detection capability' as framed."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": false,
    141         "answer": false,
    142         "justification": "Survey paper that does not use LLMs in its own methodology."
    143       },
    144       "prompts_provided": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "Survey paper; no prompting used."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": false,
    151         "answer": false,
    152         "justification": "Survey paper; no model inference performed."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding used."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3 describes the methodology, including search strings (Section 3.2), inclusion/exclusion criteria (Section 3.3, Table 1), and databases searched (IEEE Xplore, ACM DL, ScienceDirect, SpringerLink, arXiv). However, the paper does not provide a PRISMA-style count of papers at each filtering stage."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 10.3 'Threats to Validity' discusses threats to validity of the review."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "Section 10.3 is very brief (a few sentences) and only mentions generic threats: 'rapid evolution of LLMs presents challenges' and 'varying prompts across studies.' No threats specific to this particular review's search strategy, inclusion decisions, or synthesis approach are discussed."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Section 1 and Section 3.1 clearly state the three research questions that bound the scope. The inclusion/exclusion criteria in Table 1 explicitly define what is in and out of scope (e.g., EX3 excludes secondary studies, EX4 excludes non-security LLM work)."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No supplementary data file listing all reviewed papers with extraction fields is provided."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.2 provides the exact search strings used across five databases (IEEE Xplore, ACM DL, ScienceDirect, SpringerLink, arXiv) with specific keyword sets."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Paper sources are standard databases described in the methodology."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "While the search strategy and criteria are documented, there is no PRISMA-style flow showing how many papers were found at each stage, how many were screened, and how many were included/excluded at each step. The pipeline from search to final paper set is not fully documented with counts."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper. One author is affiliated with Epiroc Rock Drills (industry), but no funding disclosure is made."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: Enna Basic at Örebro University and Epiroc Rock Drills, Alberto Giaretta at Örebro University."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding information disclosed, so independence cannot be assessed. One author has an industry affiliation (Epiroc Rock Drills) with no discussion of potential conflicts."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Survey paper that does not evaluate any pre-trained model on benchmarks."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "Survey paper; no model evaluation performed."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Survey paper; no benchmark evaluation performed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this survey."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "Survey paper; no inference performed."
    285       },
    286       "compute_budget_stated": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "Survey paper; no compute used."
    290       }
    291     },
    292     "survey_methodology": {
    293       "prisma_or_structured_protocol": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "The paper follows Petersen et al. (2015) guidelines for SLRs. Section 3 describes a structured methodology with Planning, Conducting, and Reporting phases (Figure 1). Search strings, databases, and criteria are specified. However, no PRISMA flow diagram with paper counts at each stage is provided."
    297       },
    298       "quality_assessment_of_sources": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The survey does not assess the methodological quality of included papers. All studies are treated equally regardless of their rigor, sample sizes, or evaluation quality. No quality scoring rubric or risk-of-bias assessment is applied."
    302       },
    303       "publication_bias_discussed": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No discussion of publication bias. The paper includes arXiv preprints (IN2) but does not discuss whether the included studies skew toward positive results or whether negative-result papers are underrepresented."
    307       }
    308     }
    309   },
    310   "claims": [
    311     {
    312       "claim": "Injection vulnerabilities are the most commonly identified category in LLM-generated code, appearing in 16 out of 20 studies",
    313       "evidence": "Table 2 and Figure 2 in Section 4 show the distribution across 20 studies, with injection at 16/20.",
    314       "supported": "strong"
    315     },
    316     {
    317       "claim": "LLMs generally outperform traditional static analysis tools in vulnerability detection rates but exhibit higher false positive rates",
    318       "evidence": "Tables 5 and 6 in Section 5.3 compare LLMs vs traditional approaches. 11 studies in Table 5 show LLMs detecting more vulnerabilities; 4 in Table 6 show traditional tools performing better. False positive rates discussed in Section 5.2 (up to 97% for some models).",
    319       "supported": "moderate"
    320     },
    321     {
    322       "claim": "Chain-of-thought prompting outperforms zero-shot and few-shot prompting for vulnerability detection and fixing",
    323       "evidence": "Section 8.3 reviews multiple studies: Zhang et al. showed 21.6% improvement for C/C++, Nong et al. showed GPT-3.5 accuracy rising from 65.88% to 97.65% with VSP. However, CoT decreased Java accuracy by 4.6% in one study.",
    324       "supported": "moderate"
    325     },
    326     {
    327       "claim": "Even poisoning less than 3% of training data can lead models to generate insecure code",
    328       "evidence": "Section 9.1, citing Cotroneo et al. [25], who tested on CodeBERT and CodeT5+ and found poisoning 'less than 3%' was effective.",
    329       "supported": "moderate"
    330     },
    331     {
    332       "claim": "No research addresses the effects of data poisoning on LLMs' vulnerability detection and fixing capabilities",
    333       "evidence": "Section 9.2 states this explicitly as a gap, and Section 10.2 reiterates it as an open challenge.",
    334       "supported": "strong"
    335     }
    336   ],
    337   "red_flags": [
    338     {
    339       "flag": "No quality assessment of included studies",
    340       "detail": "The survey treats all included papers equally without assessing their methodological quality. Papers with tiny sample sizes, no baselines, or questionable evaluation designs are given the same weight as rigorous studies. This risks laundering weak results."
    341     },
    342     {
    343       "flag": "Missing PRISMA flow diagram with counts",
    344       "detail": "Despite claiming to follow established SLR guidelines, the paper does not report how many papers were found in each database, how many were screened, or how many were excluded at each stage. The total number of included papers is never clearly stated as a single figure."
    345     },
    346     {
    347       "flag": "Extremely thin threats to validity",
    348       "detail": "Section 10.3 is only a few sentences long and mentions only generic threats (LLM evolution, varying prompts). For a 35-page SLR, this is inadequate — no discussion of search completeness, inter-rater reliability in paper selection, or synthesis methodology limitations."
    349     },
    350     {
    351       "flag": "No release of extracted data",
    352       "detail": "The categorizations (Tables 2-7) represent substantial extraction work but are not released as structured data for others to verify or build upon."
    353     },
    354     {
    355       "flag": "Publication bias not discussed",
    356       "detail": "The survey includes arXiv preprints but does not discuss whether its corpus is biased toward positive LLM results or whether negative findings are underrepresented."
    357     }
    358   ],
    359   "cited_papers": [
    360     {
    361       "title": "Is github's copilot as bad as humans at introducing vulnerabilities in code?",
    362       "authors": ["Owura Asare", "Meiyappan Nagappan", "N Asokan"],
    363       "year": 2023,
    364       "relevance": "Comparative analysis of Copilot-generated code security vs human-written code."
    365     },
    366     {
    367       "title": "Asleep at the keyboard? assessing the security of github copilot's code contributions",
    368       "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"],
    369       "year": 2022,
    370       "relevance": "Foundational study finding ~40% of Copilot-generated code was vulnerable across 89 security scenarios."
    371     },
    372     {
    373       "title": "Do users write more insecure code with AI assistants?",
    374       "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"],
    375       "year": 2023,
    376       "relevance": "User study showing AI-assisted participants were more likely to produce insecure solutions, relevant to AI programming productivity claims."
    377     },
    378     {
    379       "title": "Trojanpuzzle: Covertly poisoning code-suggestion models",
    380       "authors": ["Hojjat Aghakhani", "Wei Dai"],
    381       "year": 2024,
    382       "relevance": "Novel poisoning attack on code completion models that evades static analysis detection."
    383     },
    384     {
    385       "title": "Security of Language Models for Code: A Systematic Literature Review",
    386       "authors": ["Yuchen Chen", "Weisong Sun", "Chunrong Fang"],
    387       "year": 2024,
    388       "arxiv_id": "2410.15631",
    389       "relevance": "Parallel SLR on LLM code security covering defenses and attacks including code summarization and search."
    390     },
    391     {
    392       "title": "Examining Zero-Shot Vulnerability Repair with Large Language Models",
    393       "authors": ["Hammond Pearce", "Benjamin Tan", "Baleegh Ahmad", "Ramesh Karri", "Brendan Dolan-Gavitt"],
    394       "year": 2023,
    395       "relevance": "Evaluates LLMs' zero-shot vulnerability repair capabilities on synthetic and real-world scenarios."
    396     },
    397     {
    398       "title": "Vulnerability Detection with Code Language Models: How Far Are We?",
    399       "authors": ["Yangruibo Ding", "Yanjun Fu", "Omniyyah Ibrahim"],
    400       "year": 2025,
    401       "doi": "10.1109/ICSE55347.2025.00038",
    402       "relevance": "Demonstrates that commonly used benchmarks overestimate LLM vulnerability detection performance (68% vs 3% F1 on new dataset)."
    403     },
    404     {
    405       "title": "Large language models for software engineering: A systematic literature review",
    406       "authors": ["Xinyi Hou", "Yanjie Zhao", "Yue Liu"],
    407       "year": 2023,
    408       "relevance": "Broad SLR on LLMs in software engineering; relevant as a comparison survey for meta-research assessment."
    409     },
    410     {
    411       "title": "Poisoned chatgpt finds work for idle hands: Exploring developers' coding practices with insecure suggestions from poisoned ai models",
    412       "authors": ["Sanghak Oh", "Kiho Lee", "Seonhye Park", "Doowon Kim", "Hyoungshick Kim"],
    413       "year": 2024,
    414       "relevance": "User study showing developers using poisoned LLMs are more likely to introduce insecure code."
    415     },
    416     {
    417       "title": "Vulnerabilities in ai code generators: Exploring targeted data poisoning attacks",
    418       "authors": ["Domenico Cotroneo", "Cristina Improta", "Pietro Liguori", "Roberto Natella"],
    419       "year": 2024,
    420       "relevance": "Shows that poisoning <3% of training data can cause LLMs to generate vulnerable code."
    421     },
    422     {
    423       "title": "Harnessing Large Language Models for Software Vulnerability Detection: A Comprehensive Benchmarking Study",
    424       "authors": ["Karl Tamberg", "Hayretdin Bahsi"],
    425       "year": 2024,
    426       "arxiv_id": "2405.15614",
    427       "relevance": "Benchmarks LLMs vs traditional tools for vulnerability detection, finding LLMs detect more but with higher false positives."
    428     },
    429     {
    430       "title": "Large Language Model for Vulnerability Detection and Repair: Literature Review and the Road Ahead",
    431       "authors": ["Xin Zhou", "Sicong Cao", "Xiaobing Sun", "David Lo"],
    432       "year": 2024,
    433       "doi": "10.1145/3708522",
    434       "relevance": "Related survey on LLMs for vulnerability detection and repair, useful for survey-of-surveys comparison."
    435     }
    436   ]
    437 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs