scan.json (27860B)
1 { 2 "paper": { 3 "title": "A systematic literature review on the impact of AI models on the security of code generation", 4 "authors": [ 5 "Claudia Negri-Ribalta", 6 "Rémi Geraud-Stewart", 7 "Anastasia Sergeeva", 8 "Gabriele Lenzini" 9 ], 10 "year": 2024, 11 "venue": "Frontiers in Big Data", 12 "doi": "10.3389/fdata.2024.1386720" 13 }, 14 "scan_version": 3, 15 "active_modules": [ 16 "survey_methodology" 17 ], 18 "methodology_tags": [ 19 "meta-analysis" 20 ], 21 "key_findings": "This SLR reviews 19 papers on AI code generation security, finding broad agreement that AI models produce code with known vulnerabilities (CWE Top-25), with AI-generated Python code being more secure than C or Verilog. The security of generated code varies by model, programming language, vulnerability type, and prompt design. Conflicting evidence exists on whether AI-generated code is worse than human-generated code, with some studies finding no significant difference. Mitigation strategies include better training datasets, improved prompting, post-processing, and end-user education, though none are conclusively effective.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": false, 27 "justification": "No analysis code or scripts are released. The Zenodo archive (https://zenodo.org/records/11092334) contains the paper sample dataset, not analysis code." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The full sample dataset is available on Zenodo: 'We have provided the whole sample at: https://doi.org/10.5281/zenodo.10666386 for replication and transparency' (Section 6). Each paper has details on why it was included/excluded." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No environment or dependency specifications are provided. The survey involves no computational analysis requiring reproducible software environments, but analysis scripts could have been released with environment specs." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "No explicit step-by-step reproduction instructions or README are provided. The methodology section (Section 3) describes the SLR process narratively with search strings and criteria, but there is no dedicated reproduction guide." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": false, 48 "answer": false, 49 "justification": "This is a qualitative systematic literature review with no statistical aggregation or meta-analysis computations." 50 }, 51 "significance_tests": { 52 "applies": false, 53 "answer": false, 54 "justification": "The survey performs qualitative synthesis of findings from reviewed papers. No statistical tests are conducted by the authors." 55 }, 56 "effect_sizes_reported": { 57 "applies": false, 58 "answer": false, 59 "justification": "No effect sizes computed by the authors. The survey reports effect sizes from reviewed papers but does not produce its own." 60 }, 61 "sample_size_justified": { 62 "applies": false, 63 "answer": false, 64 "justification": "The sample size (N=19) is determined by the systematic search methodology, not predetermined. No power analysis is applicable to an SLR." 65 }, 66 "variance_reported": { 67 "applies": false, 68 "answer": false, 69 "justification": "No experimental runs or statistical computations are performed by the authors." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": false, 76 "justification": "The survey does not compare its findings against prior surveys or systematic reviews on the same topic. Background (Section 2) discusses related work but does not systematically compare findings with prior review efforts." 77 }, 78 "baselines_contemporary": { 79 "applies": false, 80 "answer": false, 81 "justification": "No experimental baselines apply to this systematic literature review." 82 }, 83 "ablation_study": { 84 "applies": false, 85 "answer": false, 86 "justification": "No system with components exists; this is a systematic literature review." 87 }, 88 "multiple_metrics": { 89 "applies": false, 90 "answer": false, 91 "justification": "No experiments are conducted requiring multiple metrics." 92 }, 93 "human_evaluation": { 94 "applies": false, 95 "answer": false, 96 "justification": "No system outputs exist to be evaluated by humans." 97 }, 98 "held_out_test_set": { 99 "applies": false, 100 "answer": false, 101 "justification": "No experiments requiring train/test splits are conducted." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Results are broken down by programming language (Section 5.3: Python, C, Java, Verilog), AI model family (Section 5.1, Figure 3), vulnerability type (CWE categories), and mitigation strategy (Section 5.4, Table 10)." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "The paper discusses where comparisons between papers are difficult or impossible due to different methodologies, vulnerabilities studied, and experimental setups (Section 5.1). Section 6 discusses threats and limitations of the review itself." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The paper reports conflicting evidence: Asare et al. (2023) and Sandoval et al. (2023) find AI-generated code is no worse than human-generated code, contradicting the general trend. The paper also reports that mitigation strategies have limited or unproven effectiveness." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": false, 123 "justification": "The abstract claims the work provides 'a comprehensive and systematic overview of the impact of AI in secure coding.' While the review is systematic, claiming comprehensiveness with only 19 papers covering a narrow set of AI models and programming languages is an overclaim. Core findings about vulnerabilities and mitigation strategies are supported by Section 5." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper makes the causal claim that AI models 'introduce' vulnerabilities into generated code. This is supported by synthesized evidence from 19 papers, several of which are experimental studies (Pearce et al. 2022, Perry et al. 2023, Sandoval et al. 2023) that provide causal evidence through controlled experiments. The survey appropriately hedges by noting conflicting results." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The abstract claims 'a comprehensive and systematic overview of the impact of AI in secure coding,' but the review covers only 19 papers with a limited set of AI models (predominantly OpenAI) and programming languages. The conclusion appropriately hedges ('it is still premature to conclude'), but the abstract and title frame broader coverage than delivered." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper discusses multiple alternative explanations for why AI models generate insecure code: training data quality (Section 5.4.1), prompt design and temperature settings (Section 5.3.1), model architecture differences (Section 5.1), and the hypothesis that less training data for languages like Verilog leads to worse security (Section 5.3.4)." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper explicitly acknowledges measurement differences across studies: 'it remains difficult to compare the results' (Section 5.1) because papers researched 'different vulnerabilities' using different metrics. Footnote 9 notes that functional correctness and security are distinct concerns. The paper is aware that different CWE metrics measure different aspects of security." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": false, 149 "answer": false, 150 "justification": "This is a systematic literature review that does not use AI models directly." 151 }, 152 "prompts_provided": { 153 "applies": false, 154 "answer": false, 155 "justification": "No prompting is used; this is a systematic literature review." 156 }, 157 "hyperparameters_reported": { 158 "applies": false, 159 "answer": false, 160 "justification": "No models are run; this is a systematic literature review." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used; this is a systematic literature review." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "The paper selection pipeline is well documented with counts at each stage: 95 initial → 21 after deduplication and inclusion/exclusion → 43 after snowballing → 35 after criteria → 23 after full-text review → 19 after quality check (Figure 1, Sections 3-4). Filtering criteria are stated in Table 3." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 6 'Threats to validity and future work' provides substantial discussion of reliability and validity issues, running approximately two full pages." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 6 discusses threats specific to this study: sample representativeness depends on date of data collection, definition of 'code generation' affects results, high proportion of snowballed papers, researcher background affecting classification, and incomparable vulnerability taxonomies across papers making comparison 'at best, complicated and, at worst, a threat to our conclusions.'" 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "The paper explicitly defines scope boundaries: AI code generation means 'artifacts that suggest or produce code,' excluding tools that only verify/check code (Section 4.1). Table 3 lists exclusion criteria. Section 6 states what results do not show: 'the final sample of this research may increase and change depending on the day the data was gathered.'" 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": true, 194 "justification": "The full sample with inclusion/exclusion decisions is available on Zenodo: 'Each paper has details on why it was included/excluded, at which phase, and with details and/or comments to help readers understand and replicate our research' (Section 6)." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Data collection is described in detail: databases used (IEEE Xplore, ACM, Scopus), specific search strings per database (Table 2), time period ('last week of November 2023'), and PICO framework (Section 3.1). Keywords and synonyms are provided in Table 1." 200 }, 201 "recruitment_methods_described": { 202 "applies": true, 203 "answer": true, 204 "justification": "Paper 'recruitment' methods are thoroughly described: database searches with explicit strings (Table 2), forward snowballing via Google Scholar (Section 3.3), and the process for evaluating snowballed papers based on title, abstract, and keywords following Wohlin (2014)." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The full pipeline from search to final analysis is documented in Figure 1 and Section 4.1: 95 initial articles → deduplication → inclusion/exclusion → 21 starting set → snowballing → 43 articles → inclusion/exclusion → 35 → full-text review → 23 → quality check → 19 final. Criteria at each stage are specified." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Funding is disclosed: 'This research was funded in whole, or in part, by the Luxembourg National Research Fund (FNR), grant: NCER22/IS/16570468/NCER-FT.'" 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly stated: University of Luxembourg (Security and Trust, and FHSE), and École Normale Supérieure, Paris. No authors are affiliated with AI companies whose products are being reviewed." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": true, 226 "justification": "The Luxembourg National Research Fund (FNR) is a national funding agency with no commercial stake in whether AI models generate secure or insecure code." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": true, 231 "justification": "A competing interests statement is provided: 'The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.'" 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": false, 237 "answer": false, 238 "justification": "This is a systematic literature review that does not evaluate any pre-trained model on a benchmark." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": false, 242 "answer": false, 243 "justification": "This is a systematic literature review that does not evaluate any pre-trained model on a benchmark." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": false, 247 "answer": false, 248 "justification": "This is a systematic literature review that does not evaluate any pre-trained model on a benchmark." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this systematic literature review. The authors review published papers, not human subjects." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this systematic literature review." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this systematic literature review." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this systematic literature review. Paper inclusion/exclusion criteria are covered under data_integrity." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants or experimental conditions in this systematic literature review." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants or experimental conditions in this systematic literature review." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this systematic literature review." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": false, 291 "answer": false, 292 "justification": "This is a systematic literature review with no computational method whose cost could be reported." 293 }, 294 "compute_budget_stated": { 295 "applies": false, 296 "answer": false, 297 "justification": "This is a systematic literature review with no computational budget." 298 } 299 }, 300 "survey_methodology": { 301 "prisma_or_structured_protocol": { 302 "applies": true, 303 "answer": true, 304 "justification": "The survey follows Kitchenham and Charters (2007) SLR guidelines, uses the PICO framework (Section 3.1), provides reproducible search strings per database (Table 2), and includes a selection flow diagram (Figure 1). Additional guidelines from Wieringa et al. (2006), Wohlin (2014), and Petersen et al. (2015) are also followed." 305 }, 306 "quality_assessment_of_sources": { 307 "applies": true, 308 "answer": true, 309 "justification": "A quality assessment with an 8-question checklist (Table 4) and 4-point scoring system (Table 5) is applied to all papers. Quality scores for each paper are reported transparently in Table 8. Papers scoring below 50% (4 points) are excluded. Quality checks are done by at least two authors." 310 }, 311 "publication_bias_discussed": { 312 "applies": true, 313 "answer": false, 314 "justification": "The paper does not discuss publication bias—whether its source papers skew toward positive findings about AI-generated code vulnerabilities. Section 6 discusses sample representativeness and timing issues but not systematic positive-result bias in the reviewed literature." 315 } 316 } 317 }, 318 "claims": [ 319 { 320 "claim": "There is high-level agreement that AI models do not produce safe code and do introduce vulnerabilities, despite mitigations.", 321 "evidence": "Synthesized from 19 reviewed papers, with multiple studies (Pearce et al. 2022, He and Vechev 2023, Tony et al. 2023, Perry et al. 2023) consistently finding vulnerabilities in AI-generated code. Section 5 and Conclusion.", 322 "supported": "moderate" 323 }, 324 { 325 "claim": "AI-generated Python code has fewer vulnerabilities than AI-generated C code.", 326 "evidence": "Pearce et al. (2022) found 39% vulnerability rate for Python vs 50% for C. He and Vechev (2023) found ~42% for Python. Pearce et al. (2023) also found C performed worse. Section 5.3.", 327 "supported": "moderate" 328 }, 329 { 330 "claim": "Users who use AI assistants produce more insecure code and trust AI-generated code more than their own.", 331 "evidence": "Perry et al. (2023) found experimental group produced less secure code: 21% secure in AI-assisted vs 43% in control group for encryption tasks. However, Sandoval et al. (2023) and Asare et al. (2023) found no significant difference. Section 5.1 and 5.3.", 332 "supported": "weak" 333 }, 334 { 335 "claim": "AI models have fundamental limitations preventing them from creating complete malware from scratch, though they can generate malware snippets.", 336 "evidence": "Botacin (2023) found ChatGPT cannot create malware from scratch but can create snippets. Pa Pa et al. (2023) found models could create up to 400 lines of malware with jailbreaks. Liguori et al. (2023) noted human review remains necessary. Section 5.2.", 337 "supported": "moderate" 338 }, 339 { 340 "claim": "The conclusions diverge on whether AI-generated code is worse than human-generated code.", 341 "evidence": "Perry et al. (2023) found AI-assisted users wrote more insecure code. Asare et al. (2023) and Sandoval et al. (2023) found AI models make similar or fewer mistakes than humans. The paper acknowledges 'there is no clear favor for one hypothesis over the other.' Conclusion.", 342 "supported": "strong" 343 }, 344 { 345 "claim": "Prompting and context play a crucial role in the security of AI-generated code.", 346 "evidence": "Pearce et al. (2022) showed a variation from 0% to 94% vulnerability rate depending on prompt content for SQL injection. Nair et al. (2023) explored prompt strategies for secure output. Perry et al. (2023) observed relationship between model parameters and code quality. Section 5.3.1.", 347 "supported": "moderate" 348 } 349 ], 350 "red_flags": [ 351 { 352 "flag": "Small sample for broad claims", 353 "detail": "Only 19 papers form the final sample, yet the abstract claims to provide 'a comprehensive and systematic overview of the impact of AI in secure coding.' The coverage is narrow—predominantly OpenAI models, Python and C—relative to the broad framing." 354 }, 355 { 356 "flag": "Disproportionate snowballing", 357 "detail": "22 of 43 candidate papers (51%) came from snowballing rather than the database search (21 papers), suggesting the original search strategy may have been incomplete. The authors acknowledge this as unusual and discuss it in Section 6." 358 }, 359 { 360 "flag": "No publication bias assessment", 361 "detail": "Despite being a systematic review, the paper does not discuss whether the reviewed literature skews toward papers that find vulnerabilities in AI-generated code (positive-result bias). Papers finding 'AI code is fine' may be underrepresented." 362 }, 363 { 364 "flag": "Qualitative synthesis without quantitative aggregation", 365 "detail": "The survey qualitatively summarizes findings but does not statistically aggregate results across studies. Different methodologies, vulnerability taxonomies, and experimental setups across the 19 papers make comparison difficult, as the authors themselves acknowledge." 366 } 367 ], 368 "cited_papers": [ 369 { 370 "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions", 371 "authors": ["H. Pearce", "B. Ahmad", "B. Tan", "B. Dolan-Gavitt", "R. Karri"], 372 "year": 2022, 373 "relevance": "Foundational study finding 40% of Copilot's code suggestions contained vulnerabilities across CWE Top-25, directly relevant to AI code generation security." 374 }, 375 { 376 "title": "Do users write more insecure code with AI assistants?", 377 "authors": ["N. Perry", "M. Srivastava", "D. Kumar", "D. Boneh"], 378 "year": 2023, 379 "relevance": "User study finding AI-assisted developers produce more insecure code and trust AI-generated code more, key evidence on human-AI interaction in code security." 380 }, 381 { 382 "title": "Large language models for code: Security hardening and adversarial testing", 383 "authors": ["J. He", "M. Vechev"], 384 "year": 2023, 385 "relevance": "Proposes SVEN for controlled secure/insecure code generation, demonstrating both security improvement (to >85%) and degradation capabilities for LLMs." 386 }, 387 { 388 "title": "Examining zero-shot vulnerability repair with large language models", 389 "authors": ["H. Pearce", "B. Tan", "B. Ahmad", "R. Karri", "B. Dolan-Gavitt"], 390 "year": 2023, 391 "relevance": "Studies LLM capability for zero-shot vulnerability repair across multiple models and CWEs, finding limited but model-dependent repair success." 392 }, 393 { 394 "title": "Lost at C: a user study on the security implications of large language model code assistants", 395 "authors": ["G. Sandoval", "H. Pearce", "T. Nys", "R. Karri", "S. Garg", "B. Dolan-Gavitt"], 396 "year": 2023, 397 "relevance": "User study finding no conclusive evidence that LLM assistants increase CWE incidence compared to unaided developers in C programming." 398 }, 399 { 400 "title": "Is GitHub's Copilot as bad as humans at introducing vulnerabilities in code?", 401 "authors": ["O. Asare", "M. Nagappan", "N. Asokan"], 402 "year": 2023, 403 "doi": "10.48550/arXiv.2204.04741", 404 "relevance": "Compares Copilot's vulnerability introduction to human developers, finding Copilot can avoid detected vulnerabilities in a substantial number of scenarios." 405 }, 406 { 407 "title": "How effective are neural networks for fixing security vulnerabilities", 408 "authors": ["Y. Wu", "N. Jiang", "H. V. Pham", "T. Lutellier", "J. Davis", "L. Tan"], 409 "year": 2023, 410 "relevance": "Proposes VJBench benchmark and evaluates LLMs on Java vulnerability repair, finding models fix very few vulnerabilities (Codex at 20.4%)." 411 }, 412 { 413 "title": "Large language models and simple, stupid bugs", 414 "authors": ["K. Jesse", "T. Ahmed", "P. T. Devanbu", "E. Morgan"], 415 "year": 2023, 416 "relevance": "Studies simple stupid bug generation by code LLMs, finding models propose twice as many bugs as correct code in Java." 417 }, 418 { 419 "title": "CodexLeaks: privacy leaks from code generation language models in GitHub's Copilot", 420 "authors": ["L. Niu", "S. Mirza", "Z. Maradni", "C. Pöpper"], 421 "year": 2023, 422 "relevance": "Demonstrates personal data leakage through Copilot code suggestions, a novel security exploit of AI code generation models." 423 }, 424 { 425 "title": "Evaluating large language models trained on code", 426 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 427 "year": 2021, 428 "doi": "10.48550/arXiv.2107.03374", 429 "relevance": "Foundational Codex paper establishing evaluation of LLMs for code generation, widely cited in AI coding capability research." 430 }, 431 { 432 "title": "CodeAttack: code-based adversarial attacks for pre-trained programming language models", 433 "authors": ["A. Jha", "C. K. Reddy"], 434 "year": 2023, 435 "relevance": "Proposes black-box adversarial attack on code generation models by targeting vulnerable tokens, demonstrating security risks of code LLMs." 436 }, 437 { 438 "title": "GPThreats-3: is automatic malware generation a threat?", 439 "authors": ["M. Botacin"], 440 "year": 2023, 441 "relevance": "Evaluates ChatGPT's capability for malware generation, finding it can create snippets but not complete malware, relevant to AI safety in code generation." 442 }, 443 { 444 "title": "GitHub considered harmful? Analyzing open-source projects for the automatic generation of cryptographic API call sequences", 445 "authors": ["C. Tony", "N. E. D. Ferreyra", "R. Scandariato"], 446 "year": 2022, 447 "relevance": "Studies security risks of AI-generated cryptographic API calls, finding significantly lower accuracy on security-sensitive tasks." 448 } 449 ], 450 "engagement_factors": { 451 "practical_relevance": { 452 "score": 1, 453 "justification": "Practitioners can learn about security risks of AI code generation, but the paper offers no tool, technique, or actionable mitigation they can deploy." 454 }, 455 "surprise_contrarian": { 456 "score": 0, 457 "justification": "Confirms the widely-held expectation that AI-generated code has security issues; the finding of conflicting evidence on human vs AI code quality is notable but not contrarian." 458 }, 459 "fear_safety": { 460 "score": 2, 461 "justification": "Raises security concerns about AI code generation including malware creation, personal data leaks, and vulnerability introduction, topics that resonate with security-conscious audiences." 462 }, 463 "drama_conflict": { 464 "score": 0, 465 "justification": "No controversy or drama; this is a neutral academic literature review without provocative claims." 466 }, 467 "demo_ability": { 468 "score": 0, 469 "justification": "No demo, tool, or code artifact is produced; the output is a literature review with a Zenodo dataset." 470 }, 471 "brand_recognition": { 472 "score": 1, 473 "justification": "Reviews papers about well-known tools (Copilot, ChatGPT, Codex) but comes from University of Luxembourg, not a major AI lab." 474 } 475 } 476 }