scan.json (25772B)
1 { 2 "paper": { 3 "title": "Guiding AI to Fix Its Own Flaws: An Empirical Study on LLM-Driven Secure Code Generation", 4 "authors": ["Hao Yan", "Swapneel Suhas Vaidya", "Xiaokuan Zhang", "Ziyu Yao"], 5 "year": 2025, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2506.23034" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "LLMs generate vulnerable code at rates of 9.8%-42.1% across diverse CWE types, with similar vulnerability distributions across models suggesting shared architectural or training-data limitations. Self-generated vulnerability hints reduce vulnerability rates only when they are relevant, precise, and contextualized — GPT-4o achieved 12.4% TarV-R reduction while weaker models showed minimal or negative effects. Post-hoc vulnerability repair with explained (GPT-4o-enriched) CodeQL feedback is more effective than raw direct feedback, but only for models with strong instruction-following capabilities.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "An anonymized repository link is provided: https://anonymous.4open.science/r/LLM-driven_Secure_Code_Generation-E89E." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper uses two publicly available benchmarks: SecurityEval and SecCodePLT. Both are established public datasets." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment specifications, requirements.txt, or dependency details are mentioned in the paper." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided in the paper. The anonymized repo may contain them but the paper itself does not describe reproduction steps." 
34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results are reported as point estimates (percentages) with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes many comparative claims (e.g., 'GPT-4o achieves a 12.4% reduction') based solely on comparing raw percentages without any statistical significance tests." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports percentage differences with baseline context throughout (e.g., Table 4 subscript numbers showing changes vs. vanilla prompts, Table 8 showing reductions). This provides magnitude in context." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification is given for why these particular dataset sizes (1,071 and 121 questions) are sufficient for the claims being made." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or multi-run results are reported. All numbers appear to be single-run results." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The vanilla prompt serves as the baseline, with self-generated hints and feedback-based repair compared against it (Tables 4, 7, 8 all show changes vs. vanilla baseline in Table 3)." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "The study uses contemporary models including GPT-4o, Llama3.1/3.2, DeepSeek-Coder-V2, and references recent related work like Tony et al. (2024) and Wang et al. (2024)." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "The paper ablates the effect of hint quality: comparing hints with vs. 
without target vulnerability (Table 5), CWE definition vs. contextualized hints (Table 7), and direct vs. explained feedback (Table 8)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Two metrics are defined and used throughout: Target Vulnerability Rate (TarV-R) and All Vulnerability Rate (AllV-R)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "Human validation was conducted: two evaluators assessed 80 hint-definition pairs for the LLM-as-judge evaluation (95.65% precision, 91.67% recall), and manual validation of 160 explained feedback samples." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "The benchmarks (SecurityEval and SecCodePLT) serve as held-out evaluation sets. No tuning was done on these datasets." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Figure 6 provides per-CWE vulnerability distribution across all models. Table 5 breaks down TarV-R by whether hints include the target vulnerability." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper discusses cases where hints increase vulnerability rates (Table 4, GPT-3.5 on SecurityEval), irrelevant hints misleading models, and models that fail to utilize feedback (Section 5.3)." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Several negative results are reported: self-generated hints sometimes increase vulnerability rates, CodeLlama models show no improvement from feedback, and GPT-4o has higher vulnerability rates than GPT-3.5 despite being newer." 
108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims that LLMs are prone to generating insecure code, that advanced models benefit from hints and feedback, and that vulnerability hints and explained feedback help. All are supported by Tables 3, 4, 7, and 8." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper's causal claims (hints reduce vulnerabilities, explained feedback improves repair) are supported by controlled comparisons: same model, same dataset, varying only the intervention (vanilla vs. hints vs. feedback type)." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title says 'Secure Code Generation' broadly, but the study is limited to Python only. The threats-to-validity section acknowledges this but the title and framing do not bound the claim. The paper also only tests single-turn interactions." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper discusses alternative explanations: model instruction-following capability vs. hint quality, architectural vs. training data differences for vulnerability rate variations, and code-optimized vs. general models (Sections 5.1-5.3)." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper measures CodeQL-detected vulnerability rates and frames results in terms of those specific measurements. It explicitly acknowledges CodeQL's limitations (static analysis only, cannot capture dynamic vulnerabilities) in the threats to validity." 
135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Table 2 specifies model versions: GPT-4o-0513, GPT-3.5-turbo-0125, CodeLlama-Instruct 7B/34B, Llama3.1-Instruct 8B, Llama3.2-Instruct 3B, StarCoder2-Instruct 15B, DeepSeek-Coder-V2-Lite-Instruct 16B." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full prompt templates are provided in Figures 2-5 for all experimental conditions (vanilla, hint generation, hint-augmented generation, direct feedback repair, explained feedback generation, explained feedback repair). The fill values come from the public datasets." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the models used." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. The approach is prompt-based with single-turn or two-turn interactions." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "The paper documents that 6 CWEs (274 samples) were excluded from SecCodePLT because they are not covered by CodeQL, resulting in 21 CWEs with 1,071 samples. The datasets and their properties are described in Section 4." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 6 'Discussion' contains a 'Threats to validity' subsection with substantive discussion of limitations." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "The threats are specific: CodeQL as static-only analysis, Python-only evaluation, two datasets may not cover all vulnerability types, and static single-turn setting vs. 
real-world multi-turn interactions." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "The paper explicitly states it focuses on Python, uses static analysis (CodeQL), operates in a single-turn setting, and is limited to two datasets. These scope boundaries are clearly stated in Section 6." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "The anonymized repo may contain generated code, but the paper itself does not provide raw experimental outputs (generated code snippets, CodeQL results per sample)." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 4 describes the two benchmarks in detail, including their origins, sizes, CWE coverage, and how they were constructed (Table 1)." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants; data sources are standard public benchmarks." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is documented: benchmark questions → LLM code generation → CodeQL analysis → vulnerability rate computation. The exclusion of 274 samples is explained. Section 3 details each step." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Acknowledgments section discloses Virginia Commonwealth Cyber Initiative (CCI) sponsorship, CAHMP GRA Fellowship, and NSF Award Number 2018631." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "All four authors are from George Mason University. No evaluated product is affiliated with their institution." 
213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": true, 217 "justification": "Funders (Virginia CCI, NSF, GMU) have no financial interest in the performance of any specific LLM evaluated." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is present in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff dates are stated for any of the 8 models evaluated." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether SecurityEval or SecCodePLT problems appeared in the training data of any model." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "SecurityEval (2022) and SecCodePLT (2024) are publicly available. Models trained after their publication may have seen them. This is not discussed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in the main study. The human validators for GPT-4o-as-judge are a validation check, not a human subjects study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 
272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference costs, API costs, or latency figures are reported despite using proprietary APIs (GPT-4o, GPT-3.5) across thousands of samples." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No total computational budget, GPU hours, or API spend is reported." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No multiple seed runs are reported. All results appear to be single-run." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of runs per experiment is never stated." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search is described. Temperature and sampling parameters are not even reported." 306 }, 307 "best_config_selection_justified": { 308 "applies": false, 309 "answer": false, 310 "justification": "The paper does not select among configurations — it evaluates fixed prompts across models." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical tests are performed, so multiple comparison correction is moot." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors designed the prompts and evaluated the results without acknowledging potential bias in prompt design favoring certain outcomes." 
321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "The explained feedback approach uses GPT-4o to generate enriched feedback for all models, adding significant compute, but this cost is never compared against the performance gains." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": true, 330 "justification": "The paper discusses construct validity by acknowledging the limitations of its evaluation setup (CodeQL's static-only analysis, Python-only evaluation) and by noting that the divergence between SecCodePLT and SecurityEval results indicates that differences in vulnerability coverage matter." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "No scaffolding is used; models are prompted directly." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "SecurityEval was published in 2022, SecCodePLT in 2024. Models trained after these dates may have seen the problems. This is not discussed." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the evaluation setup leaks information, e.g., whether the task descriptions inadvertently hint at secure implementations." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "SecCodePLT is generated by mutating 5 seed problems per CWE (up to 70 variants). These are structurally dependent. This is not discussed as a potential confound." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention method is applied." 
358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "LLMs consistently generate vulnerable code at rates ranging from 9.8% to 42.1% across diverse vulnerability types.", 364 "evidence": "Table 3 shows TarV-R 4.0%-27.3% and AllV-R 9.8%-42.1% across 8 models on two benchmarks (Section 5.1).", 365 "supported": "strong" 366 }, 367 { 368 "claim": "The top-10 vulnerability types across 8 models largely overlap, resulting in only 11 unique types, indicating shared weaknesses.", 369 "evidence": "Figure 6 shows the vulnerability distribution across models on SecCodePLT with overlapping top vulnerability types (Section 5.1).", 370 "supported": "strong" 371 }, 372 { 373 "claim": "GPT-4o achieves a 12.4% TarV-R reduction on both datasets when using self-generated vulnerability hints.", 374 "evidence": "Table 4 shows GPT-4o TarV-R dropping from 15.0% to 2.6% on SecCodePLT and from 25.6% to 13.2% on SecurityEval, i.e., an absolute reduction of 12.4 percentage points on each dataset (Section 5.2).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Irrelevant or imprecise self-generated hints can mislead models and increase vulnerability rates.", 379 "evidence": "Table 5 shows that when hints do not include the target vulnerability, most models show increased TarV-R. GPT-3.5 AllV-R increased by 17.4% on SecurityEval (Table 4, Section 5.2).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Explained feedback significantly enhances vulnerability repair effectiveness compared to direct CodeQL feedback.", 384 "evidence": "Table 8 shows explained feedback outperforms direct feedback for most models. GPT-4o achieves 14.3% AllV-R reduction on SecCodePLT with explained vs. 11.2% with direct feedback (Section 5.3).", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "Newer models (GPT-4o, Llama3) exhibit higher vulnerability rates than their predecessors (GPT-3.5, CodeLlama).", 389 "evidence": "Table 3: GPT-4o TarV-R 15.0% vs. GPT-3.5 7.7% on SecCodePLT; Llama3.1-8B AllV-R 40.5% vs. 
CodeLlama-7B 28.9% on SecurityEval (Section 5.1).", 390 "supported": "moderate" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "No statistical significance testing", 396 "detail": "All comparative claims are based on raw percentage differences without any significance tests. With single runs and no variance measures, it is impossible to know if observed differences are meaningful or noise." 397 }, 398 { 399 "flag": "Single-run results with no variance", 400 "detail": "LLM outputs are stochastic, yet no multiple runs, seeds, or variance measures are reported. Results could change substantially with different random seeds or temperature settings." 401 }, 402 { 403 "flag": "Contamination not addressed", 404 "detail": "SecurityEval (2022) has been publicly available for years. Models trained after 2022 may have seen these problems, inflating performance (or reducing vulnerability rates). SecCodePLT's mutated variants from 5 seeds create structural non-independence that is also not discussed." 405 }, 406 { 407 "flag": "Hyperparameters unreported", 408 "detail": "Temperature, top-p, and other sampling parameters are not reported for any model, yet these significantly affect code generation behavior and vulnerability rates." 409 }, 410 { 411 "flag": "GPT-4o used as both subject and evaluator", 412 "detail": "GPT-4o is used to generate explained feedback for all models AND to judge hint preciseness, while also being one of the 8 evaluated models. This creates a potential confound where GPT-4o's repair benefits may partly stem from receiving feedback in its own 'language'." 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "Asleep at the keyboard? assessing the security of github copilot's code contributions", 418 "authors": ["Hammond Pearce", "Baleegh Ahmad", "Benjamin Tan", "Brendan Dolan-Gavitt", "Ramesh Karri"], 419 "year": 2022, 420 "relevance": "Foundational study showing GitHub Copilot generated vulnerable code in 40% of cases across 18 CWE types." 
421 }, 422 { 423 "title": "Do users write more insecure code with AI assistants?", 424 "authors": ["Neil Perry", "Megha Srivastava", "Deepak Kumar", "Dan Boneh"], 425 "year": 2023, 426 "relevance": "User study showing AI code assistants lead to more insecure code, directly relevant to LLM code security." 427 }, 428 { 429 "title": "SecurityEval Dataset: Mining Vulnerability Examples to Evaluate Machine Learning-Based Code Generation Techniques", 430 "authors": ["Mohammed Latif Siddiq", "Joanna C. S. Santos"], 431 "year": 2022, 432 "doi": "10.1145/3549035.3561184", 433 "relevance": "One of the two benchmarks used in this study; provides 121 coding problems across 69 CWE types." 434 }, 435 { 436 "title": "SecCodePLT: A Unified Platform for Evaluating the Security of Code GenAI", 437 "authors": ["Yu Yang", "Yuzhou Nie", "Zhun Wang", "Yuheng Tang", "Wenbo Guo", "Bo Li", "Dawn Song"], 438 "year": 2024, 439 "arxiv_id": "2410.11096", 440 "relevance": "The other benchmark used in this study; synthesized dataset with 1,345 coding problems covering 27 CWEs." 441 }, 442 { 443 "title": "Prompting techniques for secure code generation: A systematic investigation", 444 "authors": ["Catherine Tony", "Nicolás E Díaz Ferreyra", "Markus Mutas", "Salem Dhiff", "Riccardo Scandariato"], 445 "year": 2024, 446 "arxiv_id": "2407.07064", 447 "relevance": "Evaluated prompting techniques for secure code generation on GPT-series, finding RCI technique most effective." 448 }, 449 { 450 "title": "Examining zero-shot vulnerability repair with large language models", 451 "authors": ["Hammond Pearce", "Benjamin Tan", "Baleegh Ahmad", "Ramesh Karri", "Brendan Dolan-Gavitt"], 452 "year": 2023, 453 "relevance": "Explored zero-shot vulnerability repair by embedding vulnerability information into prompts." 
454 }, 455 { 456 "title": "Purple llama cyberseceval: A secure coding benchmark for language models", 457 "authors": ["Manish Bhatt", "Sahana Chennabasappa"], 458 "year": 2023, 459 "arxiv_id": "2312.04724", 460 "relevance": "Meta's comprehensive security evaluation benchmark for LLMs including insecure code generation assessment." 461 }, 462 { 463 "title": "How secure is code generated by chatgpt?", 464 "authors": ["Raphaël Khoury", "Anderson R Avila", "Jacob Brunelle", "Baba Mamadou Camara"], 465 "year": 2023, 466 "relevance": "Showed GPT-3.5 produced 76% vulnerable code, relevant to LLM security evaluation." 467 }, 468 { 469 "title": "Is Your AI-Generated Code Really Safe? Evaluating Large Language Models on Secure Code Generation with CodeSecEval", 470 "authors": ["Jiexin Wang"], 471 "year": 2024, 472 "arxiv_id": "2407.02395", 473 "relevance": "Explored enhancing secure code generation by incorporating vulnerability information into prompts." 474 }, 475 { 476 "title": "Code security vulnerability repair using reinforcement learning with large language models", 477 "authors": ["Nafis Tanveer Islam", "Mohammad Bahrami Karkevandi", "Peyman Najafirad"], 478 "year": 2024, 479 "arxiv_id": "2401.07031", 480 "relevance": "RL-based framework for fine-tuning LLMs with dual functionality and security rewards." 481 }, 482 { 483 "title": "Automated software vulnerability patching using large language models", 484 "authors": ["Yu Nong", "Haoran Yang", "Long Cheng", "Hongxin Hu", "Haipeng Cai"], 485 "year": 2024, 486 "arxiv_id": "2408.13597", 487 "relevance": "LLM-based vulnerability patching using retrieved similar examples with ground-truth annotations." 488 }, 489 { 490 "title": "Is github's copilot as bad as humans at introducing vulnerabilities in code?", 491 "authors": ["Owura Asare", "Meiyappan Nagappan", "N Asokan"], 492 "year": 2023, 493 "relevance": "Comparative study of Copilot vs. human-introduced code vulnerabilities." 494 } 495 ] 496 }