scan.json (21871B)
1 { 2 "paper": { 3 "title": "The Hidden Risks of LLM-Generated Web Application Code: A Security-Centric Evaluation of Code Generation Capabilities in Large Language Models", 4 "authors": [ 5 "Swaroop Dora", 6 "Deven Lunkad", 7 "Naziya Aslam", 8 "S. Venkatesan", 9 "Sandeep Kumar Shukla" 10 ], 11 "year": 2025, 12 "venue": "arXiv", 13 "arxiv_id": "2504.20612" 14 }, 15 "scan_version": 2, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "Evaluates security of web application code generated by 5 LLMs (ChatGPT, DeepSeek, Claude, Gemini, Grok) across 48 security parameters in 6 categories. All models fail entirely on HTTP security headers (0/12). Only SQL injection protection is universally implemented. No model implements MFA, CAPTCHA, or CORS policies. The authors conclude human expertise remains essential for secure deployment of LLM-generated web code.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No repository URL, code archive, or data release mentioned anywhere in the paper." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The generated code samples and evaluation data are not released or made available." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No environment specifications, dependency lists, or setup instructions provided." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No reproduction instructions. The prompts are listed in Table II but no instructions for replicating the evaluation procedure." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": false, 46 "justification": "No confidence intervals or error bars. Results are single yes/no evaluations per model with no uncertainty quantification." 
47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper makes comparative claims about which models are more secure but uses no statistical tests — only raw checklist counts." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "Table V shows compliance counts (e.g., 3/11, 8/10) but no effect sizes and no baseline context for the magnitude of differences." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "Only 4 prompts are used to generate code from each model. No justification for why this sample size is sufficient, and no acknowledgment it may be too small." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "Each model appears to have been tested once with each prompt (the paper never states the run count). No repeated runs and no variance across runs are reported." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper compares 5 LLMs against each other and against security best practices (NIST, OWASP), which serve as the baseline standard." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Models evaluated include GPT-4o, DeepSeek v3, Claude 3.5 Sonnet, Gemini 2.0 Flash Experimental, and Grok 3 — all contemporary at time of writing." 79 }, 80 "ablation_study": { 81 "applies": false, 82 "answer": false, 83 "justification": "No system with components to ablate — this is an evaluation of existing LLMs, not a proposed method." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Evaluates across 6 security categories with 48 individual parameters and provides both compliance counts and risk assessments." 
89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": true, 93 "justification": "The security evaluation itself is a human expert assessment — the authors manually inspected generated code against their security checklist." 94 }, 95 "held_out_test_set": { 96 "applies": false, 97 "answer": false, 98 "justification": "No train/test split concept applies — this is a manual evaluation of generated code against security criteria." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table IV provides detailed per-parameter results and Table V provides per-category compliance summaries for all 6 categories." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Section IV extensively discusses specific failures per model — e.g., Claude lacks secure cookie flags, DeepSeek/Gemini vulnerable to XSS, all fail on HTTP headers." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The paper's main finding IS negative — no LLM generates fully secure code. Specific failures documented across all categories." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims about critical vulnerabilities in authentication, session management, input validation, and HTTP headers are all supported by the detailed analysis in Table IV and Section IV." 121 }, 122 "causal_claims_justified": { 123 "applies": false, 124 "answer": false, 125 "justification": "The paper makes observational claims about what LLMs produce, not causal claims about why. No causal language like 'X causes Y' — findings are descriptive." 
126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "Title claims evaluation of 'LLM-Generated Web Application Code' broadly, but tests only 4 PHP/HTML/MySQL prompts for a single e-commerce authentication scenario. Results may not generalize to other web frameworks, languages, or application types." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "No substantive discussion of alternative explanations. For example, models may perform differently with different prompting strategies, temperature settings, or explicit security instructions — Section V briefly mentions this but does not develop it as an alternative explanation for the results." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": false, 140 "justification": "The paper measures compliance with a self-designed security checklist and frames it as evaluating 'security' of LLM-generated code. No discussion of whether checklist compliance is a valid proxy for actual security in deployed applications." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Table I lists GPT-4o, DeepSeek v3, Claude 3.5 Sonnet, Gemini 2.0 Flash Experimental, Grok 3. These are marketing names without API versions or snapshot dates (e.g., no 'gpt-4o-2024-05-13')." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Table II provides the actual prompts used, with descriptions of each prompt's purpose and content. Four structured prompts for the e-commerce authentication system are given." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "No temperature, top-p, max tokens, or other generation parameters reported for any model." 
158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding used — models appear to be prompted directly, though the paper does not say whether via API or web chat." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": false, 167 "justification": "No documentation of how generated code was collected, stored, or processed for evaluation. The paper jumps from prompts to results with no intermediate methodology." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": false, 174 "justification": "No dedicated limitations section. The Discussion (Section V) makes recommendations but does not discuss study limitations." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "No threats to validity discussed. No mention of single-run limitation, prompt sensitivity, evaluator subjectivity, or narrow scope." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "The paper does not explicitly state what the results do NOT show. The scope is implicitly narrow (PHP/MySQL e-commerce auth) but this is never acknowledged as a boundary." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "The generated code samples are not available for independent verification. Only the summarized yes/no results in Table IV are shown." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": false, 196 "justification": "The paper does not describe when code was generated, which interfaces were used (API vs web chat), or whether generation was repeated. Only the prompts and final results are shown." 
197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants — the study evaluates LLM outputs, not human subjects." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": false, 206 "justification": "No documentation of the pipeline from code generation to security evaluation. How was each security parameter assessed? Who evaluated? Were there multiple evaluators?" 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding or acknowledgments section. No mention of grants, sponsors, or funding sources." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are listed: IIIT Allahabad and IIT Kanpur. None of the authors appear affiliated with the evaluated LLM companies." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "Funding is not disclosed, so independence cannot be assessed." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests statement or financial disclosure provided." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "No training data cutoff dates stated for any of the 5 models evaluated." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether the models may have seen similar security evaluation scenarios or e-commerce authentication patterns in training data." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "The prompts are custom-designed but no discussion of whether similar prompts or security evaluation scenarios exist in training data. 
The prompts reference common web security patterns that are widely documented online." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in the study." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in the study." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in the study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in the study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in the study." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in the study." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in the study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "No mention of API costs, tokens consumed, or any cost metrics for the evaluation." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No computational budget or hardware information provided." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "Each model appears to have been tested once with each prompt. No repeated runs across seeds or temperature settings are reported." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The paper does not state how many times each prompt was run. It appears to be a single run, but this is never explicitly stated." 
307 }, 308 "hyperparameter_search_budget": { 309 "applies": false, 310 "answer": false, 311 "justification": "No hyperparameter tuning — models are used as-is via their default interfaces." 312 }, 313 "best_config_selection_justified": { 314 "applies": false, 315 "answer": false, 316 "justification": "No configuration selection — models are used with default settings." 317 }, 318 "multiple_comparison_correction": { 319 "applies": false, 320 "answer": false, 321 "justification": "No statistical tests performed, so no multiple comparison issue." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": false, 325 "answer": false, 326 "justification": "The paper evaluates existing LLMs, not the authors' own system. No self-comparison bias." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": false, 330 "answer": false, 331 "justification": "All models are given the same prompts with no compute budget differences to compare." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "The paper designs its own security checklist but does not discuss whether checklist compliance actually measures real-world security. No validation of the evaluation instrument itself." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No scaffolding involved — models are queried directly." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether models may have been trained on security evaluation patterns or similar e-commerce authentication examples." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the prompts (which include security nudges like 'following industry-standard security practices') leak evaluation criteria to the models." 
354 }, 355 "non_independence_addressed": { 356 "applies": false, 357 "answer": false, 358 "justification": "Custom prompts are used rather than a benchmark dataset, so train/test independence in the traditional sense does not apply." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention methods applied." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "None of the tested LLMs fully align with industry security best practices for web application code generation.", 370 "evidence": "Table IV and V show all models fail in multiple security categories. All score 0/12 on HTTP security headers. Best authentication score is 3/11 (Grok). Section IV-A details specific failures.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "All LLMs implement SQL injection protection through parameterized queries.", 375 "evidence": "Table IV shows all 5 models use parameterized queries and properly escape special characters (Section IV-A4).", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "No LLM implements HTTP security headers (CSP, X-Frame-Options, HSTS, etc.).", 380 "evidence": "Table IV shows all 12 HTTP security header parameters are 'No' for all 5 models. 
Table V confirms 0/12 across all models.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Claude and DeepSeek generated code with extreme risk due to missing secure cookie flags.", 385 "evidence": "Figure 1a and Table IV show Claude and DeepSeek lack Secure, HttpOnly, and SameSite cookie flags, rated as extreme risk in Table III.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Human expertise is crucial for secure deployment of LLM-generated code.", 390 "evidence": "Section V discusses this qualitatively based on the findings, but no empirical evidence comparing human-reviewed vs unreviewed code is presented.", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Single-run evaluation with no repetition", 397 "detail": "Each model appears to be tested once per prompt with no repeated runs. LLM outputs are stochastic — a single run cannot establish reliable security profiles. The same prompt could yield different code with different security properties." 398 }, 399 { 400 "flag": "Extremely narrow test scope presented as general evaluation", 401 "detail": "Only 4 prompts for one scenario (PHP/MySQL e-commerce authentication) are used, but the title and conclusions imply general findings about LLM-generated web application code security. Results may not generalize to other languages, frameworks, or application types." 402 }, 403 { 404 "flag": "No inter-rater reliability for manual evaluation", 405 "detail": "The security evaluation appears to be conducted by the authors manually, but there is no mention of how many evaluators participated, whether there was independent evaluation, or how disagreements were resolved." 406 }, 407 { 408 "flag": "Evaluation instrument not validated", 409 "detail": "The 48-parameter security checklist is created by the authors without reference to validation. No comparison with established evaluation frameworks beyond citing OWASP and NIST at a high level." 
410 }, 411 { 412 "flag": "Prompts include security nudges that bias results upward", 413 "detail": "The prompts explicitly mention 'secure', 'industry-standard security practices', and 'robust validation' — yet models still fail. This means real-world usage without such nudges would likely produce even worse results, but this is not discussed." 414 }, 415 { 416 "flag": "No limitations section", 417 "detail": "The paper has no limitations or threats-to-validity discussion despite significant methodological constraints (single run, narrow scope, unvalidated instrument, no inter-rater reliability)." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Do users write more insecure code with AI assistants?", 423 "authors": ["N. Perry", "M. Srivastava", "D. Kumar", "D. Boneh"], 424 "year": 2023, 425 "relevance": "Key prior work on security implications of AI code assistants, finding AI-assisted developers produce more vulnerabilities while being more confident." 426 }, 427 { 428 "title": "Security weaknesses of Copilot generated code in GitHub", 429 "authors": ["Y. Fu", "P. Liang", "A. Tahir", "Z. Li", "M. Shahin", "J. Yu", "J. Chen"], 430 "year": 2023, 431 "arxiv_id": "2310.02059", 432 "relevance": "Quantifies security vulnerabilities in GitHub Copilot-generated code (32.8% Python, 24.5% JavaScript)." 433 }, 434 { 435 "title": "How secure is code generated by ChatGPT?", 436 "authors": ["R. Khoury", "A. R. Avila", "J. Brunelle", "B. M. Camara"], 437 "year": 2023, 438 "relevance": "Examines ChatGPT's ability to generate secure code across 5 languages, finding only 5/21 programs initially secure." 439 }, 440 { 441 "title": "LLMs in web development: Evaluating LLM-generated PHP code unveiling vulnerabilities and limitations", 442 "authors": ["R. Tóth", "T. Bisztray", "L. Erdődi"], 443 "year": 2024, 444 "relevance": "Prior work evaluating security of GPT-4 generated PHP code, finding 11.56% of sites could be compromised." 
445 }, 446 { 447 "title": "Large language model assisted software engineering: prospects, challenges, and a case study", 448 "authors": ["L. Belzner", "T. Gabor", "M. Wirsing"], 449 "year": 2023, 450 "relevance": "Discusses LLMs as tools for software engineering operations including code generation." 451 } 452 ] 453 }