scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25183B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "The Hidden Risks of LLM-Generated Web Application Code: A Security-Centric Evaluation of Code Generation Capabilities in Large Language Models",
      6     "authors": [
      7       "Swaroop Dora",
      8       "Deven Lunkad",
      9       "Naziya Aslam",
     10       "S. Venkatesan",
     11       "Sandeep Kumar Shukla"
     12     ],
     13     "year": 2025,
     14     "venue": "arXiv",
     15     "arxiv_id": "2504.20612",
     16     "doi": null
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "Abstract claims code is 'insecure' but only measures compliance with a custom checklist, not actual vulnerabilities or exploitability. Uses proxy outcome (compliance) as evidence for actual security.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": false,
     28         "answer": false,
     29         "justification": "No causal claims are made; paper is comparative evaluation only.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Tests one domain (e-commerce auth), one stack (PHP/MySQL/HTML), and four specific prompts, but generalizes findings to 'LLMs' broadly. The title claims more than the tested scope supports.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No discussion of alternative explanations. For example, doesn't explore whether prompt design biased results, or whether some 'failures' represent deliberate security trade-offs.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Claims to measure 'security' but measures compliance with a custom checklist. Does not distinguish between checklist compliance and actual exploitable vulnerabilities or real-world security impact.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations section exists. Discussion (Section V) addresses recommendations but not study limitations.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No specific threats to validity discussed. No statements about sample size adequacy, prompt selection bias, or evaluation methodology reliability.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No explicit boundaries stated on what results do NOT show (e.g., other domains, other prompts, actual exploitability, different languages).",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "No funding acknowledgment section present in paper.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Institutional affiliations listed for all authors (IIIT Allahabad, IIT Kanpur), though no statement regarding relationships with evaluated LLM providers.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "No funder identified.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement present.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "'Security,' 'secure code,' and 'web application code' are used without formal definitions. Security is operationalized only through the checklist, not conceptually defined.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Paper explicitly states three contributions: security checklist, comparative analysis of five LLMs, and risk assessment. Contributions are clear.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section II cites Toth et al., Perry et al., and Khoury et al., and explains how this work differs (systematic evaluation of authentication, session management, headers across multiple LLMs).",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "LLM-generated code evaluated is not released. Paper shows only yes/no compliance in Table IV, not actual code samples.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Generated code samples are not provided for independent verification or future research.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Model versions (Table I) are specified, but inference parameters (temperature, top-p, max tokens) are not mentioned, preventing exact reproduction.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Prompts (Table II) are provided, but the evaluation methodology itself is not transparent. How compliance was determined (manual review, automated, criteria) is not specified.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No confidence intervals or error bars reported. Results are binary yes/no compliance with no uncertainty quantification.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance testing performed. Differences between models are presented as counts (3/11, 5/10) without statistical analysis.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "No effect sizes reported. Table V shows compliance ratios (e.g., 3/11) without analysis of practical significance.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "Sample of 5 models × 4 prompts × 1 run is not justified. No power analysis or sample size determination provided.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Only one evaluation run per model shown. No replication or variance across multiple runs reported.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "No baseline comparisons to hand-written code, security-focused frameworks, or human expert code provided.",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": false,
    187           "answer": false,
    188           "justification": "No baselines included.",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "No ablation study. Discussion mentions that models might improve with explicit security prompts but this is not tested.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "Single metric type: binary yes/no compliance across six domains. Multiple dimensions measured but same metric throughout.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "Evaluation methodology is opaque. Unclear if manual code review by authors or automated checking; no inter-rater reliability reported.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": false,
    211           "answer": false,
    212           "justification": "Not a prediction task; NA.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table IV provides category-by-category breakdown across 6 security domains (Authentication, Input Validation, Session, Storage, Error Handling, HTTP Headers).",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Failures are counted in Table IV but not illustrated with code examples or detailed failure explanations.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "All models show failures across multiple security parameters. Negative results are comprehensively reported.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Table I specifies model versions: GPT 4o, DeepSeek v3, Claude 3.5 Sonnet, Gemini 2.0 Flash Experimental, Grok 3. Specific enough for identification.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Table II provides actual prompts used (4 prompts for e-commerce auth system). Prompts are concrete, not templates.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "No inference parameters reported (temperature, top-p, max_tokens, presence_penalty, etc.).",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "Evaluating black-box LLM APIs with no special scaffolding; NA.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "No documentation of preprocessing steps. Unclear if generated code was filtered, cleaned, or modified before evaluation.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "Generated code outputs are not provided, preventing independent verification of compliance assessments.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": false,
    276           "justification": "Process is vaguely described as prompting models and evaluating code, but lacks specifics on timing, environment, any filtering, or iteration.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants; NA.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "Pipeline (prompts → model outputs → checklist evaluation → results) is outlined but not detailed. Evaluation criteria and methodology are opaque.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "Training data cutoffs for evaluated models (GPT 4o, Claude 3.5, etc.) are not stated. The paper does not discuss whether the security standards under test could have been in training data.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "No discussion of whether security best practices or test prompts could have appeared in model training data.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Not evaluating on standard benchmarks; NA.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants; NA.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants; NA.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants; NA.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants; NA.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants; NA.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants; NA.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants; NA.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No inference cost or API pricing information provided for running evaluations.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Total computational cost or budget for this evaluation not stated.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "None of the evaluated LLMs fully align with industry security best practices",
    375       "evidence": "Table IV shows all five models fail multiple security parameters across six domains; Table V shows none achieve compliance on HTTP security headers",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "All five LLMs fail to implement HTTP security headers (CSP, X-Frame-Options, HSTS, Referrer-Policy, Permissions-Policy)",
    380       "evidence": "Table IV, HTTP Security Headers section shows all models with 0 out of 12 parameters implemented",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "No LLM supports multi-factor authentication (MFA)",
    385       "evidence": "Table IV, MFA row shows 'No' for all five models (ChatGPT, DeepSeek, Claude, Gemini, Grok)",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "All models implement parameterized queries for SQL injection protection",
    390       "evidence": "Table IV, SQL Injection Protection section shows 'Yes' for all models on 'Parameterized Queries Used' and 'Special characters properly escaped'",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Claude fails even in secure storage (password hashing)",
    395       "evidence": "Table IV shows Claude has 'NA' for hashing algorithm and no salted hashes, while all other models specify bcrypt, Argon2, or use salting",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "DeepSeek and Claude-generated code carries extreme security risk",
    400       "evidence": "Figure 1a shows both models in extreme risk category, but no actual exploits demonstrated",
    401       "supported": "moderate"
    402     },
    403     {
    404       "claim": "Human expertise is crucial to ensure secure code deployment",
    405       "evidence": "Abstract and Discussion state this as conclusion, but no empirical comparison to automated testing or human review effectiveness provided",
    406       "supported": "weak"
    407     },
    408     {
    409       "claim": "Developers using LLM-generated code face critical vulnerabilities if code is deployed without review",
    410       "evidence": "Implied throughout paper via risk assessment, but no actual vulnerability disclosure or exploitation demonstration provided",
    411       "supported": "moderate"
    412     }
    413   ],
    414   "methodology_tags": [
    415     "benchmark-eval"
    416   ],
    417   "key_findings": "The paper evaluates five LLMs (ChatGPT, Claude, DeepSeek, Gemini, Grok) on their ability to generate secure web application code for an e-commerce authentication system using a custom 40+ parameter security checklist. All models fail to implement HTTP security headers (CSP, HSTS, etc.), and none support multi-factor authentication. While some models perform better on specific domains (Gemini on authentication, Claude on input validation), none achieve comprehensive security compliance. The authors conclude human review remains essential before production deployment, though the evaluation scope is limited to one application domain, one tech stack, and four specific prompts.",
    418   "red_flags": [
    419     {
    420       "flag": "No actual vulnerability demonstration",
    421       "detail": "Paper measures checklist compliance but does not demonstrate exploitable vulnerabilities or prove failures result in insecure systems. Proxy outcome (compliance) used as evidence for actual security."
    422     },
    423     {
    424       "flag": "Single evaluation run with no variance",
    425       "detail": "Each model evaluated once with four prompts; no replication, no error bars, no statistical testing of differences between models."
    426     },
    427     {
    428       "flag": "Checklist validity not established",
    429       "detail": "Security parameters presented without validation that compliance failures actually matter to real-world security. No justification for why each parameter is critical."
    430     },
    431     {
    432       "flag": "Opaque evaluation methodology",
    433       "detail": "Unclear how compliance was determined (manual code review vs. automated checking). No inter-rater reliability, no documented evaluation criteria, no reproducibility."
    434     },
    435     {
    436       "flag": "Prompt selection confound",
    437       "detail": "Discussion acknowledges models might improve with explicit security prompts, but this is not tested. Results may be artifacts of limited prompt engineering."
    438     },
    439     {
    440       "flag": "Severe scope limitations",
    441       "detail": "Only one domain (e-commerce auth), one tech stack (PHP/MySQL/HTML), four specific prompts. Generalization to 'LLMs broadly' is unjustified."
    442     },
    443     {
    444       "flag": "No baseline comparisons",
    445       "detail": "No comparison to human-written code, security frameworks, or expert implementations. No context for whether findings are concerning or expected."
    446     },
    447     {
    448       "flag": "Arbitrary risk scoring",
    449       "detail": "Risk = Likelihood × Impact values in Table III assigned without validation or justification. Likelihood/impact ratings appear subjective."
    450     },
    451     {
    452       "flag": "Reproducibility blocked",
    453       "detail": "Generated code samples not released. Others cannot verify compliance assessments or replicate evaluation."
    454     },
    455     {
    456       "flag": "No statistical analysis",
    457       "detail": "Results presented as binary counts (3/11, 5/10) with no significance testing, confidence intervals, or analysis of meaningful differences."
    458     }
    459   ],
    460   "cited_papers": [
    461     {
    462       "title": "LLMs in web development: Evaluating llm-generated php code unveiling vulnerabilities and limitations",
    463       "relevance": "Prior evaluation of security in LLM-generated code (PHP focus); establishes baseline for this work"
    464     },
    465     {
    466       "title": "Do users write more insecure code with ai assistants?",
    467       "relevance": "User study on security implications of AI code assistants; demonstrates developers trust AI outputs despite security risks"
    468     },
    469     {
    470       "title": "How secure is code generated by chatgpt?",
    471       "relevance": "Prior ChatGPT code security evaluation across multiple languages; foundational work this paper builds on"
    472     },
    473     {
    474       "title": "NIST Cybersecurity Framework 2.0",
    475       "relevance": "Industry standard for security evaluation; paper uses NIST guidelines as reference for security parameters"
    476     },
    477     {
    478       "title": "OWASP Top 10",
    479       "relevance": "OWASP web application security ranking; paper evaluates compliance with OWASP vulnerability categories"
    480     }
    481   ],
    482   "engagement_factors": {
    483     "practical_relevance": {
    484       "score": 2,
    485       "justification": "Practitioners do use LLMs for code generation, but paper offers limited actionable guidance beyond 'review code' and 'improve prompts.'"
    486     },
    487     "surprise_contrarian": {
    488       "score": 2,
    489       "justification": "Confirms known findings that LLMs generate insecure code (Perry, Khoury), but systematic multi-LLM comparison adds modest novelty."
    490     },
    491     "fear_safety": {
    492       "score": 2,
    493       "justification": "Raises security concerns but frames them via compliance checklist rather than actual exploits. Risk assessment is somewhat alarmist."
    494     },
    495     "drama_conflict": {
    496       "score": 1,
    497       "justification": "Straightforward evaluation with expected results; no controversy, no surprising disparities, no drama."
    498     },
    499     "demo_ability": {
    500       "score": 1,
    501       "justification": "Paper presents results only; no interactive tool, benchmark, or framework for readers to try."
    502     },
    503     "brand_recognition": {
    504       "score": 1,
    505       "justification": "IIIT Allahabad and IIT Kanpur are respectable but not top-tier in security research. No famous labs or major industry involvement."
    506     }
    507   },
    508   "hn_data": {
    509     "threads": [
    510       {
    511         "hn_id": "43991256",
    512         "title": "LLMs get lost in multi-turn conversation",
    513         "points": 374,
    514         "comments": 259,
    515         "url": "https://news.ycombinator.com/item?id=43991256",
    516         "created_at": "2025-05-15T02:28:42Z"
    517       },
    518       {
    519         "hn_id": "43369815",
    520         "title": "A Proof of the Collatz Conjecture",
    521         "points": 6,
    522         "comments": 3,
    523         "url": "https://news.ycombinator.com/item?id=43369815",
    524         "created_at": "2025-03-15T03:46:44Z"
    525       },
    526       {
    527         "hn_id": "43249977",
    528         "title": "A Proof of the Collatz Conjecture",
    529         "points": 4,
    530         "comments": 2,
    531         "url": "https://news.ycombinator.com/item?id=43249977",
    532         "created_at": "2025-03-04T03:36:15Z"
    533       },
    534       {
    535         "hn_id": "45392597",
    536         "title": "Fast and Accurate Long Text Generation with Few-Step Diffusion Language Models",
    537         "points": 4,
    538         "comments": 1,
    539         "url": "https://news.ycombinator.com/item?id=45392597",
    540         "created_at": "2025-09-27T01:22:24Z"
    541       },
    542       {
    543         "hn_id": "35475791",
    544         "title": "Eight things to know about large language models",
    545         "points": 3,
    546         "comments": 0,
    547         "url": "https://news.ycombinator.com/item?id=35475791",
    548         "created_at": "2023-04-06T23:11:43Z"
    549       },
    550       {
    551         "hn_id": "35444967",
    552         "title": "Eight Things to Know about Large Language Models",
    553         "points": 3,
    554         "comments": 0,
    555         "url": "https://news.ycombinator.com/item?id=35444967",
    556         "created_at": "2023-04-04T19:46:22Z"
    557       },
    558       {
    559         "hn_id": "43894376",
    560         "title": "CrashFixer: A crash resolution agent for the Linux kernel",
    561         "points": 2,
    562         "comments": 0,
    563         "url": "https://news.ycombinator.com/item?id=43894376",
    564         "created_at": "2025-05-05T12:31:05Z"
    565       },
    566       {
    567         "hn_id": "31062922",
    568         "title": "Are You Muted?: A Privacy Analysis of Mute Buttons in Video Conferencing",
    569         "points": 2,
    570         "comments": 1,
    571         "url": "https://news.ycombinator.com/item?id=31062922",
    572         "created_at": "2022-04-17T18:16:21Z"
    573       },
    574       {
    575         "hn_id": "44652385",
    576         "title": "Fixed point thm in metric spaces and its application to the Collatz conjecture",
    577         "points": 1,
    578         "comments": 0,
    579         "url": "https://news.ycombinator.com/item?id=44652385",
    580         "created_at": "2025-07-22T20:07:15Z"
    581       }
    582     ],
    583     "top_points": 374,
    584     "total_points": 399,
    585     "total_comments": 266
    586   }
    587 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs