scan-v5.json (26749B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Inducing Vulnerable Code Generation in LLM Coding Assistants", 6 "authors": [ 7 "Binqi Zeng", 8 "Quan Zhang", 9 "Chijin Zhou", 10 "Gwihwan Go", 11 "Yu Jiang", 12 "Heyuan Shi" 13 ], 14 "year": 2025, 15 "venue": "arXiv.org", 16 "arxiv_id": "2504.15867", 17 "doi": "10.48550/arXiv.2504.15867" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "The 84.29% ASR claim is supported by Table II, and the 75.92% real-world ASR is supported by Table V, both matching the abstract precisely.", 25 "source": "haiku" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "The ablation study (Table VI) directly tests whether the progressive generation component causes higher transferability, finding a 25.49pp improvement, providing adequate support for the causal design claim.", 31 "source": "haiku" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": false, 36 "justification": "The title and conclusions claim broad impact on 'LLM coding assistants' but all tested models are small open-source LLMs (7b–15b parameters); no closed-source or large-scale models were tested, and the abstract does not bound this scope.", 37 "source": "haiku" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper offers no alternative explanations for why attack sequences work (e.g., whether the effect is purely statistical noise, tokenizer artifacts, or another mechanism); it only hypothesizes why certain LLMs perform worse.", 43 "source": "haiku" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "Attack success rate is measured by directly detecting the target vulnerability pattern in LLM-generated code, which matches the claimed outcome without proxy mismatch.", 49 
"source": "haiku" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": true, 56 "justification": "Section VI contains a dedicated 'Limitations' subsection listing three specific constraints beyond generic disclaimers.", 57 "source": "haiku" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "Limitations enumerate specifically: (1) only open-source LLMs tested, (2) parameter range restricted to 7b–15b, (3) reduced effectiveness against heavily quantized models (e.g., GPTQ Llama2 drops to 29.63% ASR).", 63 "source": "haiku" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper explicitly states it does not cover closed-source LLMs, models outside 7b–15b range, and notes planned future work for these cases.", 69 "source": "haiku" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No funding acknowledgment appears anywhere in the paper.", 77 "source": "haiku" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "Author affiliations (Central South University, Tsinghua University) are listed on the title page.", 83 "source": "haiku" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": false, 87 "answer": false, 88 "justification": "No funding is disclosed, so this criterion is not applicable.", 89 "source": "haiku" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests or financial disclosure statement is present in the paper.", 95 "source": "haiku" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "Key terms such as 'attack sequence,' 'assembled input,' 'attack success rate,' and vulnerability types (via CWE IDs) are defined with sufficient 
precision for the study.", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Three explicit contributions are stated: a new security threat, a practical attack prototype (HACKODE), and empirical evaluation demonstrating substantial impact.", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Section VII relates HACKODE to prior work on LLM-generated code quality, jailbreak attacks, and backdoor attacks, explicitly distinguishing HACKODE's mechanism from each.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": true, 125 "justification": "Source code is linked at https://github.com/HACKODE11/HACKODE in the contributions section.", 126 "source": "haiku" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": false, 131 "justification": "The 35-problem StackOverflow dataset is described as custom-constructed, but there is no explicit confirmation that it is included in the repository or otherwise publicly available.", 132 "source": "haiku" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "Hardware is specified (AMD EPYC 7763, 8× V100, Ubuntu 22.04) but no software dependency specs (Python version, PyTorch version, requirements.txt, or Dockerfile) are provided.", 138 "source": "haiku" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "No step-by-step reproduction instructions appear in the paper; the reader is pointed to a GitHub repository without any procedural detail.", 144 "source": "haiku" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "No confidence intervals or error bars are reported for any 
results in Tables II–VII.", 152 "source": "haiku" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "No statistical significance tests are used despite comparative claims across LLMs and attack variants.", 158 "source": "haiku" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Percentage-point differences are reported (e.g., 25.49pp improvement for progressive generation, 68.57pp for comment vs. variable-name embedding).", 164 "source": "haiku" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "The dataset of 35 StackOverflow problems is described pragmatically but no power analysis or justification for this number is provided.", 170 "source": "haiku" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "No standard deviation or variance across runs is reported; only mean ASR values appear in all results tables.", 176 "source": "haiku" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "HACKODE− (single-phase, no enhancement) serves as an ablation baseline in Table VI, and variable-renaming injection is compared against comment-injection in Table VII.", 184 "source": "haiku" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": true, 189 "justification": "The baselines are internal ablations of the proposed method, making contemporaneity a non-issue; no outdated external systems are used as primary comparators.", 190 "source": "haiku" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section V-D ablates both the progressive generation module (HACKODE vs. HACKODE−) and the injection strategy (code comment vs. 
variable renaming).", 196 "source": "haiku" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "Evaluation uses ASR, iteration count, and token lengths of attack sequences, responses, and inputs.", 202 "source": "haiku" 203 }, 204 "human_evaluation": { 205 "applies": false, 206 "answer": false, 207 "justification": "Human evaluation is not relevant; success is determined by automated detection of vulnerability patterns in generated code.", 208 "source": "haiku" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": true, 213 "justification": "Transferability evaluation (Section V-B) uses five randomly assembled inputs with new instructions, queries, and prompt templates not used during attack sequence generation.", 214 "source": "haiku" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Table II breaks results down by all five vulnerability types (Array Violation, Buffer Overflow, Incorrect Variable, Invalid Validation, Infinite Loop) across each LLM.", 220 "source": "haiku" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "The paper discusses Llama2 failing to generate code on certain questions, GPTQ-quantized Llama2 dropping to 29.63% ASR, and low performance on Incorrect Variable for Mistral (25%).", 226 "source": "haiku" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "Table IV reports GPTQ Llama2 ASR at 29.63%, and Table II reports Incorrect Variable ASR of 25% on Mistral, both explicitly discussed as limitations.", 232 "source": "haiku" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": false, 239 "justification": "Models are named (Llama2-7b, Mistral-7b, CodeLlama-7b, StarChat2-15b) with citations but no specific checkpoint versions, snapshot dates, or HuggingFace commit hashes are 
given.", 240 "source": "haiku" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": true, 245 "justification": "Figure 2 provides three actual prompt templates (PT1–PT3) with structural format and two example instructions (IN1–IN2) with full text.", 246 "source": "haiku" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": false, 251 "justification": "maxStep=500 and k=3 are reported for the attack, but LLM inference hyperparameters (temperature, top-p, repetition penalty) are not — only 'default values' is stated.", 252 "source": "haiku" 253 }, 254 "scaffolding_described": { 255 "applies": true, 256 "answer": true, 257 "justification": "The assembled input construction process (query → search → consolidate → prompt template) is described in detail across Sections III–IV.", 258 "source": "haiku" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": true, 263 "justification": "Data collection steps are described: StackOverflow API, keyword summarization, filtering for domain-specific problems requiring external reference, and manual validation of code examples.", 264 "source": "haiku" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": false, 271 "justification": "The 35-problem dataset and generated attack sequences are not confirmed to be publicly released; only source code for the attack tool is mentioned.", 272 "source": "haiku" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": true, 277 "justification": "The StackOverflow API collection procedure, filtering criteria, and manual validation steps are described in the Experiment Dataset subsection.", 278 "source": "haiku" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants involved; data is programmatically collected from StackOverflow.", 284 "source": "haiku" 285 }, 286 
"data_pipeline_documented": { 287 "applies": true, 288 "answer": true, 289 "justification": "The pipeline from StackOverflow collection → keyword filtering → manual validation → attack sequence generation is described with sufficient detail to understand each stage.", 290 "source": "haiku" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": false, 296 "answer": false, 297 "justification": "This is an adversarial attack paper, not a benchmark capability evaluation; training cutoff is not relevant to the study design.", 298 "source": "haiku" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": false, 302 "answer": false, 303 "justification": "NA for same reason as training_cutoff_stated.", 304 "source": "haiku" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": false, 308 "answer": false, 309 "justification": "NA — the study evaluates an attack mechanism, not model knowledge on benchmarks.", 310 "source": "haiku" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants.", 318 "source": "haiku" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants.", 324 "source": "haiku" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants.", 330 "source": "haiku" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "haiku" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants.", 342 "source": "haiku" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "haiku" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 
"justification": "No human participants.", 354 "source": "haiku" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": false, 361 "justification": "Average iteration counts are reported (179.17) and hardware is specified, but no wall-clock time or cost per attack sequence is given.", 362 "source": "haiku" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "Hardware is described (8× V100 32GB GPUs) but total GPU-hours or compute budget for the full evaluation is not stated.", 368 "source": "haiku" 369 } 370 } 371 } 372 }, 373 "claims": [ 374 { 375 "claim": "HACKODE achieves an average 84.29% attack success rate across four open-source LLMs on 35 StackOverflow problems.", 376 "evidence": "Table II reports per-LLM averages: Llama2-7b 77.14%, Mistral-7b 80.00%, CodeLlama-7b 94.29%, StarChat2-15b 85.71%, with an overall average of 84.29%.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "HACKODE achieves 75.92% ASR on a real-world ChatChat coding assistant application.", 381 "evidence": "Table V reports Mistral-powered assistant at 82.14% and CodeLlama-powered at 69.70%, yielding the stated overall average; experiment used local web pages simulating StackOverflow.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "The two-phase progressive generation improves transferability by 25.49 percentage points over single-phase generation.", 386 "evidence": "Table VI shows HACKODE− at 32.48% vs. 
HACKODE at 57.97% average ASR on randomly assembled inputs not used during generation.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Attack sequences transfer to quantized (4-bit) LLMs with ~50% ASR.", 391 "evidence": "Table IV shows GPTQ average 48.07% and BitsAndBytes average 53.45% across four LLMs, with Llama2-GPTQ as the outlier at 29.63%.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "Code LLMs are more susceptible to HACKODE than general-purpose LLMs.", 396 "evidence": "Table II shows CodeLlama-7b at 94.29% and StarChat2-15b at 85.71%, versus Llama2-7b at 77.14% and Mistral-7b at 80.00%; attributed to general LLMs generating text rather than code in some cases.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "Injecting attack sequences as code comments is substantially more effective than variable renaming (80.00% vs. 11.43% ASR on Mistral-7b).", 401 "evidence": "Table VII reports comment injection at 80.00% and variable renaming at 11.43% on Mistral-7b across all vulnerability types.", 402 "supported": "strong" 403 } 404 ], 405 "methodology_tags": [ 406 "benchmark-eval", 407 "case-study" 408 ], 409 "key_findings": "HACKODE demonstrates that LLM coding assistants using external reference retrieval can be induced to generate vulnerable code by embedding adversarial 'attack sequences' as comments in otherwise correct code examples posted online. Across four open-source LLMs (7b–15b parameters), the attack achieves an average 84.29% ASR with sequences comprising only 3.44% of the assembled input token length, making them inconspicuous to human reviewers. A two-phase progressive generation approach boosts cross-input transferability from 32.48% to 57.97% compared to single-phase generation. 
A real-world experiment on the ChatChat application achieves 75.92% ASR, though it was conducted on local web pages rather than actual internet deployment.", 410 "red_flags": [ 411 { 412 "flag": "Tiny dataset", 413 "detail": "Only 35 StackOverflow problems used; no power analysis or statistical justification for this sample size, making all reported percentages highly sensitive to individual cases." 414 }, 415 { 416 "flag": "No confidence intervals or significance tests", 417 "detail": "All results reported as point estimates only; with n=35 and small per-category counts (3–11 per vulnerability type), the numbers carry high uncertainty that is never quantified." 418 }, 419 { 420 "flag": "Simulated real-world experiment", 421 "detail": "The 'real-world' evaluation uses local web pages, not actual StackOverflow posts; the threat model's assumption that crafted answers will be referenced in practice is not validated." 422 }, 423 { 424 "flag": "Small model scope", 425 "detail": "All tested LLMs are 7b–15b parameter open-source models; no closed-source frontier models (GPT-4, Claude, Gemini), which dominate commercial coding assistants, were evaluated." 426 }, 427 { 428 "flag": "No environment specifications", 429 "detail": "Software dependency versions (PyTorch, transformers, etc.) are not reported, making exact reproduction difficult despite the code release." 430 }, 431 { 432 "flag": "Unverified data release", 433 "detail": "The 35-problem dataset is not confirmed to be publicly available; only the attack tool code is mentioned in the repository link." 434 } 435 ], 436 "cited_papers": [ 437 { 438 "title": "Asleep at the keyboard? Assessing the security of GitHub Copilot's code contributions", 439 "relevance": "Directly assesses Copilot's security in generating code for CWE top-25 vulnerabilities — foundational prior work on LLM-generated insecure code." 
440 }, 441 { 442 "title": "Do users write more insecure code with AI assistants?", 443 "relevance": "User study on how AI coding assistants affect security practices, providing human-factors context for the threat HACKODE poses." 444 }, 445 { 446 "title": "DeceptPrompt: Exploiting LLM-driven code generation via adversarial natural language instructions", 447 "relevance": "Prior adversarial attack on LLM code generation using natural language prompts — most directly comparable attack paper." 448 }, 449 { 450 "title": "Lost at C: A user study on the security implications of large language model code assistants", 451 "relevance": "USENIX Security study on how LLM coding assistants affect developer security practices, directly related to HACKODE's threat model." 452 }, 453 { 454 "title": "Universal and transferable adversarial attacks on aligned language models", 455 "relevance": "Foundational GCG attack paper providing the gradient-based token mutation technique that HACKODE adapts for its attack sequence generation." 456 }, 457 { 458 "title": "Is your code generated by ChatGPT really correct? Rigorous evaluation of large language models for code generation", 459 "relevance": "Evaluates correctness of LLM-generated code, providing context for quality and security limitations HACKODE exploits." 460 }, 461 { 462 "title": "Code Llama: Open foundation models for code", 463 "relevance": "One of the four target LLMs evaluated in the paper; paper describes its architecture and capabilities." 464 }, 465 { 466 "title": "SecurityEval dataset: mining vulnerability examples to evaluate machine learning-based code generation techniques", 467 "relevance": "Security evaluation benchmark for LLM code generation — compared against in motivation for custom dataset construction." 
468 } 469 ], 470 "engagement_factors": { 471 "practical_relevance": { 472 "score": 3, 473 "justification": "Directly demonstrates a concrete threat to developers using LLM coding assistants backed by retrieval from platforms like StackOverflow." 474 }, 475 "surprise_contrarian": { 476 "score": 2, 477 "justification": "The finding that correct-looking code can carry adversarial sequences invisible to humans that redirect LLM generation is counterintuitive." 478 }, 479 "fear_safety": { 480 "score": 3, 481 "justification": "Paper demonstrates an actionable attack vector that could introduce exploitable vulnerabilities (buffer overflows, array violations) into production code via developer tools." 482 }, 483 "drama_conflict": { 484 "score": 2, 485 "justification": "Frames a security arms race between attacker-crafted StackOverflow posts and LLM coding assistant defenses." 486 }, 487 "demo_ability": { 488 "score": 2, 489 "justification": "Code is released and the attack targets open-source LLMs that can be run locally, but requires significant GPU resources (V100s) and setup effort." 490 }, 491 "brand_recognition": { 492 "score": 1, 493 "justification": "Tsinghua University affiliation has some recognition; no major industry lab or high-profile product is involved." 
494 } 495 }, 496 "hn_data": { 497 "threads": [ 498 { 499 "hn_id": "44353071", 500 "title": "Companies should be liable for the serious privacy concerns of LLMs", 501 "points": 9, 502 "comments": 0, 503 "url": "https://news.ycombinator.com/item?id=44353071", 504 "created_at": "2025-06-23T06:41:35Z" 505 }, 506 { 507 "hn_id": "44642760", 508 "title": "Diffusion Beats Autoregressive in Data-Constrained Settings", 509 "points": 5, 510 "comments": 0, 511 "url": "https://news.ycombinator.com/item?id=44642760", 512 "created_at": "2025-07-22T02:45:19Z" 513 }, 514 { 515 "hn_id": "44653340", 516 "title": "Diffusion Beats Autoregressive in Data-Constrained Settings", 517 "points": 3, 518 "comments": 1, 519 "url": "https://news.ycombinator.com/item?id=44653340", 520 "created_at": "2025-07-22T21:43:31Z" 521 }, 522 { 523 "hn_id": "47213997", 524 "title": "Von Neumann on Consciousness in Quantum Mechanics", 525 "points": 3, 526 "comments": 0, 527 "url": "https://news.ycombinator.com/item?id=47213997", 528 "created_at": "2026-03-02T04:46:53Z" 529 }, 530 { 531 "hn_id": "45232738", 532 "title": "Correctness-Guaranteed Code Generation via Constrained Decoding", 533 "points": 3, 534 "comments": 0, 535 "url": "https://news.ycombinator.com/item?id=45232738", 536 "created_at": "2025-09-13T15:13:22Z" 537 }, 538 { 539 "hn_id": "42829895", 540 "title": "HTAP Databases: A Survey", 541 "points": 3, 542 "comments": 0, 543 "url": "https://news.ycombinator.com/item?id=42829895", 544 "created_at": "2025-01-26T13:15:26Z" 545 }, 546 { 547 "hn_id": "43974319", 548 "title": "Can Third-Parties Read Our Emotions?", 549 "points": 2, 550 "comments": 0, 551 "url": "https://news.ycombinator.com/item?id=43974319", 552 "created_at": "2025-05-13T15:56:02Z" 553 }, 554 { 555 "hn_id": "44325072", 556 "title": "Sekai: A Video Dataset Towards World Exploration", 557 "points": 2, 558 "comments": 1, 559 "url": "https://news.ycombinator.com/item?id=44325072", 560 "created_at": "2025-06-20T05:51:05Z" 561 }, 562 { 563 
"hn_id": "44902095", 564 "title": "AlgoTune: Can Language Models Speed Up General-Purpose Numerical Programs?", 565 "points": 1, 566 "comments": 0, 567 "url": "https://news.ycombinator.com/item?id=44902095", 568 "created_at": "2025-08-14T16:04:10Z" 569 }, 570 { 571 "hn_id": "44329807", 572 "title": "SwarmAgentic: Automated Agentic System Generation via Swarm Intelligence", 573 "points": 1, 574 "comments": 0, 575 "url": "https://news.ycombinator.com/item?id=44329807", 576 "created_at": "2025-06-20T17:13:34Z" 577 } 578 ], 579 "top_points": 9, 580 "total_points": 32, 581 "total_comments": 2 582 } 583 }