scan.json (32010B)
1 { 2 "paper": { 3 "title": "ShadowCode: Towards (Automatic) External Prompt Injection Attack against Code LLMs", 4 "authors": [ 5 "Yuchen Yang", 6 "Yiming Li", 7 "Hongwei Yao", 8 "Bingrun Yang", 9 "Yiling He", 10 "Tianwei Zhang", 11 "Dacheng Tao", 12 "Zhan Qin" 13 ], 14 "year": 2024, 15 "venue": "arXiv", 16 "arxiv_id": "2407.09164" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "ShadowCode generates concise 12-token non-functional perturbations that manipulate Code LLMs into producing attacker-specified malicious code during completion, achieving up to 97.9% ASR on open-source models (CodeGemma-2b/7b, CodeGeeX2-6b) and over 90% on commercial tools (GitHub Copilot, CodeGeeX application). The attack transfers effectively from open-source surrogate models to black-box commercial systems, and resists potential defenses including semantic analysis and character removal. Ablation studies show that forward reasoning enhancement and keyword-based perturbation design are both critical components.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The abstract states 'The code is available at https://github.com/LianPing-cyber/ShadowCodeEPI' providing a working repository URL." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper uses publicly available benchmarks: Humaneval, Humaneval-x, CodeXGLUE, MBPP, and Eval-Plus (Table 1). The 13 malicious objectives and 31 threat cases are described in Section 4.1 and Appendix A with enough detail to reconstruct." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "Section 4.1 mentions 'NVIDIA RTX A6000 Graphics Card' and 'float16 precision format without quantization' but provides no requirements.txt, Python version, library versions, or dependency specifications." 
38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "While code is released and hyperparameters are detailed in Section 4.1, the paper itself contains no step-by-step reproduction instructions, no README-style commands, and no 'Reproducing Results' section." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results in Tables 2-4 and Figures 6, 10 are reported as point estimates (e.g., '82.0%', '86.6%') with no confidence intervals, error bars, or ± notation." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims ShadowCode 'significantly outperforms the other baseline methods' (Section 4.2) based solely on comparing ASR numbers. No statistical significance tests (p-values, t-tests, etc.) are reported." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "The paper reports absolute ASR values with baseline context throughout, e.g., Table 2 shows ShadowCode at 82.0-86.6% vs C-GCG at 41.9-44.6% and IPI-M at 11.2-26.2%, and Table 3 provides NBR/STF comparisons across methods." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The paper uses 14,273 data items across 7 sub-datasets (Table 1) and 620 manually tested items for commercial evaluation, but never justifies why these sizes were chosen or discusses statistical power." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "No standard deviation, variance, or spread measures are reported. The random seed is fixed at 100 (Section 4.1) suggesting single-run results. No multi-run aggregation is described." 
70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "Section 4.1 describes two baselines: C-GCG (adapted from GCG [26]) and IPI-M (manually designed IPI based on [18]). A 'No Attack' reference is also included in Table 2." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "GCG (Zou et al., 2023) and indirect prompt injection (Greshake et al., 2023) are recent and represent the closest existing work for this attack paradigm." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Section 4.5 provides ablation studies for keyword-based design (0 vs 1 vs 2 keywords, Figure 10), forward reasoning enhancement (Figure 11), and Top-k value effects. Each component's contribution is measured independently." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "The paper uses five evaluation metrics: ASR (attack success rate), NBR (not bad cases rate), STF (selective threat factor), NIT (number of injected tokens), and NIC (number of injected chars), defined in Section 4.1 and Equation 7." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "All evaluation is automated. ASR is measured by functional equivalence to target code (Section 4.1). No human evaluation of attack outputs is performed. The commercial evaluation (Section 4.3) involves manual testing of 620 items but this is manual execution, not human judgment of output quality." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "Perturbations are optimized on simulated code contexts (Section 3.3) using a surrogate model, then tested on separate code completion benchmark datasets (Humaneval, MBPP, etc.). The optimization and evaluation data are distinct." 
102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Table 2 breaks results down by model, dataset, and programming language. Table 3 provides per-language NBR/STF. Figure 14 shows per-case ASR across all 13 objectives. Table 4 separates gray-box and black-box scenarios." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 4.2 notes 'ShadowCode is relatively less effective on CodeXGLUE' due to data complexity. Appendix B discusses that ST2 is 'the most challenging' case and CWE22 is 'difficult for attackers to achieve.' Figure 14 shows per-case variation including low-performing cases." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Section 4.5 reports that 0-keyword perturbation 'struggles to achieve an ASR above 80%' even with high computational power. Figure 11 shows low ASR (<20%) without FRE even with low loss. Section 4.2 discusses reduced effectiveness on complex datasets." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims of 'up to 97.9% ASR' on open-source models and 'over 90% ASR' on commercial applications are supported by Table 2 (CodeGemma-7b Python at 97.9%) and Table 4 (CodeGeeX Python at 93.3%, Copilot Python at 90.4%). The '12-token' claim is supported by the default 10-token perturbation + 2 keywords." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "Ablation studies in Section 4.5 make causal claims (FRE and keyword-based design improve ASR). The ablation design uses controlled single-variable manipulation: Figures 10-11 isolate individual components while holding others constant, which is adequate for these causal claims." 
129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The conclusion states 'Our work alerted to a new serious security threat in the use of Code LLMs' broadly, but testing covers only 3 open-source models from 2 families and 2 commercial applications. The title likewise makes claims about 'Code LLMs' in general, going beyond the 5 systems tested. Modern models like GPT-4, Claude, or Llama-based code models are not evaluated." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper does not consider alternative explanations for why the attack succeeds. For example, it does not discuss whether the success is due to specific model architectures, training data composition, or inherent properties of autoregressive generation. Section 5 discusses limitations but not alternative explanations for the observed results." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "ASR measures whether generated code has equivalent functionality to the target malicious code at the correct position (Section 4.1). This directly corresponds to the paper's claim of achieving malicious objectives — the proxy and outcome are tightly aligned." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": true, 150 "justification": "Section 4.1 specifies exact model versions: CodeGemma-2b, CodeGemma-7b, and CodeGeeX2-6b. For commercial applications, CodeGeeX and GitHub Copilot are named, though the paper acknowledges the exact underlying model versions are unknown (which is the point of gray/black-box evaluation)." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": false, 155 "justification": "The paper provides example code contexts (Figures 3, 5, 7, 13) but not the full set of code contexts/inputs used across all 31 threat cases and 14,273 data items.
The code simulation structure is described algorithmically but complete prompt texts are not provided in the paper." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 4.1 reports all key hyperparameters: Top-k=400, r=0.4, h=2, keyword count=2, perturbation length=10, random seed=100, 5 noisy code samples, float16 precision, beam search size k for Algorithm 1." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. The paper evaluates direct code completion by Code LLMs, not multi-step agent workflows." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 3.3 documents code contextual simulation construction (conditional code, noise code, position/target code). Section 4.1 and Table 1 detail how malicious objectives are converted into 31 threat cases across 3 languages, with dataset breakdowns (14,273 total items: 2,132 C/C++, 7,965 Python, 4,176 Java)." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "Section 5 'Potential Limitations and Future Work' provides substantive discussion of three specific limitations: keyword selection strategy, lack of natural-looking perturbation enforcement, and restriction to code completion scenarios." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Section 5 discusses specific threats: keyword tokens selected only from output tuple via simple grid search 'may overlook potentially better solutions'; perturbations are not mandated to be entirely natural-looking; the attack is only evaluated on code completion, not other programming tasks." 
183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "Section 5 explicitly states what was NOT tested: other code tasks ('code summarization and code translation'), natural-looking perturbation generation, and more efficient keyword optimization. Section 4.1 bounds the evaluation to 3 languages and specific model families." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "Only aggregated ASR, NBR, and STF values are reported. Per-item attack results, generated code outputs, and detailed logs are not available for independent verification." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 4.1 describes data sources: 13 malicious objectives from CWE top-10 KEV and prior work, standard benchmarks (Humaneval, MBPP, etc.). Table 1 provides item counts per language. Appendix A details each threat case (Table 6)." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data sources are standard publicly available benchmarks and CWE vulnerability definitions." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The pipeline from malicious objective → output tuple (Section 3.3) → code contextual simulation → perturbation optimization (Algorithm 1) → evaluation on benchmark datasets is fully documented. Dataset counts and language breakdowns are provided in Table 1." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "The Acknowledgments section thanks two professors for comments but does not disclose any funding sources, grants, or sponsoring agencies." 
217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Author affiliations are clearly listed: Zhejiang University and Nanyang Technological University. The evaluated models (CodeGemma by Google, CodeGeeX by Tsinghua-affiliated, Copilot by GitHub/OpenAI) are from different organizations than the authors." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "No funding is disclosed, so independence of funders cannot be assessed. Without a funding statement, this criterion cannot be satisfied." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests statement or financial interests declaration appears anywhere in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": false, 237 "answer": false, 238 "justification": "This paper tests an attack method against Code LLMs, not model capability on benchmarks. The benchmarks serve as realistic code contexts for attack evaluation, not as capability measures. Similar to a red-teaming study." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": false, 242 "answer": false, 243 "justification": "The paper evaluates attack success rates, not model knowledge. Whether models have seen benchmark code is a confound for attack evaluation but not a contamination concern in the standard sense." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": false, 247 "answer": false, 248 "justification": "As an attack evaluation rather than a model capability benchmark, contamination is not the primary concern. The attack aims to manipulate outputs regardless of whether the model has seen the input code before." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study." 
256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants in this study." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": true, 292 "justification": "Section 4.1 reports perturbation generation time: 'approximately 0.5-1 hours for CodeGemma-2b and 2-3 hours for both CodeGemma-7b and CodeGeeX-6b' on NVIDIA RTX A6000." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": true, 297 "justification": "Section 4.1 states: 'We run Algorithm 1 on an NVIDIA RTX A6000 Graphics Card... employing a float16 precision format without quantization' with generation times for each model." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "Section 4.1 states 'the random seed for initializing induced perturbations is set to 100' — a single fixed seed. No results across multiple seeds and no seed sensitivity analysis are provided." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The paper does not explicitly state how many experimental runs produced the reported ASR values.
The fixed seed of 100 suggests single-run results, but this is not stated." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "Default hyperparameters are stated (Top-k=400, r=0.4, h=2, etc.) but the paper does not report how these defaults were selected, how many configurations were tried, or the total compute spent on hyperparameter tuning." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": false, 319 "justification": "The ablation study (Section 4.5) varies Top-k and keyword count but the selection of default values (Top-k=400, 2 keywords, r=0.4) is described as 'default parameter values' without justification for why these specific values were chosen." 320 }, 321 "multiple_comparison_correction": { 322 "applies": false, 323 "answer": false, 324 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "Both baselines (C-GCG and IPI-M) are the authors' own implementations/adaptations of existing methods. The paper does not acknowledge potential bias from implementing competitor baselines, as documented by Lucic et al. (2018)." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "ShadowCode uses 12 tokens (10 perturbation + 2 keyword) vs C-GCG at 12 prefix tokens vs IPI-M at <30 tokens, controlling for perturbation size. However, the computational cost of generating perturbations (0.5-3 hours on A6000) is not compared across methods." 335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "The paper does not discuss whether ASR on benchmark code completion datasets (Humaneval, MBPP, etc.) 
actually represents real-world attack success in production code environments with more complex contexts and workflows." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "No agentic scaffolding is involved. The evaluation tests direct code completion without multi-step scaffolding." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "The paper does not discuss whether the Code LLMs were trained on the benchmark datasets used for evaluation (Humaneval published 2021, MBPP 2021, etc.), which could affect attack success rates if models have strong priors about 'correct' completions." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the evaluation setup provides the model with information that would not be available in a real-world attack scenario." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "No analysis of whether benchmark examples share structural similarities or whether optimization on simulated contexts could overfit to patterns present in the evaluation benchmarks." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No concrete leakage detection or prevention method is applied. The paper does not test for overlap between training data and evaluation benchmarks." 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "ShadowCode achieves up to 97.9% attack success rate on open-source Code LLMs using 12-token non-functional perturbations.", 373 "evidence": "Table 2 shows CodeGemma-7b achieves 97.9% ASR on Humaneval (Python) and 97.4% on Eval-Plus (Python). 
Average ASR across all models and datasets ranges from 82.0% to 86.6%.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "ShadowCode successfully transfers to commercial Code LLM-integrated applications with over 90% ASR.", 378 "evidence": "Table 4 reports 93.3% ASR on CodeGeeX (gray-box, Python) and 90.4% on GitHub Copilot (black-box, Python). Java and C/C++ results range from 67.7% to 80.4%.", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "ShadowCode significantly outperforms existing attack methods (C-GCG and IPI-M) across all evaluation settings.", 383 "evidence": "Table 2 shows ShadowCode average ASR of 82.0-86.6% vs C-GCG at 41.9-44.6% and IPI-M at 11.2-26.2%. Table 3 shows ShadowCode leads in both NBR and STF metrics across all languages.", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Forward reasoning enhancement (FRE) and keyword-based perturbation design (KPD) are both necessary for ShadowCode's effectiveness.", 388 "evidence": "Section 4.5 and Figures 10-11: removing keywords drops ASR by ~10% (1-keyword) or prevents reaching 80% ASR even with Top-k=600 (0-keyword). Without FRE, ASR stays below 20% even when loss is lower than with FRE.", 389 "supported": "strong" 390 }, 391 { 392 "claim": "ShadowCode perturbations are semantically indistinguishable from benign code via embedding-layer analysis.", 393 "evidence": "Section 4.4 and Figure 8: t-SNE visualization of 1,640 code segments (820 benign, 820 injected) at CodeGemma-2b layer 17 shows features are 'almost completely intertwined.'", 394 "supported": "weak" 395 }, 396 { 397 "claim": "ShadowCode resists character removal defenses, maintaining sufficient ASR under 50% removal rate.", 398 "evidence": "Figure 9 shows ASR remains high at 50% character removal rate. 
Table 5 shows ASR of 65-90% when perturbations use non-comment forms (variable assignment, output content).", 399 "supported": "moderate" 400 } 401 ], 402 "red_flags": [ 403 { 404 "flag": "No error bars or uncertainty quantification", 405 "detail": "All results across Tables 2-4 and Figures 6, 10 are single point estimates. No confidence intervals, standard deviations, or multi-run statistics are provided despite the stochastic nature of the optimization process (random seed, greedy search)." 406 }, 407 { 408 "flag": "Single random seed evaluation", 409 "detail": "All experiments use a fixed random seed of 100 (Section 4.1). No seed sensitivity analysis is conducted, so it is impossible to assess how robust the results are to different initializations." 410 }, 411 { 412 "flag": "Self-implemented baselines", 413 "detail": "Both baselines (C-GCG and IPI-M) are the authors' own adaptations of existing methods (GCG and indirect prompt injection). The C-GCG adaptation removes code simulation and enhancement modules, potentially weakening the baseline. IPI-M is a simplified version of existing IPI. No independent baseline implementations are used." 414 }, 415 { 416 "flag": "Selective reporting of best-case results", 417 "detail": "The abstract highlights 'up to 97.9% ASR' and 'over 90% ASR' which are best-case language-specific results (Python). Average results across all languages are lower (84.5% for open-source, 67.7-90.4% for commercial), and some individual cases have considerably lower ASR (Figure 14)." 418 }, 419 { 420 "flag": "Weak defense evaluation", 421 "detail": "The defense evaluation (Section 4.4) only considers t-SNE visualization of embeddings (not a trained classifier) and random character removal. No evaluation against practical defenses like instruction-following guardrails, output safety filters, or structured code analysis tools." 
422 }, 423 { 424 "flag": "Commercial evaluation sample size and methodology", 425 "detail": "Commercial application testing (Table 4) uses only 620 manually tested items from Humaneval/Humaneval-x, which are simpler code contexts. Manual testing introduces potential subjectivity in ASR determination. No description of how 'randomly selected' samples were chosen." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 431 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], 432 "year": 2023, 433 "relevance": "Foundational work on indirect prompt injection against LLM-integrated applications; directly inspires ShadowCode's external prompt injection paradigm." 434 }, 435 { 436 "title": "Universal and transferable adversarial attacks on aligned language models", 437 "authors": ["A. Zou", "Z. Wang", "J. Z. Kolter", "M. Fredrikson"], 438 "year": 2023, 439 "arxiv_id": "2307.15043", 440 "relevance": "GCG attack method that serves as the basis for the C-GCG baseline and informs the greedy search optimization approach." 441 }, 442 { 443 "title": "You autocomplete me: Poisoning vulnerabilities in neural code completion", 444 "authors": ["R. Schuster", "C. Song", "E. Tromer", "V. Shmatikov"], 445 "year": 2021, 446 "relevance": "Pioneering work on backdoor attacks against code completion models; provides the ST0 and ST1 malicious objectives used in ShadowCode evaluation." 447 }, 448 { 449 "title": "Stealthy backdoor attack for code models", 450 "authors": ["Z. Yang", "B. Xu", "J. M. Zhang", "H. J. Kang", "J. Shi", "J. He", "D. Lo"], 451 "year": 2024, 452 "relevance": "Stealthy backdoor attack via dataset poisoning for code models; represents the category of training-time attacks that ShadowCode claims to improve upon." 453 }, 454 { 455 "title": "Multi-target backdoor attacks for code pre-trained models", 456 "authors": ["Y. 
Li", "S. Liu", "K. Chen", "X. Xie", "T. Zhang", "Y. Liu"], 457 "year": 2023, 458 "relevance": "Model poisoning-based backdoor attack on code LLMs; represents training-time attack methods that require model access." 459 }, 460 { 461 "title": "An LLM-assisted easy-to-trigger backdoor attack on code completion models: Injecting disguised vulnerabilities against strong detection", 462 "authors": ["S. Yan", "S. Wang", "Y. Duan", "H. Hong", "K. Lee", "D. Kim", "Y. Hong"], 463 "year": 2024, 464 "relevance": "LLM-assisted backdoor attack on code completion that injects disguised CWE vulnerabilities; closely related attack vector targeting code security." 465 }, 466 { 467 "title": "Large language models for code: Security hardening and adversarial testing", 468 "authors": ["J. He", "M. Vechev"], 469 "year": 2023, 470 "relevance": "Evaluates LLMs for code security including adversarial testing with CWE vulnerabilities; directly relevant to security assessment of code LLMs." 471 }, 472 { 473 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 474 "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"], 475 "year": 2024, 476 "relevance": "Benchmark for indirect prompt injection in LLM agents; shares the attack scenario of compromising users through injected instructions." 477 }, 478 { 479 "title": "CodeAttack: Code-based adversarial attacks for pre-trained programming language models", 480 "authors": ["A. Jha", "C. K. Reddy"], 481 "year": 2023, 482 "relevance": "Adversarial attacks on code models via identifier renaming; represents the adversarial attack category that ShadowCode contrasts with." 483 }, 484 { 485 "title": "Evaluating large language models trained on code", 486 "authors": ["M. Chen", "J. Tworek", "H. Jun"], 487 "year": 2021, 488 "arxiv_id": "2107.03374", 489 "relevance": "Introduces the HumanEval benchmark for code generation used as a primary evaluation dataset in this paper." 
490 }, 491 { 492 "title": "\"Do Anything Now\": Characterizing and evaluating in-the-wild jailbreak prompts on large language models", 493 "authors": ["X. Shen", "Z. Chen", "M. Backes", "Y. Shen", "Y. Zhang"], 494 "year": 2024, 495 "relevance": "Comprehensive study of jailbreak prompts against LLMs; represents the broader prompt injection attack landscape that ShadowCode extends to the code domain." 496 }, 497 { 498 "title": "CodeGemma: Open code models based on Gemma", 499 "authors": ["C. Team", "H. Zhao", "J. Hui"], 500 "year": 2024, 501 "arxiv_id": "2406.11409", 502 "relevance": "CodeGemma model series used as primary evaluation targets (CodeGemma-2b, CodeGemma-7b) in ShadowCode experiments." 503 } 504 ], 505 "engagement_factors": { 506 "practical_relevance": { 507 "score": 2, 508 "justification": "Demonstrates a real attack vector against widely-used code completion tools (Copilot, CodeGeeX) that practitioners should be aware of, but executing the attack requires ML expertise and GPU access." 509 }, 510 "surprise_contrarian": { 511 "score": 1, 512 "justification": "Prompt injection is a known concern; extending it to code completion via non-functional perturbations is incremental rather than paradigm-shifting." 513 }, 514 "fear_safety": { 515 "score": 3, 516 "justification": "Demonstrates 90%+ success rate injecting malicious code (including 'rm -rf /') into GitHub Copilot completions via just 12 tokens hidden in code comments — directly alarming for developer security." 517 }, 518 "drama_conflict": { 519 "score": 1, 520 "justification": "Shows popular commercial tools are vulnerable but does not frame this as controversy or conflict with the tool developers." 521 }, 522 "demo_ability": { 523 "score": 2, 524 "justification": "Code is released on GitHub (https://github.com/LianPing-cyber/ShadowCodeEPI) but requires GPU hardware and ML setup to run."
525 }, 526 "brand_recognition": { 527 "score": 2, 528 "justification": "Attacks GitHub Copilot (OpenAI/GitHub) and CodeGeeX — well-known products — but the authors and their institutions are not high-profile in the broader AI discourse." 529 } 530 } 531 }