scan.json (30902B)
1 { 2 "paper": { 3 "title": "ImportSnare: Directed \"Code Manual\" Hijacking in Retrieval-Augmented Code Generation", 4 "authors": ["Kai Ye", "Liangcai Su", "Chenxiong Qian"], 5 "year": 2025, 6 "venue": "Conference on Computer and Communications Security", 7 "arxiv_id": "2509.07941", 8 "doi": "10.1145/3719027.3765161" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "ImportSnare demonstrates that poisoned documentation in RAG databases can induce LLMs to recommend malicious dependencies at >50% attack success rate for popular libraries like matplotlib and seaborn, even with poisoning ratios as low as 0.01% of the total database. The attack exploits a dual trust chain: LLM reliance on RAG-retrieved context and developers' blind trust in LLM suggestions. Cross-platform transferability is demonstrated across Python, Rust, and JavaScript with multiple open-source and closed-source LLMs including GPT-4o, DeepSeek-r1, and Claude 3.5 Sonnet.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper states 'Our datasets and artifacts are available to facilitate future research and reproducibility in our project homepage' and provides the URL https://importsnare.github.io/. However, the abstract says 'we will release' (future tense), creating ambiguity. A project homepage URL is provided." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The Availability section states datasets and artifacts are available at the project homepage. The RAG database sources and query datasets are from publicly available HuggingFace datasets listed in Table 10." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. 
The paper lists model names and attack parameters but does not provide sufficient detail to recreate the full software environment." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. While a project homepage exists, the paper itself contains no README-style instructions or scripts for replicating experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 1-4 and 6-7 are reported as point estimates (e.g., '0.677' ASR) with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": true, 46 "justification": "Table 5 uses Wilcoxon signed-rank tests with Bonferroni correction to assess whether code quality differences between poisoned and clean conditions are statistically significant." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 5 reports mean deviations from clean baselines (e.g., '-0.03', '+1.50') providing effect size context. Tables 1-2 report absolute ASR values and Precision@k which serve as interpretable effect sizes with baseline comparisons." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No power analysis or justification for the number of test queries per target package. Some targets have as few as 8 test queries (e.g., requests with only 8 queries producing 0% ASR), yet no discussion of whether this is sufficient for reliable estimation." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviation or variance is reported across experimental runs. The authors set temperature=0 and seed=100 for deterministic outputs, but do not report sensitivity to these choices or variance across configurations." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Table 2 compares ImportSnare against four baselines: Naive (no ranking sequences), HotFlip, ReMiss, and ReMiss with ImportSnare-R integration." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "ReMiss (2024) is contemporary. HotFlip (2017) is older but is the foundational method for gradient-based token replacement and is appropriate as a retrieval attack baseline." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 6.4 provides comprehensive ablation: module ablation (R, G, R+G, G+R, R+G+R) in Figure 5, hyperparameter tuning (L, B, kb) in Figure 6, local proxy LLM selection in Table 3, and Precision@k ablation in Table 4." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Four evaluation metrics are used: Attack Success Rate (ASR), Precision@k, #Queries (proxy queries per poisoned document), and Average Processing Time (APT). Code quality is assessed via Bandit, Pylint, and Flake8." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation of attack stealth or output quality. All evaluation is automated: ASR counts import statement presence, Precision@k is computed from retrieval rankings, and code quality uses automated tools (Bandit, Pylint, Flake8)." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Section 5.2 states: 'This dataset was split 8:2 into proxy and test subsets. The 80% proxy queries were used to poison relevant database entries, while the 20% test queries were strictly held out for evaluation. 
Crucially, no overlap exists between proxy and test queries.'" 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per target package (12 Python, 5 Rust, 4 JavaScript targets), per language, and per LLM model in Table 1. Transferability results are shown per retriever (Table 6) and per query language (Table 7)." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses failure cases: typosquatted names like 'requstss' achieve 0% ASR on most models due to LLMs' typo-correction capabilities. Known malicious packages (e.g., 'tn-moment') achieve poor ASR. JavaScript shows significantly lower ASR than Python/Rust." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Several negative results are reported: requests/requstss achieves 0% ASR on most models (Table 1), collections/collection-strong achieves 0% on DeepSeek models, and JavaScript targets generally show low ASR. Cross-retriever transfer degrades significantly for some models (Table 6)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims are supported: '>50% for popular libraries such as matplotlib and seaborn' is confirmed in Table 1 (matplotlib_safe: 67.7% on GPT-4o-mini; malware_seaborn: 53.3% on GPT-4o). 'Poisoning ratio as low as 0.01%' is supported by Figure 4." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims (e.g., 'component R enhances retrieval ranking', 'component G improves ASR') are supported by controlled ablation studies in Figure 5 that systematically add/remove components while holding others constant." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title and conclusions make broad claims about 'critical supply chain risks in LLM-powered development' and the need to 'rethink security protocols for LLM-RAG systems.' While the paper tests three languages and several LLMs, the evaluation uses specific datasets and a simplified RAG setup. Section 7.2 acknowledges the gap from real-world systems but the framing remains broad." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not systematically discuss alternative explanations for the attack's success. Section 7.1 discusses LLM trust in RAG documents but does not consider whether other factors (e.g., dataset-specific artifacts, model-specific prompt sensitivity) could explain the results. No robustness checks against confounds." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "ASR measures whether the target library name appears in an import statement, but the paper frames this as a supply chain attack. The gap between 'LLM outputs an import statement' and 'developer installs and uses a malicious package' is acknowledged in the attack chain (Figure 1) but not explicitly discussed as a proxy-outcome limitation." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are listed as marketing names: 'GPT-4-Turbo', 'GPT-4o', 'GPT-4o-mini', 'DeepSeek V3', 'DeepSeek R1', 'Claude 3.5 Sonnet', 'LLama3.2-3B'. No snapshot dates or API versions are provided for any model." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Appendix A.4 provides the full prompt template used for LLM code generation queries, with [context] and [question] placeholders. 
The inductive suggestion texts are provided in Table 8 and Appendix A.2." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 5.5 reports: temperature=0, seed=100, maximum sequence length L=20, beam width B=10, top candidates kb=15, iterations N=50, research iterations N'=25. Section 5.4 lists contriever models and their defaults." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The system is a standard RAG pipeline (retriever + LLM generator) without agents, tool use, retry logic, or memory management." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 3 states documents are split into 1024-token segments. Section 4.2 describes how proxy queries map to documents via bidirectional retrieval. Section 5.2 describes the 80/20 proxy/test split. Table 10 lists all dataset sources." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7 'Discussion' contains substantive discussion of threats, real-world implications (7.2), programming language limitations (7.3), and ethics (7.4), serving as a comprehensive limitations section." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 7.2 discusses specific threats: 'our evaluation includes as many datasets to form the RAG database, it remains far from real-world RAG systems'; reranking mechanisms may degrade transferability; code comments in LLM output could alert users. These are specific to this study." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 7.2 explicitly states the evaluation 'remains far from real-world RAG systems, and approximating their scale is impractical.' 
Section 7.3 acknowledges other programming languages face different risk profiles. Table 9 explains why certain languages were excluded." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The paper states datasets and artifacts are available at the project homepage (https://importsnare.github.io/). The RAG database sources are publicly available HuggingFace datasets listed in Table 10." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 5.2 describes dataset composition: RAG databases from HuggingFace code datasets, query datasets from specific sources per language. Table 10 lists all sources with URLs. Table 1 shows poisoning ratios for each target." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public code datasets and benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is documented: datasets are collected from listed sources, documents are split into 1024-token segments, proxy queries retrieve relevant documents, documents are poisoned with ranking and inducing sequences, and test queries evaluate the attack. Poisoning ratios are reported in Table 1." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Acknowledgments section states: 'This work is partially supported by the NSFC for Young Scientists of China (No.62202400) and the RGC for Early Career Scheme (No.27210024).'" 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All three authors are affiliated with The University of Hong Kong. They are not evaluating their own commercial product, so no conflict of interest with evaluated systems." 
214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "NSFC and RGC are government research funding agencies with no financial stake in the outcome of the attack evaluation." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial disclosure statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "This paper evaluates an attack framework's effectiveness, not model capability on benchmarks. The core evaluation measures whether poisoned documents can induce malicious dependency recommendations, which is a defense/attack test rather than a model knowledge evaluation." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Same rationale: the paper tests an attack mechanism rather than evaluating pre-trained model knowledge on a benchmark." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Same rationale: the paper tests an attack mechanism rather than evaluating pre-trained model knowledge on a benchmark." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The paper discusses ethics (Section 7.4) regarding responsible disclosure of the attack, not human subjects." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 
263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Average Processing Time (APT) per document is reported in Table 3 (e.g., 519.06s for matplotlib with LLama3.2-3B). This measures the practical cost of constructing poisoned documents." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget is stated. The paper does not report GPU hours, total API spend, or total hardware resources used for the full evaluation across all experiments." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Results are reported with a single seed (seed=100) and temperature=0. No analysis of sensitivity to different seeds or stochastic settings is provided." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is not explicitly stated. Setting temperature=0 and seed=100 implies single deterministic runs, but this is not explicitly confirmed." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "While ablation studies explore hyperparameter effects (Figure 6), the total search budget (number of configurations tried, compute spent on search) is not reported." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "Section 6.4 states: 'our default hyperparameter selection balances the largest feasible combination supported by the experimental equipment with acceptable efficiency, avoiding excessive slowdown.' The rationale is practical resource constraints." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": true, 316 "justification": "Table 5 applies Bonferroni correction for multiple comparisons in the Wilcoxon signed-rank tests, with adjusted alpha values (alpha'=alpha/m)." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement baselines (HotFlip, ReMiss) themselves and compare against their own system without acknowledging potential self-comparison bias. No independent evaluation or discussion of this bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "APT is reported per model configuration (Table 3) but performance is not systematically reported as a function of compute budget. Baselines and ImportSnare are not compared at matched compute." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether ASR on their benchmark setup actually measures real-world attack risk. Section 7.2 acknowledges the gap between experimental and real-world RAG systems but does not question the construct validity of their metrics." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": true, 336 "justification": "Table 6 evaluates cross-retriever transferability (gte-base, all-mpnet, bge-base, e5-base), separating retriever effects from LLM effects. Multiple RAG system configurations are tested (LlamaIndex, LangGraph referenced)." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether query datasets (e.g., BigCodeBench) or RAG database contents existed before model training cutoffs. Models like GPT-4o may have been trained on these public datasets." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. The RAG context provides documents that may contain hints beyond what a realistic retrieval scenario would surface." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "While the 80/20 proxy/test split ensures no overlap between proxy and test queries, independence of test queries from model training data is not discussed. Public datasets used for queries may overlap with LLM training data." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination pipelines are used." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "ImportSnare achieves attack success rates exceeding 50% for popular libraries such as matplotlib and seaborn.", 365 "evidence": "Table 1 shows matplotlib_safe achieves 67.7% ASR on GPT-4o-mini and 93.5% on LLama3.2-3B. Seaborn variants achieve up to 66.7% (seaborn_v2 on LLama3.2-3B) and 53.3% (malware_seaborn on GPT-4o). Section 6.1.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "The attack can succeed with poisoning ratios as low as 0.01% of the total RAG database.", 370 "evidence": "Figure 4 shows ASR improvement emerging when poisoning exceeds 3% of relevant documents (0.01% of total documents) for matplotlib_safe targeting DeepSeek-v3. 
Section 6.3.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "ImportSnare outperforms all baselines (Naive, HotFlip, ReMiss) across all target packages.", 375 "evidence": "Table 2 compares ImportSnare against baselines on GPT-4o-mini. ImportSnare achieves higher or comparable ASR on all 12 target packages. For matplotlib_safe: Naive 0.194, HotFlip 0.387, ImportSnare 0.677. Section 6.2.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Poisoned documentation does not significantly degrade code generation quality.", 380 "evidence": "Table 5 shows Wilcoxon signed-rank tests with Bonferroni correction find no statistically significant difference between poisoned and clean conditions for security issues, severity, Pylint score, or Flake8 errors. Section 6.5.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "The framework demonstrates cross-platform transferability across RAG systems and LLM architectures.", 385 "evidence": "Table 6 shows cross-retriever results (attack still succeeds with different retrievers though at reduced rates). Table 7 shows cross-lingual query transferability. Table 1 shows results across 7 target LLMs. Section 6.6.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Claude-3.5-Sonnet exhibits unusually high ASR against Rust-related poisoning despite superior general code generation performance.", 390 "evidence": "Table 1 shows Claude-3.5-Sonnet achieves 70.0% (ndarray_v2), 50.0% (regex_safe), 72.2% (rocket_safe), 95.9% (rsscraper), and 51.2% (serde_json_safe) ASR on Rust targets, all higher than other closed-source models. Section 6.1.", 391 "supported": "strong" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No error bars or variance reporting", 397 "detail": "All main results (Tables 1-4, 6-7) report single point estimates with no confidence intervals, error bars, or variance measures. With temperature=0 this is deterministic, but sensitivity to seed choice, query sampling, or minor parameter changes is unknown." 
398 }, 399 { 400 "flag": "Proxy-outcome gap not explicitly discussed as a limitation", 401 "detail": "ASR measures whether a target package name appears in an import statement, but the paper frames this as a full supply chain attack. The gap between 'LLM generates import statement' and 'developer installs malicious package and gets compromised' involves multiple human decision points that are assumed away rather than studied." 402 }, 403 { 404 "flag": "Lab setup significantly differs from real-world RAG systems", 405 "detail": "The authors acknowledge their setup 'remains far from real-world RAG systems.' Real systems use reranking, larger corpora, and more complex retrieval pipelines. The reported ASR may not transfer to production systems, but the paper's framing suggests immediate practical concern." 406 }, 407 { 408 "flag": "Small sample sizes for some targets", 409 "detail": "Some target packages have very few test queries (e.g., requests with only ~8 queries showing 0% ASR on most models). With such small N, single-query differences could substantially change reported ASR, making these results unreliable." 410 }, 411 { 412 "flag": "No model version specifications", 413 "detail": "All models are referenced by marketing names without snapshot dates or API versions. Model behavior can change across versions, making results non-reproducible against the same model configurations." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "Evaluating large language models trained on code", 419 "authors": ["Mark Chen", "Jerry Tworek", "Heewoo Jun"], 420 "year": 2021, 421 "arxiv_id": "2107.03374", 422 "relevance": "Foundational work on LLM code generation evaluation (HumanEval benchmark) and dependency monopolization observations." 
423 }, 424 { 425 "title": "PoisonedRAG: Knowledge poisoning attacks to retrieval-augmented generation of large language models", 426 "authors": ["Wei Zou", "Runpeng Geng", "Binghui Wang", "Jinyuan Jia"], 427 "year": 2024, 428 "arxiv_id": "2402.07867", 429 "relevance": "Prior RAG poisoning work that ImportSnare extends; used as a baseline comparison for attack methodology." 430 }, 431 { 432 "title": "CodeRAG-Bench: Can retrieval augment code generation?", 433 "authors": ["Zora Zhiruo Wang", "Akari Asai", "Xinyan Velocity Yu"], 434 "year": 2024, 435 "arxiv_id": "2406.14497", 436 "relevance": "Benchmark for retrieval-augmented code generation, directly relevant to the RACG attack surface studied." 437 }, 438 { 439 "title": "Security Attacks on LLM-based Code Completion Tools", 440 "authors": ["Wen Cheng", "Ke Sun", "Xinyu Zhang", "Wei Wang"], 441 "year": 2025, 442 "relevance": "Related security attack work on LLM code completion tools, demonstrating supply chain risks in code generation." 443 }, 444 { 445 "title": "Exploring the Security Threats of Knowledge Base Poisoning in Retrieval-Augmented Code Generation", 446 "authors": ["Bo Lin", "Shangwen Wang", "Liqian Chen", "Xiaoguang Mao"], 447 "year": 2025, 448 "arxiv_id": "2502.03233", 449 "relevance": "Concurrent work exploring similar RAG code generation security threats, directly related attack surface research." 450 }, 451 { 452 "title": "A survey on large language model (llm) security and privacy: The good, the bad, and the ugly", 453 "authors": ["Yifan Yao", "Jinhao Duan", "Kaidi Xu"], 454 "year": 2024, 455 "relevance": "Survey of LLM security challenges relevant to understanding code generation safety landscape." 
456 }, 457 { 458 "title": "BigCodeBench: Benchmarking code generation with diverse function calls and complex instructions", 459 "authors": ["Terry Yue Zhuo"], 460 "year": 2024, 461 "arxiv_id": "2406.15877", 462 "relevance": "Used as both RAG database source and query dataset source for Python experiments in ImportSnare evaluation." 463 }, 464 { 465 "title": "Backstabber's knife collection: A review of open source software supply chain attacks", 466 "authors": ["Marc Ohm", "Henrik Plate", "Arnold Sykosch", "Michael Meier"], 467 "year": 2020, 468 "relevance": "Foundational taxonomy of software supply chain attacks used to inform malicious package naming strategies." 469 }, 470 { 471 "title": "DeepSeek-r1: Incentivizing reasoning capability in LLMs via reinforcement learning", 472 "authors": ["Daya Guo", "Dejian Yang"], 473 "year": 2025, 474 "arxiv_id": "2501.12948", 475 "relevance": "One of the primary target LLMs evaluated in the attack experiments, representing SOTA reasoning models." 476 }, 477 { 478 "title": "Retrieval-augmented generation for knowledge-intensive NLP tasks", 479 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 480 "year": 2020, 481 "relevance": "Foundational RAG paradigm paper underlying the RACG systems that ImportSnare attacks." 482 }, 483 { 484 "title": "HotFlip: White-box adversarial examples for text classification", 485 "authors": ["Javid Ebrahimi", "Anyi Rao", "Daniel Lowd", "Dejing Dou"], 486 "year": 2017, 487 "arxiv_id": "1712.06751", 488 "relevance": "Baseline method adapted for the retrieval-oriented ranking sequence attack in ImportSnare-R." 489 }, 490 { 491 "title": "Wolves in the Repository: A Software Engineering Analysis of the XZ Utils Supply Chain Attack", 492 "authors": ["Piotr Przymus", "Thomas Durieux"], 493 "year": 2025, 494 "relevance": "Real-world supply chain attack case study informing the threat model for dependency hijacking attacks." 
495 } 496 ], 497 "engagement_factors": { 498 "practical_relevance": { 499 "score": 2, 500 "justification": "Relevant to RAG system developers and security teams but not an immediately usable defensive tool." 501 }, 502 "surprise_contrarian": { 503 "score": 2, 504 "justification": "Reveals a novel attack surface in RACG that challenges assumptions about RAG system safety." 505 }, 506 "fear_safety": { 507 "score": 3, 508 "justification": "Demonstrates a practical supply chain attack against popular AI coding tools (Copilot, Cursor) with real-world demos." 509 }, 510 "drama_conflict": { 511 "score": 1, 512 "justification": "No direct controversy, but implicitly critiques LLM vendors' safety alignment for code generation." 513 }, 514 "demo_ability": { 515 "score": 1, 516 "justification": "Project homepage exists with datasets but no turnkey demo; reproducing the attack requires significant setup." 517 }, 518 "brand_recognition": { 519 "score": 2, 520 "justification": "Attacks well-known products (GPT-4o, Claude, DeepSeek, Copilot, Cursor) which drives attention." 521 } 522 } 523 }