scan.json (26747B)
1 { 2 "paper": { 3 "title": "Backdoored Retrievers for Prompt Injection Attacks on Retrieval Augmented Generation of Large Language Models", 4 "authors": ["Cody Clop", "Yannick Teglia"], 5 "year": 2024, 6 "venue": "arXiv preprint", 7 "arxiv_id": "2410.14479" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL, GitHub link, or code archive is mentioned anywhere in the paper. No mention of code release in the paper or appendix." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses publicly available datasets from the BEIR benchmark: Natural Questions, MS-MARCO, HotpotQA, and NFCorpus. These are all standard public benchmarks that the authors did not modify (aside from inserting poisoned documents, which are described in the methodology)." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, conda environment file, or environment setup section is provided. The paper mentions model names but not library versions, Python version, or hardware details." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The methodology section describes the experimental setup at a conceptual level but does not provide commands or configurations needed to reproduce." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results are reported as point estimates (e.g., ASR values like 0.91, 0.27, 1.0). No confidence intervals, error bars, or ± notation appear in any table or figure." 
37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper makes comparative claims (e.g., 'Llama-3 exhibited the highest susceptibility', 'Vicuna demonstrated the highest resistance', backdoor attacks achieve 'even higher success rates') but uses no statistical significance tests. Comparisons are made by directly comparing ASR numbers." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports ASR (Attack Success Rate) values with sufficient context: e.g., 'Llama-3 exhibited the highest susceptibility to prompt injection, achieving an ASR of up to 0.91' and 'Vicuna demonstrated the highest resistance to prompt injection attacks, with a maximum ASR of 0.27.' Tables 3 and 4 provide detailed ASR values across conditions. The reader can assess magnitude of effects." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The paper mentions '100 queries per dataset' for LLM vulnerability experiments (Section 3.1, Figure 4 caption) but provides no justification for why 100 was chosen, no power analysis, and no discussion of whether this is sufficient." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviation, variance, or spread measures are reported across any experimental runs. All results appear to be single-run numbers. No mention of multiple runs with different seeds." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares corpus poisoning against their novel backdoor attack approach. Table 2 compares pretrained, fine-tuned, and backdoored retrievers. Table 4 compares corpus poisoning vs. backdoor attacks across models and configurations." 
64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "The corpus poisoning baseline is based on PoisonedRAG [3] from 2024, and the paper references Long et al. [4] (2024) and AgentPoison [30] (2024). These are contemporary works in the same problem space." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "The paper systematically varies multiple factors: injection position (10 positions), directive strength (6 levels), number of retrieved documents (k=1,3,5,10,20), attack type (corpus poisoning vs. backdoor), target domain (Alzheimer's Disease vs. Nutrition), and LLM model. These variations function as ablations testing which components matter." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "The paper uses only a single metric: Attack Success Rate (ASR). While precision@k is reported for the retriever (Table 2), the attack evaluation itself relies solely on ASR. No other metrics such as attack detectability, output quality degradation, or false positive rates are reported." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation is included. The attack success is determined automatically (presence of attacker's link, coupon code, or message in the output). Human evaluation of output quality, naturalness of injected content, or detectability of attacks would be relevant but is absent." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "The paper does not explicitly describe a separation between development and test sets. The 100 queries per dataset used for evaluation are not described as being separate from any queries used during attack development or tuning of directive strength levels." 
89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by attack type, target domain (Alzheimer's Disease vs. Nutrition), LLM model (Llama-3, Vicuna, Mistral), dataset (NFCorpus, HotpotQA), injection position, directive strength level, and number of retrieved documents k. Tables 3 and 4 provide per-configuration breakdowns." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses failure modes: Vicuna's high resistance to attacks, corpus poisoning's lower success for broader domains like nutrition (ASR 0.14-0.46), and how increasing k can dilute the injected prompt effect. Section 4 discusses limitations of the trigger approach." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Several negative results are reported: Vicuna showed strong resistance in the Section 3.1 prompt-injection experiments (max ASR 0.27), corpus poisoning on nutrition domain yielded poor results (ASR 0.14-0.44), and the dilution effect of increasing k on backdoor attacks with Vicuna is visible in Table 4 (ASR dropping from 0.83 to 0.46)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims: (1) corpus poisoning can achieve significant ASR through a small number of compromised documents -- supported by Tables 3 and 4; (2) backdoor attacks demonstrate even higher success rates -- supported by comparison in Tables 3 and 4; (3) backdoor requires more complex setup (victim must fine-tune) -- discussed in Sections 3.2 and 4. All abstract claims are supported by the experimental results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper makes causal claims about attack mechanisms (e.g., 'injected prompt is more effective when it appears among the first retrieved documents', Section 3.1). 
These are supported by controlled experiments that systematically vary injection position while holding other factors constant. The experimental design (controlled manipulation of attack parameters) is adequate for these causal claims." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title and abstract claim broad vulnerability of 'Retrieval Augmented Generation of Large Language Models' but experiments use only 3 open-source 7-8B parameter models, one embedding model (GTE-large-en-v1.5), and 2-4 datasets. No commercial models (GPT-4, Claude) are tested. The paper does not explicitly bound its claims to the tested setting." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for its findings. For example, it does not consider whether the high ASR might be specific to the safety training of the tested models, whether different system prompts could mitigate the attacks, or whether the results reflect artifacts of the specific prompt templates used." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper mentions 'Llama-3-8B', 'Vicuna-7B', 'Mistral-7B', and 'GTE-large-en-v1.5' but does not provide specific version snapshots, dates, or checkpoint identifiers. For example, there are multiple Llama-3-8B variants (base, instruct, different releases) and no specific version is identified." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "The full injection prompts for all 6 directive strength levels are provided in Table 1 (Link Insertion), Table 5 (Denial of Service), and Table 6 (Advertising) in the appendix. The system prompt template for the RAG system is not explicitly shown, but the attack prompts themselves are fully specified." 
138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No hyperparameters are reported: no temperature, top-p, or max tokens for LLM inference; no learning rate, batch size, or number of epochs for retriever fine-tuning; no details on the contrastive loss training beyond the loss formula." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "The system is a standard RAG pipeline (retriever + LLM) without agentic scaffolding such as tool use, retry logic, or multi-step reasoning. No agentic scaffolding is used." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "The paper does not describe how queries were selected from the BEIR datasets (100 queries per dataset -- how were they chosen?), how the poisoned documents were generated beyond a brief mention of using ChatGPT, or the exact process for constructing backdoor training datasets." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated limitations or threats-to-validity section. The Discussion section (Section 4) briefly mentions that the study 'centers on attacks where an unsuspecting user inadvertently triggers behaviors' but does not provide substantive limitation discussion." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed. The paper does not address potential issues like the limited model set, single embedding model, arbitrary directive strength levels, or the gap between lab settings and real-world RAG deployments." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show. 
It does not bound its claims to the specific models tested, acknowledge that commercial models with better safety training might behave differently, or state that results may not generalize to production RAG systems." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw experimental data (query results, model outputs, individual ASR measurements) is made available. Only aggregated ASR values in tables and heatmaps are shown." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "The paper describes the data sources: BEIR benchmark datasets (NQ, MS-MARCO, HotpotQA, NFCorpus) with their references and sizes (e.g., NFCorpus has 3,633 documents, HotpotQA has over 5 million). The attack setup describes how poisoned documents are generated and how backdoor training data is constructed." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants are involved. The data sources are standard public benchmarks, so recruitment methods do not apply." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "Key pipeline details are missing: how 100 queries per dataset were selected, how ChatGPT-generated poisoned documents were filtered or validated, the exact construction of backdoor training datasets (number of poisoned vs. legitimate pairs), and how ASR was computed from raw model outputs." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding acknowledgment section is present. Both authors are affiliated with Thales DIS (a defense/cybersecurity company) but there is no disclosure of whether this work was funded by Thales or any other entity." 
199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Both authors' affiliations with the Thales DIS Cybersecurity Artificial Intelligence Team are clearly listed on the first page." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding source is disclosed. The authors work at Thales DIS, a cybersecurity company that has a commercial interest in demonstrating AI security vulnerabilities (which could drive demand for security products/services). Without explicit funding disclosure, independence cannot be assessed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper. Absence of a declaration is not the same as absence of conflict." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This paper tests attack vectors (prompt injection, corpus poisoning, backdoor attacks) rather than evaluating pre-trained model knowledge on a benchmark. The models are used as targets for attack, not assessed on their pre-existing knowledge. Contamination in the traditional sense is not relevant to the attack success metrics." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "The paper evaluates attack effectiveness on RAG systems, not model knowledge. Train/test overlap for the underlying benchmarks is not relevant to whether the LLM follows injected malicious instructions." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Benchmark contamination in the traditional sense does not apply to attack success rate measurements. 
The question is whether the LLM follows injected prompts, not whether it has memorized benchmark answers." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants are involved in this study. All experiments are computational." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants are involved. The study uses automated experiments on publicly available models and datasets." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No inference cost, latency, or token consumption is reported for any of the LLM or retriever operations, despite the experiments involving thousands of LLM queries across multiple models." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No computational budget, GPU hours, hardware specifications, or total experiment time is reported. The fine-tuning of the retriever and the thousands of LLM inference calls presumably required meaningful compute, but this is not quantified." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Llama-3 exhibits the highest susceptibility to prompt injection attacks, achieving an ASR of up to 0.91.", 286 "evidence": "Section 3.1 and Figure 4 show heatmaps of ASR across injection positions and directive strengths for Llama-3, Vicuna, and Mistral, based on 100 queries per dataset across three BEIR corpora.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Vicuna demonstrates the highest resistance to prompt injection attacks, with a maximum ASR of 0.27.", 291 "evidence": "Section 3.1 and Figure 4 show Vicuna's ASR peaking at 0.27 across all conditions tested.", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Backdoor attacks on the dense retriever achieve near-perfect retrieval of poisoned documents (ASR up to 1.0) without degrading performance on benign queries.", 296 "evidence": "Table 3 shows backdoor ASR@k of 0.97-1.0 across conditions. Table 2 shows that backdoored retriever precision on benign queries is essentially identical to the clean fine-tuned model.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "Corpus poisoning is effective for narrow domains (Alzheimer's Disease) but yields lower success rates for broader domains (Nutrition).", 301 "evidence": "Table 3: AD corpus poisoning ASR@1 is 0.63 (NFCorpus) and 0.17 (HotpotQA), while Nutrition ASR@1 is only 0.14 and 0.06 respectively. 
Table 4 confirms this pattern in end-to-end results.", 302 "supported": "strong" 303 }, 304 { 305 "claim": "The effectiveness of injected instructions may be related to their alignment with the information being processed by the LLM.", 306 "evidence": "Section 3.3 notes higher end-to-end ASR for link insertion on the Alzheimer's domain and advertising on the nutrition domain compared to the LLM-only evaluation, suggesting coherence between injected content and retrieved context matters.", 307 "supported": "weak" 308 }, 309 { 310 "claim": "ASR decreases as the position of the poisoned document moves further down the retrieval list.", 311 "evidence": "Section 3.1 describes this trend based on Figure 4, which shows heatmaps across 10 injection positions.", 312 "supported": "moderate" 313 } 314 ], 315 "methodology_tags": ["benchmark-eval"], 316 "key_findings": "RAG systems are vulnerable to prompt injection attacks via both corpus poisoning and backdoor attacks on the dense retriever, with effects varying by LLM model and attack domain. Backdoor attacks on the retriever achieve near-perfect retrieval of poisoned documents (retrieval ASR@k of 0.97-1.0) while maintaining normal retrieval performance on benign queries, making them difficult to detect. Corpus poisoning is more practical but less reliable, particularly for broad target domains. In the Section 3.1 prompt-injection experiments, Llama-3-8B was the most susceptible of the three tested LLMs (ASR up to 0.91) while Vicuna-7B showed the strongest resistance (max ASR 0.27).", 317 "red_flags": [ 318 { 319 "flag": "No uncertainty quantification", 320 "detail": "All results are reported as single-point ASR values with no confidence intervals, error bars, or variance across runs. With 100 queries per dataset, sampling variance could meaningfully affect the reported numbers, but this is never addressed." 
321 }, 322 { 323 "flag": "No code or raw data released", 324 "detail": "Despite describing novel attack methods, no code, scripts, or raw experimental data are released, making independent verification impossible." 325 }, 326 { 327 "flag": "Broad claims from narrow model set", 328 "detail": "The paper tests only three 7-8B open-source models and one embedding model, but the title and abstract imply broad vulnerability of RAG systems generally. No commercial models with stronger safety training (GPT-4, Claude) are tested." 329 }, 330 { 331 "flag": "No limitations section", 332 "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or explicit scope boundaries. This is a significant omission for a security paper making vulnerability claims." 333 }, 334 { 335 "flag": "Missing hyperparameters and training details", 336 "detail": "No training hyperparameters (learning rate, epochs, batch size) are reported for the retriever fine-tuning, and no inference parameters (temperature, top-p) are stated for the LLMs, despite these significantly affecting results." 337 } 338 ], 339 "cited_papers": [ 340 { 341 "title": "PoisonedRAG: Knowledge poisoning attacks to retrieval-augmented generation of large language models", 342 "authors": ["Wei Zou", "Runpeng Geng", "Binghui Wang", "Jinyuan Jia"], 343 "year": 2024, 344 "relevance": "Direct baseline for corpus poisoning attacks on RAG systems, evaluating security vulnerabilities of LLM-integrated retrieval." 345 }, 346 { 347 "title": "Backdoor attacks on dense passage retrievers for disseminating misinformation", 348 "authors": ["Quanyu Long", "Yue Deng", "LeiLei Gan", "Wenya Wang", "Sinno Jialin Pan"], 349 "year": 2024, 350 "relevance": "Prior work on backdoor attacks targeting dense retrievers in RAG, focusing on misinformation dissemination." 
351 }, 352 { 353 "title": "AgentPoison: Red-teaming LLM agents via poisoning memory or knowledge bases", 354 "authors": ["Zhaorun Chen", "Zhen Xiang", "Chaowei Xiao", "Dawn Song", "Bo Li"], 355 "year": 2024, 356 "relevance": "Trigger-based attacks on retrieval-augmented LLM agents, directly relevant to RAG security evaluation." 357 }, 358 { 359 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 360 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], 361 "year": 2023, 362 "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications." 363 }, 364 { 365 "title": "Certifiably robust RAG against retrieval corruption", 366 "authors": ["Chong Xiang", "Tong Wu", "Zexuan Zhong", "David Wagner", "Danqi Chen", "Prateek Mittal"], 367 "year": 2024, 368 "relevance": "Defense mechanism for RAG systems against retrieval corruption, relevant to evaluating robustness of AI systems." 369 }, 370 { 371 "title": "Prompt injection attack against LLM-integrated applications", 372 "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang", "Zihao Wang", "Xiaofeng Wang", "Tianwei Zhang", "Yepang Liu", "Haoyu Wang", "Yan Zheng", "Yang Liu"], 373 "year": 2024, 374 "relevance": "Systematic study of prompt injection attacks on LLM-integrated applications." 375 }, 376 { 377 "title": "Benchmarking and defending against indirect prompt injection attacks on large language models", 378 "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"], 379 "year": 2024, 380 "relevance": "Benchmark and defense mechanisms for indirect prompt injection, directly relevant to RAG security evaluation." 
381 }, 382 { 383 "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations", 384 "authors": ["Hakan Inan", "Kartikeya Upasani", "Jianfeng Chi", "Rashi Rungta", "Krithika Iyer", "Yuning Mao", "Michael Tontchev", "Qing Hu", "Brian Fuller", "Davide Testuggine", "Madian Khabsa"], 385 "year": 2023, 386 "relevance": "LLM-based safety guardrail system for sanitizing inputs and outputs, relevant to defense against prompt injection." 387 }, 388 { 389 "title": "Tensor Trust: Interpretable prompt injection attacks from an online game", 390 "authors": ["Sam Toyer", "Olivia Watkins", "Ethan Adrian Mendes", "Justin Svegliato", "Luke Bailey", "Tiffany Wang", "Isaac Ong", "Karim Elmaaroufi", "Pieter Abbeel", "Trevor Darrell", "Alan Ritter", "Stuart Russell"], 391 "year": 2023, 392 "relevance": "Large-scale study of prompt injection attacks and defenses through an interactive game setting." 393 }, 394 { 395 "title": "A survey on large language model (LLM) security and privacy: The good, the bad, and the ugly", 396 "authors": ["Yifan Yao", "Jinhao Duan", "Kaidi Xu", "Yuanfang Cai", "Zhibo Sun", "Yue Zhang"], 397 "year": 2024, 398 "relevance": "Comprehensive survey of LLM security and privacy concerns, providing context for the attack landscape." 399 }, 400 { 401 "title": "Jailbreaking black box large language models in twenty queries", 402 "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"], 403 "year": 2024, 404 "relevance": "Automated jailbreaking technique for LLMs relevant to understanding LLM vulnerability to adversarial inputs." 405 } 406 ] 407 }