scan.json (33116B)
1 { 2 "paper": { 3 "title": "DeRAG: Black-box Adversarial Attacks on Multiple Retrieval-Augmented Generation Applications via Prompt Injection", 4 "authors": ["Jerry Wang", "Fang Yu"], 5 "year": 2025, 6 "venue": "KDD Workshop on Prompt Optimization", 7 "arxiv_id": "2507.15042", 8 "doi": "10.48550/arXiv.2507.15042" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "DeRAG uses Differential Evolution to generate short adversarial suffixes (2-5 tokens) that manipulate RAG retrieval rankings in a black-box setting, achieving success rates competitive with or exceeding the white-box GGPP method on dense retrievers and outperforming PRADA on sparse retrievers at Top-1. The method produces smaller cosine shifts than gradient-based approaches, and DE-generated suffixes largely evade a RoBERTa-based detector (AUROC 0.20). Downstream QA evaluation on SQuAD and NQ-Open shows that successful adversarial retrieval manipulation degrades answer quality by 15-28% on average across metrics.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "A GitHub repository is provided: https://github.com/pen9rum/Rag_attack_DeRag (reference [34]), described as containing data and experiment results." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available BEIR benchmark datasets (MS MARCO, SciFact, FiQA, FEVER), and states 'Data and experiment results are available at' the GitHub repository (Section 4.1)." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications (requirements.txt, Dockerfile, library versions) are provided in the paper. Only the model (BERT-base-uncased, 110M parameters) is mentioned." 
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no README-style instructions for replicating experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All main results in Tables 1-5 report only point estimates (e.g., Success@1 = 0.198) with no confidence intervals or error bars. The only ± notation appears in Table 15 for MLM NLL, not for the main attack results." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "Welch's t-test is used only for the readability analysis (Table 16, Appendix H). The primary comparative claims — that DE outperforms GGPP and PRADA — are based solely on comparing point estimates across Tables 1 and 2 without any significance tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports multiple effect size metrics: ΔMRR, ΔnDCG@20, and Δcos (Tables 1, 2, 7), providing context for the magnitude of retrieval disruption. Downstream quality degradation is reported as percentage changes from baseline (Table 5)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper uses 100 queries and 1,000-document subsets per dataset without any justification for why these sizes were chosen. No power analysis or discussion of whether 100 queries is sufficient for reliable estimates." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviations, variance, or multi-run spread measures are reported for the main attack results. Tables 1-5 all present single-run point estimates. DE is stochastic but no seed sensitivity is explored." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares against GGPP (white-box gradient-guided attack, reference [23]), PRADA (black-box sparse retrieval attack, reference [16]), and a random suffix baseline (Tables 1-2)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "GGPP (Hu et al., 2024) and PRADA (Wu & Zhang, 2022) are recent and represent the state of the art for white-box and black-box retrieval attacks respectively." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Multiple ablations are conducted: DE_seq_stop vs DE_seq vs DE_fixed_stop (Section 4.3.2, Figure 2), prefix vs suffix attacks (Table 3), suffix length variation (Appendix D, Figure 4), hinge loss vs cosine loss (Appendix E, Table 12), and pool size variation (Table 6)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper uses Success@K (K=1,10,20), ΔMRR, ΔnDCG@20, Δcos, average tokens, and average iterations for attack evaluation. Downstream evaluation adds EM, F1, ROUGE-L, and BERTScore (Table 5)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is included. The readability of adversarial suffixes is assessed only via automated MLM NLL proxy (Appendix H), not human judgment. Detection evasion is tested only against automated detectors." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "No separation between development and test sets. The same 100 queries per dataset are used for both optimization and evaluation. There is no held-out test set mentioned." 
94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by dataset (SciFact, FiQA, FEVER, MS MARCO), by method variant, by retrieval threshold K (1, 10, 20), and by attack outcome stratum (Top-1, Top-10 only, Fail) in Table 5." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 3 reports query counts where both prefix and suffix fail. Table 5 stratifies results by 'Fail' outcome. The paper notes diminishing returns beyond 5 tokens (Appendix D) and discusses when cosine loss fails vs hinge loss (Appendix E)." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that cosine loss is consistently worse than hinge loss (Appendix E, Table 12), that a monotonic suffix length schedule did not improve results (Section 4.3.3, Appendix D), and that suffixes beyond 5 tokens yield diminishing or negative marginal returns (Figure 4b)." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims competitive success rates vs GGPP (supported by Table 2), few tokens (supported by avg token columns), readability improvement validated by Welch's t-test (supported by Table 16), and detection evasion (supported by Table 4). The claims are reasonably hedged ('competitive and in some cases higher')." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's causal claims (e.g., DE-optimized suffixes cause retrieval manipulation, early stopping reduces iterations) are supported by controlled single-variable ablations — comparing methods on the same data with only the attack strategy varying. The ablation studies (Section 4.3.2-4.3.3) isolate individual components." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims attacks on 'Multiple Retrieval-Augmented Generation Applications' but experiments use only BERT-base-uncased as the dense retriever and BM25 as the sparse retriever, with 1,000-document subsets. The conclusion states 'DeRAG reveals critical RAG vulnerabilities' without bounding this to the specific retrievers and small corpus sizes tested. Real RAG systems use more sophisticated retrievers and much larger corpora." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations. For instance, the high success rate on MS MARCO might be explained by corpus-specific properties rather than DE's general effectiveness. The paper does not consider whether results would hold with different retriever architectures, larger corpora, or whether BERT-base-uncased is unusually vulnerable." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper distinguishes between the retrieval manipulation proxy (Success@K, ΔMRR) and the downstream impact on answer quality, running a separate evaluation on SQuAD and NQ-Open (Section 4.5, Table 5) to validate that retrieval rank manipulation actually degrades generated answers." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "The main retriever is specified as 'BERT-base-uncased encoder' with 'over 110 million parameters' and '768-dimensional CLS embeddings' (Section 4.1). For detection, 'RoBERTa detector' is named (Section 4.4.1). These are specific, uniquely identifiable models." 
143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The downstream QA evaluation (Section 4.5) feeds retrieved chunks to an LLM generator, but the generation prompt/template is not provided. The adversarial suffixes themselves are shown (Appendix C), but the prompt format for the downstream answer generation is missing." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "DE hyperparameters are reported: scale factor F (0.5-1.0), crossover rate CR (0.1-0.9), suffix length budget n_max (1-10), patience T, and population size N (Section 3.3). Specific values for the example: N=3, F=0.5, CR=0.5. The downstream evaluation uses 5-token suffix with up to 120 iterations (Section 4.5)." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The method is a direct optimization loop (DE) over token sequences evaluated against a retriever." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper states 'we randomly sample a subset of 1,000 documents and 100 queries' but leaves ambiguity in target passage selection: 'either drawn uniformly at random from the non-relevant documents, or selected as a topically confusable distractor' (Section 4.1) — it is unclear which method was actually used in which experiments." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no limitations, threats-to-validity, or discussion section. The paper goes directly from evaluation results (Section 4) to a brief conclusion (Section 5) with no dedicated limitations discussion." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. 
The paper does not address limitations such as the small corpus size (1,000 documents vs. real-world millions), the single retriever model, or the gap between experimental and production RAG settings." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show, such as applicability to production-scale corpora, modern instruction-tuned retrievers, or multi-stage RAG pipelines with rerankers." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The underlying BEIR datasets are publicly available, and the paper states 'Data and experiment results are available at' the GitHub repository (Section 4.1, reference [34])." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.1 describes the data source (BEIR framework), sampling procedure (1,000 documents and 100 queries per dataset), encoding method (BERT-base-uncased CLS embeddings), and retrieval mechanism (cosine similarity for dense, BM25 for sparse)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All data comes from standard public benchmarks (BEIR/MS MARCO/SciFact/FiQA/FEVER)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from raw BEIR data to experimental setup has gaps. The target passage selection method is ambiguously described ('either drawn uniformly at random... or selected as a topically confusable distractor'). The specific random seed for sampling is not stated, and the downstream QA evaluation pipeline details are sparse." 
202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding sources are mentioned anywhere in the paper. There is no acknowledgments section disclosing grants or sponsors." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly listed: both authors are from the Department of Management Information Systems, National ChengChi University, Taipei, Taiwan. No conflict with evaluated systems." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence of funder cannot be assessed. The absence of any funding disclosure makes this NO by default." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper evaluates an adversarial attack method against a retriever, not a pre-trained model's knowledge on a benchmark. BERT-base-uncased is used as a retrieval encoder, and the attack's validity does not depend on whether the model has seen the benchmark data." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "The paper tests an attack strategy on retrieval behavior, not model knowledge. Train/test overlap in the BERT pre-training data is irrelevant to whether adversarial suffixes can manipulate retrieval rankings." 
236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Contamination is not relevant because the paper evaluates the effectiveness of an adversarial attack method on retrieval systems, not the retriever's inherent knowledge of benchmark answers." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study. All experiments are automated attacks on retrieval systems." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. Pure computational experiment on retrieval benchmarks." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Average iteration counts are reported per method per dataset (Tables 1-2). Pool construction and query optimization times are reported in Appendix G (Table 14), with per-query times around 40-50 seconds. Section 4.3.1 discusses the iterations-tokens trade-off in detail." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget is stated. 
The paper does not report total GPU hours, hardware specifications, or aggregate compute cost across all experiments." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "DE is a stochastic algorithm but no multi-seed results are reported. All tables present single-run point estimates without any analysis of sensitivity to random initialization." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper does not state how many times each experiment was run. Results appear to be from single runs, but this is never explicitly stated." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "DE hyperparameters (F, CR, N, patience) are reported but there is no description of how they were selected — no search budget, no grid/random search, no sensitivity analysis over hyperparameter configurations." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper presents results for DE_seq_stop, DE_fixed_stop, and DE_seq variants but does not justify how specific hyperparameter values were chosen or whether these are the best among tested configurations." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper compares multiple methods across multiple datasets and thresholds without any correction for multiple comparisons. Only the readability analysis uses a significance test (Welch's t-test), with no family-wise correction." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement their own version of GGPP and PRADA baselines for comparison. There is no acknowledgment that their reimplementation may systematically underperform the original authors' implementations." 
322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "GGPP iterations are listed as '—' in Table 2, making matched-compute comparison impossible. The paper discusses the iterations-tokens trade-off qualitatively (Section 4.3.1) but does not compare methods at equivalent compute budgets." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether Success@K on 1,000-document subsets with BERT-base-uncased actually measures vulnerability of real-world RAG systems, which use much larger corpora and more sophisticated retrievers." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. The evaluation is a direct optimization against a retriever, not a scaffold-dependent agent evaluation." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether BERT-base-uncased's pre-training data includes any of the BEIR benchmark documents. BERT (2018) may have been trained on data overlapping with some benchmarks." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the attack evaluation setup leaks information that would not be available in a real attack scenario (e.g., knowing the exact corpus and encoder used)." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether the sampled 100 queries and 1,000 documents are independent or share structural similarities that could inflate success rates." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference, or decontamination analysis." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "DE-based prompt optimization achieves competitive or higher success rates compared to GGPP on dense retrievers while using only ≤5 tokens", 365 "evidence": "Table 2 shows DE_seq_stop achieves 0.739 Success@20 on SciFact vs 0.565 for GGPP, 1.000 Success@10 on MS MARCO matching GGPP, with only 1.32-2.76 average tokens vs GGPP's fixed 5 tokens. However, GGPP outperforms on MS MARCO Top-1 (0.830 vs 0.570).", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "DE_seq_stop outperforms PRADA at Top-1 success rates on sparse retrievers across all four benchmarks", 370 "evidence": "Table 1 shows DE_seq_stop Top-1 rates of 0.250, 0.270, 0.620, 0.710 vs PRADA's 0.010, 0.030, 0.070, 0.020 on SciFact, FiQA, FEVER, MS MARCO respectively. However, PRADA achieves higher Top-10/Top-20 on some datasets.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Early stopping cuts average query cost by approximately 40% while maintaining attack success", 375 "evidence": "Section 4.3.2 and Figure 2 show DE_seq_stop reaches higher cumulative success at every suffix length compared to DE_seq. The 40% figure is stated but not derived with a clear calculation in the paper.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "DE-generated adversarial suffixes evade RoBERTa-based detection with near-chance accuracy", 380 "evidence": "Table 4 shows recall of 0.00% at 0.5% and 1.0% target FPR, and only 0.62% at 2.0% FPR. AUROC is 0.2023 and AUPRC is 0.4665 (Section 4.4.1).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Readability-aware suffix construction significantly reduces MLM negative log-likelihood compared to full vocabulary pool", 385 "evidence": "Table 16 reports Welch's t-test comparing pool_size=5,000 vs full (30,522): t-values from -6.54 to -9.40, all p < 10^-9 across three datasets. 
Table 15 shows monotonic NLL decrease as pool shrinks.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Adversarial retrieval manipulation causes significant downstream answer quality degradation", 390 "evidence": "Table 5 shows Top-1 attacks reduce SQuAD EM by 83.5%, F1 by 85.6%. On NQ-Open, Top-10 insertion reduces EM by 14.4% and F1 by 13.6%. Weighted average degradation is 26.7% EM on SQuAD and 14.8% EM on NQ-Open.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "Short suffixes (≤5 tokens) suffice: each additional token yields diminishing marginal returns under fixed iteration budgets", 395 "evidence": "Figure 4 shows mean ΔRank rises sharply from L=1 to L=4, then plateaus. Figure 4b shows marginal gain per extra token declines steeply after L=2, reaching near zero by L=5 across all six dataset-threshold combinations.", 396 "supported": "strong" 397 } 398 ], 399 "red_flags": [ 400 { 401 "flag": "Tiny corpus size may inflate success rates", 402 "detail": "All experiments use 1,000-document subsets, whereas real RAG systems typically index millions of documents. The much smaller search space likely makes adversarial ranking manipulation significantly easier. The paper acknowledges MS MARCO's 'relatively small and highly redundant' corpus but does not address this as a general limitation." 403 }, 404 { 405 "flag": "Single retriever model tested for dense retrieval", 406 "detail": "Only BERT-base-uncased is used as the dense retriever. Modern RAG systems use more sophisticated retrievers (e.g., instruction-tuned embeddings, GritLM, Arctic-Embed — models the paper itself cites). Results may not transfer to these architectures." 407 }, 408 { 409 "flag": "No error bars or multi-run analysis on stochastic method", 410 "detail": "Differential Evolution is inherently stochastic (random initialization, random mutation/crossover), yet all results appear to be single-run point estimates with no variance, confidence intervals, or seed sensitivity analysis." 
411 }, 412 { 413 "flag": "No limitations section", 414 "detail": "The paper entirely omits a limitations or threats-to-validity section, jumping from evaluation results directly to a brief conclusion. Major threats (corpus scale, retriever generality, real-world applicability) go unacknowledged." 415 }, 416 { 417 "flag": "Unfair compute comparison with GGPP", 418 "detail": "GGPP iteration counts are listed as '—' in Table 2, making it impossible to compare methods at equivalent computational cost. The paper claims DE is preferable but cannot demonstrate cost-efficiency relative to the gradient-based baseline." 419 }, 420 { 421 "flag": "Overclaimed generality", 422 "detail": "The title claims 'Multiple Retrieval-Augmented Generation Applications' but experiments cover one dense retriever (BERT-base) and one sparse retriever (BM25) on four benchmark subsets. No multi-stage RAG pipelines, rerankers, or production systems are tested." 423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "Ignore Previous Prompt: Attack Techniques for Language Models", 428 "authors": ["Fábio Perez", "Ian Ribeiro"], 429 "year": 2022, 430 "arxiv_id": "2211.09527", 431 "relevance": "Foundational work on prompt injection attacks against LLMs, directly relevant to adversarial prompt research." 432 }, 433 { 434 "title": "Automatic and Universal Prompt Injection Attacks Against Large Language Models", 435 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], 436 "year": 2024, 437 "arxiv_id": "2403.04957", 438 "relevance": "Proposes universal prompt injection attacks on LLMs, a closely related attack paradigm to adversarial suffix optimization." 
439 }, 440 { 441 "title": "Goal-guided Generative Prompt Injection Attack on Large Language Models", 442 "authors": ["Chong Zhang", "Mingyu Jin", "Qinkai Yu", "Chengzhi Liu", "Haochen Xue", "Xiaobo Jin"], 443 "year": 2024, 444 "arxiv_id": "2404.07234", 445 "relevance": "Goal-guided prompt injection method for LLMs, relevant to this paper's discussion of adversarial prompt generation techniques." 446 }, 447 { 448 "title": "Jailbreaking Black Box Large Language Models in Twenty Queries", 449 "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"], 450 "year": 2024, 451 "arxiv_id": "2310.08419", 452 "relevance": "Black-box jailbreaking of LLMs using few queries, relevant to gradient-free adversarial methods against language models." 453 }, 454 { 455 "title": "Prompt Perturbation in Retrieval-Augmented Generation based Large Language Models", 456 "authors": ["Zhibo Hu", "Chen Wang", "Yanfeng Shu", "Hye-young Paik", "Liming Zhu"], 457 "year": 2024, 458 "arxiv_id": "2402.07179", 459 "relevance": "Introduces GGPP, the primary white-box baseline in this paper, for adversarial prompt perturbation in RAG systems." 460 }, 461 { 462 "title": "Targeting the Core: A Simple and Effective Method to Attack RAG-based Agents via Direct LLM Manipulation", 463 "authors": ["Xuying Li", "Zhuo Li", "Yuji Kosuga", "Yasuhiro Yoshida", "Victor Bian"], 464 "year": 2024, 465 "arxiv_id": "2412.04415", 466 "relevance": "Demonstrates adversarial prefix attacks that subvert RAG-based AI agents, directly relevant to RAG security research." 467 }, 468 { 469 "title": "BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of Large Language Models", 470 "authors": ["Jiaqi Xue", "Mengxin Zheng", "Yebowen Hu", "Fei Liu", "Xun Chen", "Qian Lou"], 471 "year": 2024, 472 "arxiv_id": "2406.00083", 473 "relevance": "Poisoning-based backdoor attack on RAG databases, complementary attack vector to prompt-based adversarial approaches." 
474 }, 475 { 476 "title": "CtrlRAG: Black-box Adversarial Attacks Based on Masked Language Models in Retrieval-Augmented Language Generation", 477 "authors": ["Runqi Sui"], 478 "year": 2025, 479 "arxiv_id": "2503.06950", 480 "relevance": "MLM-based black-box adversarial attack on RAG systems, a closely related concurrent work on retrieval manipulation." 481 }, 482 { 483 "title": "PRADA: Practical Black-box Adversarial Attacks against Neural Ranking Models", 484 "authors": ["Chen Wu", "Ruqing Zhang"], 485 "year": 2022, 486 "doi": "10.1145/3576923", 487 "relevance": "Black-box attack baseline for sparse retrieval systems, directly compared against DeRAG in experiments." 488 }, 489 { 490 "title": "Robust Safety Classifier Against Jailbreaking Attacks: Adversarial Prompt Shield", 491 "authors": ["Jinhwa Kim", "Ali Derakhshan", "Ian Harris"], 492 "year": 2024, 493 "relevance": "Adversarial prompt detection classifier, relevant to defense mechanisms against prompt injection attacks." 494 }, 495 { 496 "title": "Detecting Language Model Attacks with Perplexity", 497 "authors": ["Gabriel Alon", "Michael Kamfonas"], 498 "year": 2023, 499 "arxiv_id": "2308.14132", 500 "relevance": "Perplexity-based detection of adversarial prompts, directly relevant to attack detection and evasion in LLM security." 501 }, 502 { 503 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 504 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin", "Naman Goyal", "Heinrich Küttler", "Mike Lewis", "Wen-tau Yih", "Tim Rocktäschel", "Sebastian Riedel", "Douwe Kiela"], 505 "year": 2021, 506 "arxiv_id": "2005.11401", 507 "relevance": "Foundational RAG paper establishing the retrieval-augmented generation paradigm that DeRAG attacks." 
508 }, 509 { 510 "title": "EvoPrompt: Connecting LLMs with Evolutionary Algorithms Yields Powerful Prompt Optimizers", 511 "authors": ["Qingyan Guo", "Rui Wang", "Junliang Guo", "Bei Li", "Kaitao Song", "Xu Tan", "Guoqing Liu", "Jiang Bian", "Yujiu Yang"], 512 "year": 2025, 513 "arxiv_id": "2309.08532", 514 "relevance": "Uses evolutionary strategies for prompt optimization, inspiring the application of DE to adversarial prompt generation." 515 } 516 ], 517 "engagement_factors": { 518 "practical_relevance": { 519 "score": 2, 520 "justification": "Security researchers and RAG system developers could use this to test retrieval robustness; code is released but requires domain expertise to apply." 521 }, 522 "surprise_contrarian": { 523 "score": 1, 524 "justification": "RAG retrieval vulnerability to adversarial inputs is already known; the contribution is a specific black-box method rather than a surprising finding." 525 }, 526 "fear_safety": { 527 "score": 2, 528 "justification": "Demonstrates that RAG systems can be attacked without any access to model internals and that attacks evade detection, raising practical security concerns." 529 }, 530 "drama_conflict": { 531 "score": 0, 532 "justification": "No controversy, no challenges to specific companies or products, straightforward technical contribution." 533 }, 534 "demo_ability": { 535 "score": 2, 536 "justification": "Code and data are released on GitHub; a researcher could reproduce the attacks, but there is no pip-installable tool or live demo." 537 }, 538 "brand_recognition": { 539 "score": 0, 540 "justification": "Authors are from National ChengChi University with no industry affiliation or brand recognition in the AI community." 541 } 542 } 543 }