scan-v5.json (26033B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "DeRAG: Black-box Adversarial Attacks on Multiple Retrieval-Augmented Generation Applications via Prompt Injection", 6 "authors": ["Jerry Wang", "Fang Yu"], 7 "year": 2025, 8 "venue": "KDD Workshop on Prompt Optimization", 9 "arxiv_id": "2507.15042", 10 "doi": "10.48550/arXiv.2507.15042" 11 }, 12 "checklist": { 13 "claims_and_evidence": { 14 "abstract_claims_supported": { 15 "applies": true, 16 "answer": true, 17 "justification": "All abstract claims are backed by experimental results: DE vs. GGPP/PRADA comparisons in Tables 1-2, ≤5 token budgets confirmed, Welch's t-test for readability in Table 16, and AUROC 0.2023 for detector evasion in Table 4.", 18 "source": "haiku" 19 }, 20 "causal_claims_justified": { 21 "applies": true, 22 "answer": true, 23 "justification": "Causal claims such as 'early stopping cuts query cost by ~40%' are supported by controlled comparisons of DE variants across multiple datasets with consistent ablations (Figure 2, Table 2).", 24 "source": "haiku" 25 }, 26 "generalization_bounded": { 27 "applies": true, 28 "answer": false, 29 "justification": "The title and conclusion claim DeRAG attacks 'multiple RAG applications,' but experiments only cover BERT-base-uncased (dense) and BM25 (sparse) on 1,000-document subsets; modern embedding models and production-scale corpora are untested.", 30 "source": "haiku" 31 }, 32 "alternative_explanations_discussed": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper attributes MS MARCO's high success to corpus redundancy but does not explore whether BERT-base-uncased is unusually vulnerable compared to modern retrievers, nor whether the artificially small corpus (1,000 docs) inflates success rates.", 36 "source": "haiku" 37 }, 38 "proxy_outcome_distinction": { 39 "applies": true, 40 "answer": true, 41 "justification": "The paper distinguishes retrieval rank manipulation (Success@K) from downstream answer 
quality and validates the connection in Table 5, showing EM/F1/ROUGE-L/BERTScore degradation stratified by attack outcome.", 42 "source": "haiku" 43 } 44 }, 45 "limitations_and_scope": { 46 "limitations_section_present": { 47 "applies": true, 48 "answer": false, 49 "justification": "There is no dedicated limitations or threats-to-validity section; the conclusion briefly mentions future defenses but does not enumerate limitations of the current work.", 50 "source": "haiku" 51 }, 52 "threats_to_validity_specific": { 53 "applies": true, 54 "answer": false, 55 "justification": "No specific threats to validity are discussed — e.g., the impact of corpus size (1,000 docs vs. production scale), retriever model choice, or the small query count (100) on result generalizability.", 56 "source": "haiku" 57 }, 58 "scope_boundaries_stated": { 59 "applies": true, 60 "answer": false, 61 "justification": "The paper does not explicitly bound what the results do NOT show — for example, whether attacks transfer to instruction-tuned embedding models, larger corpora, or API-based retrieval services.", 62 "source": "haiku" 63 } 64 }, 65 "conflicts_of_interest": { 66 "funding_disclosed": { 67 "applies": true, 68 "answer": false, 69 "justification": "No funding source is disclosed anywhere in the paper.", 70 "source": "haiku" 71 }, 72 "affiliations_disclosed": { 73 "applies": true, 74 "answer": true, 75 "justification": "Both authors' affiliations (Department of Management Information Systems, National ChengChi University, Taipei, Taiwan) are disclosed on the title page.", 76 "source": "haiku" 77 }, 78 "funder_independent_of_outcome": { 79 "applies": false, 80 "answer": false, 81 "justification": "No funder is identified, so independence cannot be assessed.", 82 "source": "haiku" 83 }, 84 "financial_interests_declared": { 85 "applies": true, 86 "answer": false, 87 "justification": "No competing interests or financial interests statement is present in the paper.", 88 "source": "haiku" 89 } 
90 }, 91 "scope_and_framing": { 92 "key_terms_defined": { 93 "applies": true, 94 "answer": true, 95 "justification": "Key terms are defined: RAG (Section 1), Differential Evolution (Section 2.2), dense/sparse retrievers (Section 2.4), and all evaluation metrics (Success@K, ΔMRR, ΔnDCG, Δcos) are formally defined in Section 4.1.", 96 "source": "haiku" 97 }, 98 "intended_contribution_clear": { 99 "applies": true, 100 "answer": true, 101 "justification": "The paper clearly states its contribution: a gradient-free, black-box adversarial attack (DeRAG) using Differential Evolution to generate short adversarial suffixes that manipulate RAG retrieval rankings without model internals.", 102 "source": "haiku" 103 }, 104 "engagement_with_prior_work": { 105 "applies": true, 106 "answer": true, 107 "justification": "A four-subsection related work covers adversarial prompts, evolutionary optimization, detection methods, and retrieval mechanisms; the paper explicitly compares against GGPP (white-box) and PRADA (sparse black-box) throughout.", 108 "source": "haiku" 109 } 110 } 111 }, 112 "type_checklist": { 113 "empirical": { 114 "artifacts": { 115 "code_released": { 116 "applies": true, 117 "answer": true, 118 "justification": "Source code is released at https://github.com/pen9rum/Rag_attack_DeRag, explicitly referenced in Section 4.1.", 119 "source": "haiku" 120 }, 121 "data_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "All datasets used (MS MARCO, SciFact, FiQA, FEVER, SQuAD, NQ-Open) are standard publicly available benchmarks accessible via the BEIR framework.", 125 "source": "haiku" 126 }, 127 "environment_specified": { 128 "applies": true, 129 "answer": false, 130 "justification": "No requirements.txt, Dockerfile, or version-pinned dependency specifications are provided; only the model name (BERT-base-uncased) is mentioned without library or Python version details.", 131 "source": "haiku" 132 }, 133 "reproduction_instructions": { 134 "applies": 
true, 135 "answer": false, 136 "justification": "The algorithm is described via pseudocode (Algorithm 1) but no step-by-step instructions for reproducing the specific experimental results (table entries, figures) are provided in the paper.", 137 "source": "haiku" 138 } 139 }, 140 "statistical_methodology": { 141 "confidence_intervals_or_error_bars": { 142 "applies": true, 143 "answer": false, 144 "justification": "Main attack success results (Tables 1-2) report point estimates only; standard deviations appear only for MLM NLL in Table 15, not for primary Success@K metrics.", 145 "source": "haiku" 146 }, 147 "significance_tests": { 148 "applies": true, 149 "answer": false, 150 "justification": "Welch's t-test is used only for the MLM NLL readability comparison (Table 16); the primary comparative claims — DE vs. GGPP vs. PRADA attack success — are made without any statistical significance testing.", 151 "source": "haiku" 152 }, 153 "effect_sizes_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Effect sizes are reported across multiple metrics: Success@K, ΔMRR, ΔnDCG, Δcos, and downstream percentage drops in EM/F1/ROUGE-L in Table 5.", 157 "source": "haiku" 158 }, 159 "sample_size_justified": { 160 "applies": true, 161 "answer": false, 162 "justification": "The choice of 100 queries and 1,000 documents per dataset is not justified through power analysis or reasoning about statistical adequacy.", 163 "source": "haiku" 164 }, 165 "variance_reported": { 166 "applies": true, 167 "answer": false, 168 "justification": "Main results tables (Tables 1-2) report no variance; standard deviations appear only for MLM NLL (Table 15), not for primary attack success metrics.", 169 "source": "haiku" 170 } 171 }, 172 "evaluation_design": { 173 "baselines_included": { 174 "applies": true, 175 "answer": true, 176 "justification": "Three baselines are included: GGPP (gradient-based white-box), PRADA (sparse black-box), and random suffix.", 177 "source": "haiku" 
178 }, 179 "baselines_contemporary": { 180 "applies": true, 181 "answer": true, 182 "justification": "GGPP (2024) and PRADA (2022) are the most relevant contemporary methods for the dense and sparse retriever attack settings respectively.", 183 "source": "haiku" 184 }, 185 "ablation_study": { 186 "applies": true, 187 "answer": true, 188 "justification": "Extensive ablations: DE variants (seq_stop vs. fixed_stop vs. seq), suffix length effects (Figure 4, Appendix D), loss function comparison (Appendix E), prefix vs. suffix positioning (Table 3), and candidate pool size effects (Table 6).", 189 "source": "haiku" 190 }, 191 "multiple_metrics": { 192 "applies": true, 193 "answer": true, 194 "justification": "Multiple metrics: Success@K (K=1,10,20), Avg Tok, Avg Iter, ΔMRR, ΔnDCG, Δcos, EM, F1, ROUGE-L, BERTScore, AUROC, AUPRC, MLM NLL.", 195 "source": "haiku" 196 }, 197 "human_evaluation": { 198 "applies": false, 199 "answer": false, 200 "justification": "Human evaluation is not applicable; attack effectiveness is measured through automated retrieval and QA metrics.", 201 "source": "haiku" 202 }, 203 "held_out_test_set": { 204 "applies": false, 205 "answer": false, 206 "justification": "This is an adversarial optimization task, not a supervised prediction task; the held-out test set concept does not apply.", 207 "source": "haiku" 208 }, 209 "per_category_breakdown": { 210 "applies": true, 211 "answer": true, 212 "justification": "Results are broken down per dataset (SciFact, FiQA, FEVER, MS MARCO) and per retrieval threshold (K=1, 10, 20) across all main tables.", 213 "source": "haiku" 214 }, 215 "failure_cases_discussed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Table 3 explicitly tabulates queries where both prefix and suffix attacks fail; Section 4.5 stratifies outcomes into Top-1 success, Top-10-only, and Fail with corresponding quality metrics.", 219 "source": "haiku" 220 }, 221 "negative_results_reported": { 222 "applies": true, 223 
"answer": true, 224 "justification": "Negative results include: cosine loss underperforming hinge loss (Table 12/Appendix E), monotonic suffix length schedule not improving results (Appendix D), and low Succ@1 rates (~10-20%) on most datasets.", 225 "source": "haiku" 226 } 227 }, 228 "setup_transparency": { 229 "model_versions_specified": { 230 "applies": true, 231 "answer": false, 232 "justification": "BERT-base-uncased is specified for retrieval, but the LLM used for answer generation in the downstream RAG pipeline (Section 4.5, Table 5) is never named or versioned — a critical omission.", 233 "source": "haiku" 234 }, 235 "prompts_provided": { 236 "applies": true, 237 "answer": false, 238 "justification": "No system prompts or query templates for the downstream RAG generator LLM are provided; appendix tables show adversarial suffix output examples but not the generation prompts used.", 239 "source": "haiku" 240 }, 241 "hyperparameters_reported": { 242 "applies": true, 243 "answer": false, 244 "justification": "DE hyperparameters (F, CR, N, patience T) are described as typical ranges (e.g., F ∈ [0.5, 1.0], CR ∈ [0.1, 0.9]) rather than the exact values used in the reported experiments.", 245 "source": "haiku" 246 }, 247 "scaffolding_described": { 248 "applies": false, 249 "answer": false, 250 "justification": "No agentic scaffolding is used; this is a direct adversarial optimization attack on retrieval systems.", 251 "source": "haiku" 252 }, 253 "data_preprocessing_documented": { 254 "applies": true, 255 "answer": true, 256 "justification": "Data preprocessing is documented: random sampling of 1,000 documents and 100 queries from official BEIR corpus/query splits, BERT-base-uncased CLS embedding extraction (768-dim), cosine similarity retrieval.", 257 "source": "haiku" 258 } 259 }, 260 "data_integrity": { 261 "raw_data_available": { 262 "applies": true, 263 "answer": false, 264 "justification": "Per-query attack outcomes and generated adversarial suffixes are not 
released as structured data; only the code repository is linked and we cannot verify its contents from the paper.", 265 "source": "haiku" 266 }, 267 "data_collection_described": { 268 "applies": true, 269 "answer": true, 270 "justification": "Data collection is described: standard BEIR benchmarks, random sampling of 1,000-document subsets and 100 queries from official splits, target documents chosen as non-relevant passages or topically confusable distractors.", 271 "source": "haiku" 272 }, 273 "recruitment_methods_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants; all data comes from standard NLP benchmarks.", 277 "source": "haiku" 278 }, 279 "data_pipeline_documented": { 280 "applies": true, 281 "answer": true, 282 "justification": "The full pipeline is documented: query/target selection → BERT embedding → DE optimization loop (Algorithm 1) → retrieval evaluation → downstream QA generation and scoring.", 283 "source": "haiku" 284 } 285 }, 286 "contamination": { 287 "training_cutoff_stated": { 288 "applies": false, 289 "answer": false, 290 "justification": "This paper evaluates an adversarial attack on retrieval ranking, not LLM benchmark knowledge recall; standard contamination concerns do not apply.", 291 "source": "haiku" 292 }, 293 "train_test_overlap_discussed": { 294 "applies": false, 295 "answer": false, 296 "justification": "Not applicable; benchmarks are used as retrieval corpora to be manipulated, not as knowledge tests for a generative model.", 297 "source": "haiku" 298 }, 299 "benchmark_contamination_addressed": { 300 "applies": false, 301 "answer": false, 302 "justification": "Not applicable for the same reason as above.", 303 "source": "haiku" 304 } 305 }, 306 "human_studies": { 307 "pre_registered": { 308 "applies": false, 309 "answer": false, 310 "justification": "No human participants in this study.", 311 "source": "haiku" 312 }, 313 "irb_or_ethics_approval": { 314 "applies": false, 315 "answer": 
false, 316 "justification": "No human participants in this study.", 317 "source": "haiku" 318 }, 319 "demographics_reported": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants in this study.", 323 "source": "haiku" 324 }, 325 "inclusion_exclusion_criteria": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants in this study.", 329 "source": "haiku" 330 }, 331 "randomization_described": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants in this study.", 335 "source": "haiku" 336 }, 337 "blinding_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants in this study.", 341 "source": "haiku" 342 }, 343 "attrition_reported": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants in this study.", 347 "source": "haiku" 348 } 349 }, 350 "cost_and_practicality": { 351 "inference_cost_reported": { 352 "applies": true, 353 "answer": true, 354 "justification": "Per-query iteration counts (Table 2) and pool construction/query optimization times in seconds (Table 14) are reported, providing practical cost information.", 355 "source": "haiku" 356 }, 357 "compute_budget_stated": { 358 "applies": true, 359 "answer": false, 360 "justification": "Total computational budget (GPU/CPU hours, hardware specifications) is not stated; only per-query timing data are provided.", 361 "source": "haiku" 362 } 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "DE-based black-box attack achieves competitive or higher success rates than GGPP (gradient white-box) at Top-10 and Top-20 thresholds on dense retrievers", 369 "evidence": "Table 2: DE_seq_stop matches or exceeds GGPP at Succ@10/Succ@20 on SciFact (0.573 vs 0.458) and FiQA (0.520 vs 0.480), though GGPP dominates Succ@1 on MS MARCO (0.830 vs 0.570)", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "Effective adversarial suffixes require only 
2-3 tokens on average", 374 "evidence": "DE_seq_stop achieves average suffix lengths of 1.32 (MS MARCO) to 2.76 (FEVER) tokens while maintaining high Top-10/Top-20 success rates (Table 2)", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Early stopping reduces query cost by approximately 40% without reducing attack success", 379 "evidence": "Figure 2 shows DE_seq_stop reaches 97% Top-10 success at 2 tokens while DE_seq needs 3-4; Section 3.3.3 states the hybrid strategy 'cuts average query cost by ~40%'", 380 "supported": "strong" 381 }, 382 { 383 "claim": "DE-generated suffixes evade BERT-based and RoBERTa-based adversarial detection", 384 "evidence": "Table 4: RoBERTa detector achieves AUROC 0.2023 and AUPRC 0.4665 at 0.5% FPR target; Table 13: CLS attack probability is near-identical for original vs. attacked queries (~0.40)", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Readability-aware MLM pooling strategy significantly reduces suffix perplexity without degrading attack success", 389 "evidence": "Table 16: Welch's t-test yields p < 1e-9 for NLL reduction across all three datasets; Table 6 shows stable Success@1 across pool sizes from 500 to 30,522", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Adversarial retrieval manipulation causes substantial downstream answer quality degradation in real QA pipelines", 394 "evidence": "Table 5 shows 83.5% EM drop on SQuAD when target reaches rank 1 and 14.8% average EM drop across NQ-Open; tested on only 500 queries per dataset with unspecified generator LLM", 395 "supported": "moderate" 396 } 397 ], 398 "methodology_tags": ["benchmark-eval"], 399 "key_findings": "DeRAG demonstrates that gradient-free Differential Evolution can generate adversarial query suffixes of 2-3 tokens that effectively manipulate RAG retrieval rankings, matching gradient-based white-box attacks (GGPP) at broader retrieval thresholds (Top-10, Top-20) while requiring no model internals. 
The attack evades RoBERTa-based detectors, which achieve only below-chance discrimination (AUROC 0.2023), and causes measurable downstream answer quality degradation (14-27% average EM drop on QA benchmarks, up to 83.5% when the adversarial document reaches rank 1). A readability-aware suffix construction strategy using MLM token pooling statistically significantly reduces suffix perplexity (Welch's t, p < 1e-9) without degrading attack success. However, the experiments use only BERT-base-uncased (dense) and BM25 (sparse) retrievers on artificially small 1,000-document corpus subsets, limiting generalizability claims.", 400 "red_flags": [ 401 { 402 "flag": "Unrealistically small corpus", 403 "detail": "Experiments use only 1,000-document subsets of BEIR datasets; production RAG systems operate over millions of documents where attack rank targets and success rates would differ substantially." 404 }, 405 { 406 "flag": "No CIs or significance tests on primary results", 407 "detail": "Attack success rates (Success@K, ΔMRR, ΔnDCG) in Tables 1-2 are point estimates from 100 queries per dataset with no confidence intervals or statistical significance tests for the main comparative claims." 408 }, 409 { 410 "flag": "Single retriever model tested", 411 "detail": "Dense retrieval experiments use only BERT-base-uncased (2018); modern instruction-tuned embedding models widely used in production (E5, GTE, OpenAI text-embedding-3) are untested." 412 }, 413 { 414 "flag": "Generator LLM unspecified for downstream evaluation", 415 "detail": "Section 4.5 evaluates downstream RAG answer quality (Table 5) but never names or versions the LLM used for generation, making these results unreproducible." 416 }, 417 { 418 "flag": "Exact DE hyperparameters not reported", 419 "detail": "The paper provides typical ranges for DE parameters (F ∈ [0.5, 1.0], CR ∈ [0.1, 0.9], N, T) but not the exact values used in the reported experiments." 
420 }, 421 { 422 "flag": "Title overstates breadth of coverage", 423 "detail": "Title claims attacks on 'Multiple Retrieval-Augmented Generation Applications' but only one dense retriever (BERT-base-uncased) and one sparse retriever (BM25) on small corpus subsets are tested." 424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "Prompt Perturbation in Retrieval-Augmented Generation based Large Language Models (GGPP)", 429 "relevance": "Primary dense-retriever baseline; gradient-based white-box attack on RAG that DeRAG is designed to compete with without gradient access" 430 }, 431 { 432 "title": "PRADA: Practical Black-box Adversarial Attacks against Neural Ranking Models", 433 "relevance": "Primary sparse-retriever baseline; the most comparable black-box adversarial ranking attack method" 434 }, 435 { 436 "title": "BEIR: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models", 437 "relevance": "Evaluation framework used for all retrieval experiments across SciFact, FiQA, FEVER, MS MARCO" 438 }, 439 { 440 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 441 "relevance": "Original RAG paper; defines the RAG paradigm whose retrieval stage DeRAG attacks" 442 }, 443 { 444 "title": "BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of Large Language Models", 445 "relevance": "Related poisoning-based backdoor attack on RAG corpora; complementary threat model" 446 }, 447 { 448 "title": "CtrlRAG: Black-box Adversarial Attacks Based on Masked Language Models in Retrieval-Augmented Language Generation", 449 "relevance": "Related black-box RAG attack using MLM; close competitor using a different gradient-free approach" 450 }, 451 { 452 "title": "Differential evolution – a simple and efficient heuristic for global optimization over continuous spaces", 453 "relevance": "Foundational algorithm (Storn & Price 1997) underlying the DeRAG optimization method" 454 }, 455 { 456 "title": "BERT: Pre-training of 
Deep Bidirectional Transformers for Language Understanding", 457 "relevance": "Core retrieval encoder used in all dense retrieval experiments" 458 } 459 ], 460 "engagement_factors": { 461 "practical_relevance": { 462 "score": 2, 463 "justification": "Demonstrates a deployable black-box attack against production RAG systems with public code; directly actionable for practitioners assessing RAG security." 464 }, 465 "surprise_contrarian": { 466 "score": 1, 467 "justification": "The finding that ≤5 tokens suffice for effective retrieval manipulation is noteworthy, but RAG vulnerability to adversarial attacks is an expected result in this field." 468 }, 469 "fear_safety": { 470 "score": 2, 471 "justification": "Shows RAG systems can be manipulated to surface misinformation via small, detector-evading token appends — a concrete and practical AI safety concern for deployed systems." 472 }, 473 "drama_conflict": { 474 "score": 1, 475 "justification": "The attack-vs-defense framing is inherently adversarial but the paper is technical rather than polemical; no controversial claims about deployed systems." 476 }, 477 "demo_ability": { 478 "score": 2, 479 "justification": "Code is released on GitHub using public BEIR benchmarks and BERT-base-uncased, making reproduction accessible to practitioners without specialized resources." 480 }, 481 "brand_recognition": { 482 "score": 0, 483 "justification": "Authors are from National ChengChi University (Taiwan), a respected institution but not a major AI lab with brand recognition in the LLM community." 484 } 485 }, 486 "hn_data": { 487 "threads": [ 488 { 489 "hn_id": "44120359", 490 "title": "Diffusion vs. 
Autoregressive Language Models: A Text Embedding Perspective", 491 "points": 19, 492 "comments": 1, 493 "url": "https://news.ycombinator.com/item?id=44120359" 494 }, 495 { 496 "hn_id": "36931866", 497 "title": "Universal and Transferable Adversarial Attacks on LLM", 498 "points": 3, 499 "comments": 0, 500 "url": "https://news.ycombinator.com/item?id=36931866" 501 }, 502 { 503 "hn_id": "36903968", 504 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 505 "points": 1, 506 "comments": 0, 507 "url": "https://news.ycombinator.com/item?id=36903968" 508 } 509 ], 510 "top_points": 19, 511 "total_points": 23, 512 "total_comments": 1 513 } 514 }