scan-v4.json (33320B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "DeRAG: Black-box Adversarial Attacks on Multiple Retrieval-Augmented Generation Applications via Prompt Injection", 6 "authors": [ 7 "Jerry Wang", 8 "Fang Yu" 9 ], 10 "year": 2025, 11 "venue": "KDD Workshop on Prompt Optimization", 12 "arxiv_id": "2507.15042", 13 "doi": "10.48550/arXiv.2507.15042" 14 }, 15 "checklist": { 16 "claims_and_evidence": { 17 "abstract_claims_supported": { 18 "applies": true, 19 "answer": true, 20 "justification": "The abstract claims competitive success rates vs GGPP (supported by Table 2), few tokens (supported by avg token columns), readability improvement validated by Welch's t-test (supported by Table 16), and detection evasion (supported by Table 4). The claims are reasonably hedged ('competitive and in some cases higher').", 21 "source": "opus" 22 }, 23 "causal_claims_justified": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper's causal claims (e.g., DE-optimized suffixes cause retrieval manipulation, early stopping reduces iterations) are supported by controlled single-variable ablations — comparing methods on the same data with only the attack strategy varying. The ablation studies (Section 4.3.2-4.3.3) isolate individual components.", 27 "source": "opus" 28 }, 29 "generalization_bounded": { 30 "applies": true, 31 "answer": false, 32 "justification": "The title claims attacks on 'Multiple Retrieval-Augmented Generation Applications' but experiments use only BERT-base-uncased as the dense retriever and BM25 as the sparse retriever, with 1,000-document subsets. The conclusion states 'DeRAG reveals critical RAG vulnerabilities' without bounding this to the specific retrievers and small corpus sizes tested. 
Real RAG systems use more sophisticated retrievers and much larger corpora.", 33 "source": "opus" 34 }, 35 "alternative_explanations_discussed": { 36 "applies": true, 37 "answer": false, 38 "justification": "No discussion of alternative explanations. For instance, the high success rate on MS MARCO might be explained by corpus-specific properties rather than DE's general effectiveness. The paper does not consider whether results would hold with different retriever architectures, larger corpora, or whether BERT-base-uncased is unusually vulnerable.", 39 "source": "opus" 40 }, 41 "proxy_outcome_distinction": { 42 "applies": true, 43 "answer": true, 44 "justification": "The paper distinguishes between the retrieval manipulation proxy (Success@K, ΔMRR) and the downstream impact on answer quality, running a separate evaluation on SQuAD and NQ-Open (Section 4.5, Table 5) to validate that retrieval rank manipulation actually degrades generated answers.", 45 "source": "opus" 46 } 47 }, 48 "limitations_and_scope": { 49 "limitations_section_present": { 50 "applies": true, 51 "answer": false, 52 "justification": "There is no limitations, threats-to-validity, or discussion section. The paper goes directly from evaluation results (Section 4) to a brief conclusion (Section 5) with no dedicated limitations discussion.", 53 "source": "opus" 54 }, 55 "threats_to_validity_specific": { 56 "applies": true, 57 "answer": false, 58 "justification": "No threats to validity are discussed. The paper does not address limitations such as the small corpus size (1,000 documents vs. real-world millions), the single retriever model, or the gap between experimental and production RAG settings.", 59 "source": "opus" 60 }, 61 "scope_boundaries_stated": { 62 "applies": true, 63 "answer": false, 64 "justification": "No explicit scope boundaries are stated. 
The paper does not specify what the results do NOT show, such as applicability to production-scale corpora, modern instruction-tuned retrievers, or multi-stage RAG pipelines with rerankers.", 65 "source": "opus" 66 } 67 }, 68 "conflicts_of_interest": { 69 "funding_disclosed": { 70 "applies": true, 71 "answer": false, 72 "justification": "No funding sources are mentioned anywhere in the paper. There is no acknowledgments section disclosing grants or sponsors.", 73 "source": "opus" 74 }, 75 "affiliations_disclosed": { 76 "applies": true, 77 "answer": true, 78 "justification": "Author affiliations are clearly listed: both authors are from the Department of Management Information Systems, National ChengChi University, Taipei, Taiwan. No conflict with evaluated systems.", 79 "source": "opus" 80 }, 81 "funder_independent_of_outcome": { 82 "applies": true, 83 "answer": false, 84 "justification": "No funding is disclosed, so independence of funder cannot be assessed. The absence of any funding disclosure makes this NO by default.", 85 "source": "opus" 86 }, 87 "financial_interests_declared": { 88 "applies": true, 89 "answer": false, 90 "justification": "No competing interests or financial interests statement is present in the paper.", 91 "source": "opus" 92 } 93 }, 94 "scope_and_framing": { 95 "key_terms_defined": { 96 "applies": true, 97 "answer": true, 98 "justification": "RAG, Differential Evolution, adversarial prompt suffix, prompt injection, sparse/dense retrieval, and Success@K are all defined or explained in Sections 1–3.", 99 "source": "haiku" 100 }, 101 "intended_contribution_clear": { 102 "applies": true, 103 "answer": true, 104 "justification": "The contribution is explicitly stated: a gradient-free, black-box DE-based adversarial suffix optimization method (DeRAG) for manipulating RAG retrieval rankings without model internals access.", 105 "source": "haiku" 106 }, 107 "engagement_with_prior_work": { 108 "applies": true, 109 "answer": true, 110 "justification": 
"Section 2 covers related work on adversarial prompts (GGPP, TrojRAG, CtrlRAG), evolutionary optimization (DE, one-pixel attack), detection methods (APS, perplexity-based), and retrieval (BM25, DPR), with direct experimental comparison to GGPP and PRADA.", 111 "source": "haiku" 112 } 113 } 114 }, 115 "type_checklist": { 116 "empirical": { 117 "artifacts": { 118 "code_released": { 119 "applies": true, 120 "answer": true, 121 "justification": "A GitHub repository is provided: https://github.com/pen9rum/Rag_attack_DeRag (reference [34]), described as containing data and experiment results.", 122 "source": "opus" 123 }, 124 "data_released": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper uses publicly available BEIR benchmark datasets (MS MARCO, SciFact, FiQA, FEVER), and states 'Data and experiment results are available at' the GitHub repository (Section 4.1).", 128 "source": "opus" 129 }, 130 "environment_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "No environment specifications (requirements.txt, Dockerfile, library versions) are provided in the paper. Only the model (BERT-base-uncased, 110M parameters) is mentioned.", 134 "source": "opus" 135 }, 136 "reproduction_instructions": { 137 "applies": true, 138 "answer": false, 139 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but the paper itself contains no README-style instructions for replicating experiments.", 140 "source": "opus" 141 } 142 }, 143 "statistical_methodology": { 144 "confidence_intervals_or_error_bars": { 145 "applies": true, 146 "answer": false, 147 "justification": "All main results in Tables 1-5 report only point estimates (e.g., Success@1 = 0.198) with no confidence intervals or error bars. 
The only ± notation appears in Table 15 for MLM NLL, not for the main attack results.", 148 "source": "opus" 149 }, 150 "significance_tests": { 151 "applies": true, 152 "answer": false, 153 "justification": "Welch's t-test is used only for the readability analysis (Table 16, Appendix H). The primary comparative claims — that DE outperforms GGPP and PRADA — are based solely on comparing point estimates across Tables 1 and 2 without any significance tests.", 154 "source": "opus" 155 }, 156 "effect_sizes_reported": { 157 "applies": true, 158 "answer": true, 159 "justification": "The paper reports multiple effect size metrics: ΔMRR, ΔnDCG@20, and Δcos (Tables 1, 2, 7), providing context for the magnitude of retrieval disruption. Downstream quality degradation is reported as percentage changes from baseline (Table 5).", 160 "source": "opus" 161 }, 162 "sample_size_justified": { 163 "applies": true, 164 "answer": false, 165 "justification": "The paper uses 100 queries and 1,000-document subsets per dataset without any justification for why these sizes were chosen. No power analysis or discussion of whether 100 queries is sufficient for reliable estimates.", 166 "source": "opus" 167 }, 168 "variance_reported": { 169 "applies": true, 170 "answer": false, 171 "justification": "No standard deviations, variance, or multi-run spread measures are reported for the main attack results. Tables 1-5 all present single-run point estimates. 
DE is stochastic but no seed sensitivity is explored.", 172 "source": "opus" 173 } 174 }, 175 "evaluation_design": { 176 "baselines_included": { 177 "applies": true, 178 "answer": true, 179 "justification": "The paper compares against GGPP (white-box gradient-guided attack, reference [23]), PRADA (black-box sparse retrieval attack, reference [16]), and a random suffix baseline (Tables 1-2).", 180 "source": "opus" 181 }, 182 "baselines_contemporary": { 183 "applies": true, 184 "answer": true, 185 "justification": "GGPP (Hu et al., 2024) and PRADA (Wu & Zhang, 2022) are recent and represent the state of the art for white-box and black-box retrieval attacks respectively.", 186 "source": "opus" 187 }, 188 "ablation_study": { 189 "applies": true, 190 "answer": true, 191 "justification": "Multiple ablations are conducted: DE_seq_stop vs DE_seq vs DE_fixed_stop (Section 4.3.2, Figure 2), prefix vs suffix attacks (Table 3), suffix length variation (Appendix D, Figure 4), hinge loss vs cosine loss (Appendix E, Table 12), and pool size variation (Table 6).", 192 "source": "opus" 193 }, 194 "multiple_metrics": { 195 "applies": true, 196 "answer": true, 197 "justification": "The paper uses Success@K (K=1,10,20), ΔMRR, ΔnDCG@20, Δcos, average tokens, and average iterations for attack evaluation. Downstream evaluation adds EM, F1, ROUGE-L, and BERTScore (Table 5).", 198 "source": "opus" 199 }, 200 "human_evaluation": { 201 "applies": true, 202 "answer": false, 203 "justification": "No human evaluation is included. The readability of adversarial suffixes is assessed only via automated MLM NLL proxy (Appendix H), not human judgment. Detection evasion is tested only against automated detectors.", 204 "source": "opus" 205 }, 206 "held_out_test_set": { 207 "applies": true, 208 "answer": false, 209 "justification": "No separation between development and test sets. The same 100 queries per dataset are used for both optimization and evaluation. 
There is no held-out test set mentioned.", 210 "source": "opus" 211 }, 212 "per_category_breakdown": { 213 "applies": true, 214 "answer": true, 215 "justification": "Results are broken down by dataset (SciFact, FiQA, FEVER, MS MARCO), by method variant, by retrieval threshold K (1, 10, 20), and by attack outcome stratum (Top-1, Top-10 only, Fail) in Table 5.", 216 "source": "opus" 217 }, 218 "failure_cases_discussed": { 219 "applies": true, 220 "answer": true, 221 "justification": "Table 3 reports query counts where both prefix and suffix fail. Table 5 stratifies results by 'Fail' outcome. The paper notes diminishing returns beyond 5 tokens (Appendix D) and discusses when cosine loss fails vs hinge loss (Appendix E).", 222 "source": "opus" 223 }, 224 "negative_results_reported": { 225 "applies": true, 226 "answer": true, 227 "justification": "The paper reports that cosine loss is consistently worse than hinge loss (Appendix E, Table 12), that a monotonic suffix length schedule did not improve results (Section 4.3.3, Appendix D), and that suffixes beyond 5 tokens yield diminishing or negative marginal returns (Figure 4b).", 228 "source": "opus" 229 } 230 }, 231 "setup_transparency": { 232 "model_versions_specified": { 233 "applies": true, 234 "answer": true, 235 "justification": "The main retriever is specified as 'BERT-base-uncased encoder' with 'over 110 million parameters' and '768-dimensional CLS embeddings' (Section 4.1). For detection, 'RoBERTa detector' is named (Section 4.4.1). These are specific, uniquely identifiable models.", 236 "source": "opus" 237 }, 238 "prompts_provided": { 239 "applies": true, 240 "answer": false, 241 "justification": "The downstream QA evaluation (Section 4.5) feeds retrieved chunks to an LLM generator, but the generation prompt/template is not provided. 
The adversarial suffixes themselves are shown (Appendix C), but the prompt format for the downstream answer generation is missing.", 242 "source": "opus" 243 }, 244 "hyperparameters_reported": { 245 "applies": true, 246 "answer": true, 247 "justification": "DE hyperparameters are reported: scale factor F (0.5-1.0), crossover rate CR (0.1-0.9), suffix length budget n_max (1-10), patience T, and population size N (Section 3.3). Specific values for the example: N=3, F=0.5, CR=0.5. The downstream evaluation uses 5-token suffix with up to 120 iterations (Section 4.5).", 248 "source": "opus" 249 }, 250 "scaffolding_described": { 251 "applies": false, 252 "answer": false, 253 "justification": "No agentic scaffolding is used. The method is a direct optimization loop (DE) over token sequences evaluated against a retriever.", 254 "source": "opus" 255 }, 256 "data_preprocessing_documented": { 257 "applies": true, 258 "answer": false, 259 "justification": "The paper states 'we randomly sample a subset of 1,000 documents and 100 queries' but leaves ambiguity in target passage selection: 'either drawn uniformly at random from the non-relevant documents, or selected as a topically confusable distractor' (Section 4.1) — it is unclear which method was actually used in which experiments.", 260 "source": "opus" 261 } 262 }, 263 "data_integrity": { 264 "raw_data_available": { 265 "applies": true, 266 "answer": true, 267 "justification": "The underlying BEIR datasets are publicly available, and the paper states 'Data and experiment results are available at' the GitHub repository (Section 4.1, reference [34]).", 268 "source": "opus" 269 }, 270 "data_collection_described": { 271 "applies": true, 272 "answer": true, 273 "justification": "Section 4.1 describes the data source (BEIR framework), sampling procedure (1,000 documents and 100 queries per dataset), encoding method (BERT-base-uncased CLS embeddings), and retrieval mechanism (cosine similarity for dense, BM25 for sparse).", 274 
"source": "opus" 275 }, 276 "recruitment_methods_described": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants. All data comes from standard public benchmarks (BEIR/MS MARCO/SciFact/FiQA/FEVER).", 280 "source": "opus" 281 }, 282 "data_pipeline_documented": { 283 "applies": true, 284 "answer": false, 285 "justification": "The pipeline from raw BEIR data to experimental setup has gaps. The target passage selection method is ambiguously described ('either drawn uniformly at random... or selected as a topically confusable distractor'). The specific random seed for sampling is not stated, and the downstream QA evaluation pipeline details are sparse.", 286 "source": "opus" 287 } 288 }, 289 "contamination": { 290 "training_cutoff_stated": { 291 "applies": false, 292 "answer": false, 293 "justification": "The paper evaluates an adversarial attack method against a retriever, not a pre-trained model's knowledge on a benchmark. BERT-base-uncased is used as a retrieval encoder, and the attack's validity does not depend on whether the model has seen the benchmark data.", 294 "source": "opus" 295 }, 296 "train_test_overlap_discussed": { 297 "applies": false, 298 "answer": false, 299 "justification": "The paper tests an attack strategy on retrieval behavior, not model knowledge. Train/test overlap in the BERT pre-training data is irrelevant to whether adversarial suffixes can manipulate retrieval rankings.", 300 "source": "opus" 301 }, 302 "benchmark_contamination_addressed": { 303 "applies": false, 304 "answer": false, 305 "justification": "Contamination is not relevant because the paper evaluates the effectiveness of an adversarial attack method on retrieval systems, not the retriever's inherent knowledge of benchmark answers.", 306 "source": "opus" 307 } 308 }, 309 "human_studies": { 310 "pre_registered": { 311 "applies": false, 312 "answer": false, 313 "justification": "No human participants in this study. 
All experiments are automated attacks on retrieval systems.", 314 "source": "opus" 315 }, 316 "irb_or_ethics_approval": { 317 "applies": false, 318 "answer": false, 319 "justification": "No human participants. Pure computational experiment on retrieval benchmarks.", 320 "source": "opus" 321 }, 322 "demographics_reported": { 323 "applies": false, 324 "answer": false, 325 "justification": "No human participants.", 326 "source": "opus" 327 }, 328 "inclusion_exclusion_criteria": { 329 "applies": false, 330 "answer": false, 331 "justification": "No human participants.", 332 "source": "opus" 333 }, 334 "randomization_described": { 335 "applies": false, 336 "answer": false, 337 "justification": "No human participants.", 338 "source": "opus" 339 }, 340 "blinding_described": { 341 "applies": false, 342 "answer": false, 343 "justification": "No human participants.", 344 "source": "opus" 345 }, 346 "attrition_reported": { 347 "applies": false, 348 "answer": false, 349 "justification": "No human participants.", 350 "source": "opus" 351 } 352 }, 353 "cost_and_practicality": { 354 "inference_cost_reported": { 355 "applies": true, 356 "answer": true, 357 "justification": "Average iteration counts are reported per method per dataset (Tables 1-2). Pool construction and query optimization times are reported in Appendix G (Table 14), with per-query times around 40-50 seconds. Section 4.3.1 discusses the iterations-tokens trade-off in detail.", 358 "source": "opus" 359 }, 360 "compute_budget_stated": { 361 "applies": true, 362 "answer": false, 363 "justification": "No total computational budget is stated. The paper does not report total GPU hours, hardware specifications, or aggregate compute cost across all experiments.", 364 "source": "opus" 365 } 366 }, 367 "experimental_rigor": { 368 "seed_sensitivity_reported": { 369 "applies": true, 370 "answer": false, 371 "justification": "DE is a stochastic algorithm but no multi-seed results are reported. 
All tables present single-run point estimates without any analysis of sensitivity to random initialization.", 372 "source": "opus" 373 }, 374 "number_of_runs_stated": { 375 "applies": true, 376 "answer": false, 377 "justification": "The paper does not state how many times each experiment was run. Results appear to be from single runs, but this is never explicitly stated.", 378 "source": "opus" 379 }, 380 "hyperparameter_search_budget": { 381 "applies": true, 382 "answer": false, 383 "justification": "DE hyperparameters (F, CR, N, patience) are reported but there is no description of how they were selected — no search budget, no grid/random search, no sensitivity analysis over hyperparameter configurations.", 384 "source": "opus" 385 }, 386 "best_config_selection_justified": { 387 "applies": true, 388 "answer": false, 389 "justification": "The paper presents results for DE_seq_stop, DE_fixed_stop, and DE_seq variants but does not justify how specific hyperparameter values were chosen or whether these are the best among tested configurations.", 390 "source": "opus" 391 }, 392 "multiple_comparison_correction": { 393 "applies": true, 394 "answer": false, 395 "justification": "The paper compares multiple methods across multiple datasets and thresholds without any correction for multiple comparisons. Only the readability analysis uses a significance test (Welch's t-test), with no family-wise correction.", 396 "source": "opus" 397 }, 398 "self_comparison_bias_addressed": { 399 "applies": true, 400 "answer": false, 401 "justification": "The authors implement their own version of GGPP and PRADA baselines for comparison. There is no acknowledgment that their reimplementation may systematically underperform the original authors' implementations.", 402 "source": "opus" 403 }, 404 "compute_budget_vs_performance": { 405 "applies": true, 406 "answer": false, 407 "justification": "GGPP iterations are listed as '—' in Table 2, making matched-compute comparison impossible. 
The paper discusses the iterations-tokens trade-off qualitatively (Section 4.3.1) but does not compare methods at equivalent compute budgets.", 408 "source": "opus" 409 }, 410 "benchmark_construct_validity": { 411 "applies": true, 412 "answer": false, 413 "justification": "No discussion of whether Success@K on 1,000-document subsets with BERT-base-uncased actually measures vulnerability of real-world RAG systems, which use much larger corpora and more sophisticated retrievers.", 414 "source": "opus" 415 }, 416 "scaffold_confound_addressed": { 417 "applies": false, 418 "answer": false, 419 "justification": "No scaffolding is involved. The evaluation is a direct optimization against a retriever, not a scaffold-dependent agent evaluation.", 420 "source": "opus" 421 } 422 }, 423 "data_leakage": { 424 "temporal_leakage_addressed": { 425 "applies": true, 426 "answer": false, 427 "justification": "No discussion of whether BERT-base-uncased's pre-training data includes any of the BEIR benchmark documents. BERT (2018) may have been trained on data overlapping with some benchmarks.", 428 "source": "opus" 429 }, 430 "feature_leakage_addressed": { 431 "applies": true, 432 "answer": false, 433 "justification": "No discussion of whether the attack evaluation setup leaks information that would not be available in a real attack scenario (e.g., knowing the exact corpus and encoder used).", 434 "source": "opus" 435 }, 436 "non_independence_addressed": { 437 "applies": true, 438 "answer": false, 439 "justification": "No discussion of whether the sampled 100 queries and 1,000 documents are independent or share structural similarities that could inflate success rates.", 440 "source": "opus" 441 }, 442 "leakage_detection_method": { 443 "applies": true, 444 "answer": false, 445 "justification": "No leakage detection or prevention methods are used. 
No canary strings, membership inference, or decontamination analysis.", 446 "source": "opus" 447 } 448 } 449 } 450 }, 451 "claims": [ 452 { 453 "claim": "DE-based black-box attacks (DeRAG) achieve competitive or higher success rates vs. white-box GGPP on dense retrievers using ≤5 tokens", 454 "evidence": "Table 2: DE_seq_stop achieves 0.739 Succ@20 vs GGPP 0.565 on SciFact; DE_seq_stop matches or exceeds GGPP on 3/4 datasets at Succ@10 and Succ@20", 455 "supported": "strong" 456 }, 457 { 458 "claim": "DE_seq_stop outperforms PRADA at Succ@1 on sparse retrievers while using comparable token budgets", 459 "evidence": "Table 1: DE_seq_stop Succ@1 ranges 0.25–0.71 vs PRADA 0.01–0.07 across all four datasets, though PRADA achieves higher Succ@10/20 on most datasets", 460 "supported": "moderate" 461 }, 462 { 463 "claim": "Early stopping reduces query cost by ~40% while maintaining attack success", 464 "evidence": "Section 4.3.2 and Figure 2: DE_seq_stop reaches 50% Succ@1 with 2 tokens vs DE_seq requiring 5 tokens; Table 2 shows iteration savings", 465 "supported": "moderate" 466 }, 467 { 468 "claim": "DE-generated suffixes evade BERT-based detection, with the detector performing below chance (AUROC 0.2023 vs. 0.5 for random guessing)", 469 "evidence": "Table 4: AUROC 0.2023, AUPRC 0.4665 at target FPR 0.5%; at 2% FPR threshold, recall is only 0.62%", 470 "supported": "strong" 471 }, 472 { 473 "claim": "Readability-aware MLM pooling strategy significantly reduces suffix perplexity without degrading attack success", 474 "evidence": "Tables 15–16: Welch's t-test confirms significant NLL reduction (p < 10^-9 across all datasets) for pool_size=5000 vs full pool; Table 6 shows stable Succ@1 across pool sizes", 475 "supported": "strong" 476 }, 477 { 478 "claim": "Adversarial chunk insertion causes 14–27% downstream QA quality degradation even at Top-10 occupancy", 479 "evidence": "Table 5: SQuAD average EM drops 26.7%; NQ-Open average EM drops 14.8%; Top-10-only success still causes 49.7% EM drop on SQuAD", 480 
"supported": "moderate" 481 }, 482 { 483 "claim": "Combining prefix and suffix attacks is complementary and improves overall attack success", 484 "evidence": "Table 3: On FiQA, 17/100 queries succeed with both prefix and suffix where neither alone succeeds, and 45/100 succeed with either strategy", 485 "supported": "moderate" 486 } 487 ], 488 "methodology_tags": [ 489 "benchmark-eval", 490 "empirical" 491 ], 492 "key_findings": "DeRAG demonstrates that gradient-free Differential Evolution optimization can generate adversarial token suffixes (≤5 tokens) that effectively manipulate RAG retrieval rankings without requiring access to model internals, matching or exceeding white-box GGPP attacks on dense retrievers. The approach evades BERT-based adversarial prompt detection, with the detector scoring below chance (AUROC 0.2023 vs. 0.5 for random guessing), suggesting token-level perturbations are difficult to distinguish from benign queries. Downstream QA quality degrades 14–27% on average when adversarial chunks enter the retrieval window, with the worst degradation occurring even when the target reaches only Top-10 (not rank 1). A readability-aware MLM pooling strategy significantly reduces suffix perplexity (Welch's t-test p < 10^-9) while maintaining attack effectiveness.", 493 "red_flags": [ 494 { 495 "flag": "Tiny evaluation corpus", 496 "detail": "Main experiments use only 1000-document subsets and 100 queries per dataset; real RAG deployments have millions of documents, and attack success may not transfer to large-scale corpora where adversarial suffixes compete against far more documents." 497 }, 498 { 499 "flag": "Single retriever architecture", 500 "detail": "All dense retriever experiments use BERT-base-uncased (2018); no modern dense retrievers (DPR, Contriever, E5, Arctic-Embed) are tested, despite being mentioned in the introduction as state-of-the-art." 
501 }, 502 { 503 "flag": "Downstream QA generator unspecified", 504 "detail": "Table 5 reports EM/F1/ROUGE-L/BERTScore for a RAG pipeline, but the LLM generator model, version, prompt template, and generation parameters are never stated, making these results irreproducible." 505 }, 506 { 507 "flag": "No variance on main metrics", 508 "detail": "Attack success rates in Tables 1–4 are point estimates over 100 queries with no confidence intervals or standard deviations; claimed differences of a few percentage points may not be statistically significant." 509 }, 510 { 511 "flag": "CtrlRAG omitted from comparison", 512 "detail": "CtrlRAG (Sui 2025), a directly comparable black-box MLM-based RAG attack from the same year, is cited in the introduction but not included as a baseline." 513 }, 514 { 515 "flag": "No limitations section", 516 "detail": "Despite significant scope restrictions (small corpus, one retriever, no defense analysis, workshop venue), the paper contains no limitations or threats-to-validity discussion." 
517 } 518 ], 519 "cited_papers": [ 520 { 521 "title": "Prompt Perturbation in Retrieval-Augmented Generation based Large Language Models (GGPP)", 522 "relevance": "Primary white-box baseline for dense retriever attack comparison throughout the paper" 523 }, 524 { 525 "title": "PRADA: Practical Black-box Adversarial Attacks against Neural Ranking Models", 526 "relevance": "Primary black-box baseline for sparse retriever attack comparison" 527 }, 528 { 529 "title": "BEIR: A Heterogeneous Benchmark for Zero-shot Evaluation of Information Retrieval Models", 530 "relevance": "Evaluation framework used to standardize results across four QA datasets" 531 }, 532 { 533 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 534 "relevance": "Foundational RAG paper establishing the pipeline architecture being attacked" 535 }, 536 { 537 "title": "CtrlRAG: Black-box Adversarial Attacks Based on Masked Language Models in Retrieval-Augmented Language Generation", 538 "relevance": "Contemporary competing black-box RAG attack method using MLM; cited but not compared" 539 }, 540 { 541 "title": "BadRAG: Identifying Vulnerabilities in Retrieval Augmented Generation of Large Language Models", 542 "relevance": "Related poisoning-based backdoor attack on RAG systems" 543 }, 544 { 545 "title": "One Pixel Attack for Fooling Deep Neural Networks", 546 "relevance": "Motivating prior work showing DE can craft adversarial examples in non-gradient settings" 547 }, 548 { 549 "title": "Black-box Adversarial Sample Generation Based on Differential Evolution", 550 "relevance": "Prior work applying DE to adversarial sample generation; methodological foundation" 551 } 552 ], 553 "engagement_factors": { 554 "practical_relevance": { 555 "score": 2, 556 "justification": "Security researchers and RAG system developers could use this to test retrieval robustness; code is released but requires domain expertise to apply." 
557 }, 558 "surprise_contrarian": { 559 "score": 1, 560 "justification": "RAG retrieval vulnerability to adversarial inputs is already known; the contribution is a specific black-box method rather than a surprising finding." 561 }, 562 "fear_safety": { 563 "score": 2, 564 "justification": "Demonstrates that RAG systems can be attacked without any access to model internals and that attacks evade detection, raising practical security concerns." 565 }, 566 "drama_conflict": { 567 "score": 0, 568 "justification": "No controversy, no challenges to specific companies or products, straightforward technical contribution." 569 }, 570 "demo_ability": { 571 "score": 2, 572 "justification": "Code and data are released on GitHub; a researcher could reproduce the attacks, but there is no pip-installable tool or live demo." 573 }, 574 "brand_recognition": { 575 "score": 0, 576 "justification": "Authors are from National ChengChi University with no industry affiliation or brand recognition in the AI community." 577 } 578 }, 579 "hn_data": { 580 "threads": [ 581 { 582 "hn_id": "44120359", 583 "title": "Diffusion vs. Autoregressive Language Models: A Text Embedding Perspective", 584 "points": 19, 585 "comments": 1, 586 "url": "https://news.ycombinator.com/item?id=44120359" 587 }, 588 { 589 "hn_id": "36931866", 590 "title": "Universal and Transferable Adversarial Attacks on LLM", 591 "points": 3, 592 "comments": 0, 593 "url": "https://news.ycombinator.com/item?id=36931866" 594 }, 595 { 596 "hn_id": "36903968", 597 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 598 "points": 1, 599 "comments": 0, 600 "url": "https://news.ycombinator.com/item?id=36903968" 601 } 602 ], 603 "top_points": 19, 604 "total_points": 23, 605 "total_comments": 1 606 } 607 }