scan.json (31764B)
1 { 2 "paper": { 3 "title": "RedVisor: Reasoning-Aware Prompt Injection Defense via Zero-Copy KV Cache Reuse", 4 "authors": [ 5 "Mingrui Liu", 6 "Sixiao Zhang", 7 "Cheng Long", 8 "Kwok-Yan Lam" 9 ], 10 "year": 2026, 11 "venue": "arXiv", 12 "arxiv_id": "2602.01795" 13 }, 14 "scan_version": 3, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "RedVisor proposes a two-phase defense against prompt injection: a lightweight adapter generates reasoning about potential injections (Phase 1), then the frozen backbone generates a safe response conditioned on that reasoning (Phase 2). The architecture enables zero-copy KV cache reuse, halving latency compared to decoupled detection pipelines. Across Llama-3-8B, Mistral-7B, and Qwen-2.5-7B, RedVisor achieves near-zero attack success rates on five injection types while preserving backbone utility (WinRate within 1.5pp of raw backbone) and maintaining robustness against adaptive GCG attacks.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "An anonymous repository URL is provided: https://anonymous.4open.science/r/RedVisor-82E2 (footnote, Section 1)." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "All evaluation benchmarks are publicly available: Alpaca-Farm, AgentDojo, NQ-simplified, and Alpaca-Cleaned for training. The augmentation pipeline to construct the attack dataset is described in Algorithm 2 and the code repository is provided." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "The paper mentions 4×NVIDIA A100-40GB GPUs and names specific model versions, but provides no requirements.txt, Dockerfile, or library version specifications (e.g., PyTorch, vLLM, NLTK versions)." 
34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper provides algorithmic pseudocode (Algorithms 1-2) and hyperparameters, but no step-by-step reproduction instructions (e.g., commands to run, README with setup steps). The anonymous repository may contain these, but the paper itself does not." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "All results in Tables 2-10 and Figures 3-4 are reported as point estimates with no confidence intervals, error bars, or ± notation." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims RedVisor 'outperforms state-of-the-art defenses' (Abstract, Section 7) but provides no statistical significance tests. All comparative claims are based solely on comparing raw numbers." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "Tables provide baseline context alongside method results (e.g., None ASR 0.58 vs RedVisor 0.00 for Llama-3 Naive in Table 3). Section 7.6 reports specific utility numbers (85.78% vs 87.27% WinRate). Section 7.7 states '>2× faster.' Raw numbers with baselines allow magnitude assessment." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "Sample sizes are stated (208 Alpaca-Farm samples, 949 AgentDojo samples, 1000 NQ queries) but no justification is given for why these sizes are adequate for the claims made." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No standard deviations, variance across seeds, or spread measures are reported for any experiment. All results appear to be single-run numbers." 
66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "Six baselines are compared: PromptArmor, PromptLocate, DataFilter (detection-based) and Sandwich Defense, StruQ, SecAlign (prevention-based). See Section 7.2 and Tables 2-8." 73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "All baselines are from 2023-2025: PromptLocate (2025), DataFilter (2025), SecAlign (2025), StruQ (2025), PromptArmor (2025), Sandwich Defense (2023). These represent the current state of the art." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": true, 82 "justification": "Section 7.8 and Table 9 present ablation studies removing the Gate (RV_w/o Gate) and FFN (RV_w/o FFN) components, plus a None baseline (reasoning only). Results show FFN is critical for complex attacks." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": true, 87 "justification": "Multiple metrics are used: ROUGE-L F1 and Embedding Similarity for detection, Attack Success Rate for prevention, WinRate (AlpacaEval 2.0) for utility, and total latency/throughput for efficiency (Section 7.1.2)." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "No human evaluation is performed. Utility is measured via AlpacaEval 2.0 with Qwen3-Max as an LLM judge (Section 7.6). Detection and prevention metrics are fully automated." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": true, 97 "justification": "Section E.2 explicitly states: 'Note that while we test on Alpaca-Farm, the training is performed strictly on the standard Alpaca train set to ensure zero data leakage.' AgentDojo and NQ-simplified are also separate evaluation benchmarks." 
98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Results are broken down by attack type (Naive, Ignore, Escape, Completion, Multi-round), by benchmark (Alpaca-Farm, AgentDojo, NQ-simplified), and by backbone model (Llama-3, Mistral, Qwen). See Tables 2-8." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "While aggregate results show degraded performance on complex attacks (Completion and Multi-round ASR 2-7%), no qualitative failure case analysis is provided. Section G.4 discusses adaptive attack degradation in aggregate terms but shows no specific failure examples." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": false, 112 "justification": "Every experiment and configuration shows RedVisor outperforming or matching baselines. The ablation shows expected component contributions (all help). No approaches that failed or configurations that were tried and abandoned are reported." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims superior detection accuracy (supported by Tables 2, 5-6), lower ASR (Tables 3, 7-8), negligible utility loss (Figure 3), and better throughput (Figure 4). All claims are substantiated by experimental results." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "Causal claims from the ablation study (Section 7.8) use controlled single-variable manipulation: removing Gate or FFN while keeping everything else constant. Table 1's preliminary study also controls for the reasoning variable. These designs are adequate for the causal claims made." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The paper claims a 'universal framework' (Section 1) but tests only on 7-8B parameter models. 
No experiments are reported with larger models (13B, 70B); the three tested model families (Llama-3, Mistral, Qwen-2.5) all fall in the same 7-8B size class. The title implies broad applicability without bounding to the tested scale." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations for the results are discussed. For example, the paper does not consider whether the improvement comes primarily from the additional compute of the reasoning phase rather than the adapter architecture, or whether the synthetic attack patterns are too easy to detect." 135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "The measurements closely match the claims: ASR directly measures attack success, ROUGE-L measures localization accuracy, WinRate measures utility, and latency measures efficiency. The paper's claims stay at the granularity of these measurements without inflating to broader security guarantees." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": true, 146 "justification": "Main backbone models are precisely specified: Llama-3-8B-Instruct, Mistral-7B-v0.3-Instruct, Qwen-2.5-7B-Instruct (Section 7.3). The judge (Qwen3-Max) and AgentDojo backbone (GPT-4o) lack snapshot dates but are peripheral to the main evaluation." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": true, 151 "justification": "Full system instructions for Phase 1 are provided in Appendix A.2. The user input format with XML tags is shown in Appendix A.1. Reasoning templates for all five attack categories are provided verbatim in Appendix C.2." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": true, 156 "justification": "Appendix E.2 reports: AdamW optimizer, learning rate 1e-4, batch size 128, early stopping, validation every 100 steps, adapter architecture details (hidden dims, head counts, GateNet dimensions d=512, inner d=64). 
Training runs 10 epochs on 4×A100-40GB." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "RedVisor is a two-phase inference mechanism with an adapter, not an agentic scaffold. For AgentDojo, GPT-4o is used as a third-party backbone agent (black box), and RedVisor acts as a sidecar detector." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 4.2.1 describes NLTK sentence segmentation with indexed labels. Section 6.2 details the spaCy intent extraction pipeline. Algorithm 2 formalizes the full dataset construction. Appendix D.1 describes NQ-simplified retrieval via LangChain with BAAI/bge-small-en embeddings." 167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": true, 173 "justification": "The Impact Statement includes a 'Limitations and Over-Reliance' subsection stating 'no defense is impenetrable' and that 'RedVisor should be deployed as part of a defense-in-depth strategy, not as a standalone guarantee of safety.'" 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "The limitations are generic: 'no defense is impenetrable' and warning against over-reliance. No specific threats such as the synthetic nature of attacks, limited model scale, or the gap between template-based attacks and real-world adversaries are discussed." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not explicitly state what was not tested: no mention of larger models being out of scope, no acknowledgment that only English-language attacks were tested, no discussion of real-world (non-synthetic) injection scenarios being excluded." 
184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "Only aggregate metrics are reported in tables and figures. No per-example predictions, raw model outputs, or individual detection results are made available for independent verification." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section 6 and Algorithm 2 describe the full dataset construction pipeline. Appendix D.1 details how each benchmark was configured, including the NQ-simplified retrieval setup (1000 queries, top-10 documents). Attack payloads use random UUIDs (Appendix E.1)." 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. All data comes from standard public benchmarks (Alpaca-Farm, AgentDojo, NQ-simplified)." 201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": true, 205 "justification": "Algorithm 2 formalizes the complete pipeline from clean data through injection generation to reasoning synthesis. Section 6.2 details the intent extraction and template application stages. Appendix C provides all templates and construction details." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "All authors are listed as affiliated with Nanyang Technological University, Singapore. They are not evaluating their own commercial product." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "No funding is disclosed, so the independence of funders from outcomes cannot be assessed." 
223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial interest declaration is present in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "This paper tests a defense mechanism against prompt injection attacks, not a pre-trained model's capability on benchmarks. The backbone models are frozen and used as-is; the evaluation measures defense effectiveness, not model knowledge." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Same as above — the paper evaluates a defense tool rather than model knowledge. Contamination of model training data is not relevant to the defense evaluation." 240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "Same as above — the evaluation measures whether the adapter can detect injections and guide safe responses, not whether the backbone has memorized benchmark answers." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study. All experiments use automated benchmarks and LLM-based evaluation." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 
277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": true, 288 "justification": "Section 7.7 and Figure 4 report total latency for processing 1,000 RAG queries across all methods. Section 5.3 provides theoretical complexity analysis showing RedVisor halves TTFT. The paper demonstrates >2× speedup over decoupled pipelines." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "The paper states training was on 4×NVIDIA A100-40GB GPUs for 10 epochs, but does not report total GPU hours, wall-clock training time, or total compute cost." 294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single training runs." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The number of experimental runs is never stated. Results are presented without indicating whether they are from single or multiple runs." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "Fixed hyperparameters are reported (lr=1e-4, batch=128) but no search budget, search method, or number of configurations tried is mentioned." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "The paper uses early stopping on validation loss but does not explain how the reported hyperparameters (learning rate, batch size, adapter dimensions) were selected." 
316 }, 317 "multiple_comparison_correction": { 318 "applies": false, 319 "answer": false, 320 "justification": "No statistical significance tests are performed, so the question of correcting for multiple comparisons does not arise." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors re-implement DataFilter and SecAlign baselines ('As the official code is unavailable, we reproduce the method' — Appendix D.3) without acknowledging the systematic bias of author-implemented baselines documented by Lucic et al. (2018)." 326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "While latency is compared in Figure 4, performance is not reported as a function of compute budget. No compute-normalized comparisons are made, and the additional compute cost of the reasoning phase is not controlled for." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "The paper does not discuss whether synthetic template-based attacks on Alpaca-Farm/NQ-simplified adequately represent real-world prompt injection threats, or whether ASR on UUID payloads measures real-world defense effectiveness." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": true, 339 "answer": true, 340 "justification": "For Alpaca-Farm and NQ-simplified, all methods use the same frozen backbone models. For AgentDojo, all methods use GPT-4o as the backbone agent (Appendix D.1). The backbone is controlled across all comparisons." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "The backbone models (Llama-3, Mistral, Qwen) were likely trained on data including Alpaca-family datasets, but this temporal overlap is not discussed. No training cutoff dates for the backbone models are stated." 
348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the evaluation setup inadvertently leaks information about whether an attack is present (e.g., through formatting cues introduced by the injection synthesis pipeline)." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": true, 357 "justification": "Appendix E.2 explicitly addresses train/test separation: 'the training is performed strictly on the standard Alpaca train set to ensure zero data leakage.' Attack payloads use random UUIDs unseen during training (Appendix E.1)." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No formal leakage detection methods (canary strings, membership inference, n-gram overlap analysis) are applied. The use of random UUID payloads prevents payload memorization but is not a systematic leakage detection method." 363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "LLMs achieve 0% ASR when provided with ground-truth reasoning about injection attempts", 369 "evidence": "Table 1 shows Llama-3-8B, Mistral-7B, and Qwen-2.5-7B all achieve 0% ASR across five attack categories on Alpaca-Farm when ground-truth reasoning is appended (Section 1).", 370 "supported": "strong" 371 }, 372 { 373 "claim": "RedVisor achieves near-zero ASR across diverse attack types and backbones", 374 "evidence": "Table 3 shows 0% ASR for Naive, Ignore, and Escape attacks across all three backbones, with 2-7% for complex attacks (Completion, Multi-round). 
ASR also remains at 0% against GCG adaptive attacks targeting the backbone.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "RedVisor preserves backbone utility with negligible degradation", 379 "evidence": "Figure 3 shows AlpacaEval 2.0 WinRate: Llama-3 85.78% (vs 87.27% raw), Mistral maintained similarly, while StruQ drops to 56.75% on Qwen (Section 7.6).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "RedVisor processes queries >2× faster than decoupled pipelines", 384 "evidence": "Figure 4 shows total latency for 1,000 NQ queries. RedVisor (unified) is approximately half the latency of RedVisor_sep (decoupled variant) across all backbones (Section 7.7).", 385 "supported": "strong" 386 }, 387 { 388 "claim": "RedVisor outperforms state-of-the-art defenses in detection accuracy", 389 "evidence": "Table 2 shows RedVisor achieves ROUGE-L ≥0.97 on atomic attacks and >0.78 on complex attacks, exceeding PromptLocate (0.69-0.97) and PromptArmor (0.12-0.31). Similar patterns appear in Appendix Tables 5-6.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "RedVisor is robust against adaptive GCG attacks targeting the classifier", 394 "evidence": "Table 4/10 shows GCG_classifier reduces detection ROUGE-L (0.94→0.82 for Llama-3) but ASR remains low (0.03→0.05). The paper argues adversarial perturbations that evade detection also disrupt the injection's utility (Section G.4).", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "RedVisor is the first approach to leverage fine-grained reasoning paths to simultaneously detect attacks and guide safe responses", 399 "evidence": "Stated in the Abstract as a novelty claim. 
The related work section (Section 2) distinguishes RedVisor from prior detection-only and prevention-only methods, but the 'first' claim cannot be verified from the paper alone.", 400 "supported": "weak" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "No variance or uncertainty reporting", 406 "detail": "All results across Tables 2-10 and Figures 3-4 are point estimates with no error bars, standard deviations, confidence intervals, or indication of multiple runs. It is impossible to assess result stability or determine if reported differences are meaningful." 407 }, 408 { 409 "flag": "Synthetic-only attack evaluation", 410 "detail": "All attacks are synthetically generated using five template-based categories (Section 6.1). No testing against real-world, human-crafted prompt injections or adversarial red-teaming beyond GCG suffix optimization. The payloads are benign (print UUID), not actual malicious commands." 411 }, 412 { 413 "flag": "Limited model scale tested", 414 "detail": "Despite claiming a 'universal framework,' only 7-8B parameter models are tested (Llama-3-8B, Mistral-7B, Qwen-2.5-7B). No experiments with larger models (13B, 70B+), leaving scalability claims unsupported." 415 }, 416 { 417 "flag": "No statistical significance tests", 418 "detail": "Comparative claims ('outperforms state-of-the-art') are made without any statistical tests. Given the absence of variance reporting, it is unclear whether the observed differences are statistically meaningful." 419 }, 420 { 421 "flag": "Re-implemented baselines without bias acknowledgment", 422 "detail": "DataFilter and SecAlign baselines are re-implemented by the authors due to unavailable code (Appendix D.3). The systematic bias of author-implemented baselines (Lucic et al. 2018) is not acknowledged." 
423 } 424 ], 425 "cited_papers": [ 426 { 427 "title": "StruQ: Defending against prompt injection with structured queries", 428 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 429 "year": 2025, 430 "relevance": "A key baseline for prevention-based prompt injection defense via instruction tuning on injection-augmented datasets." 431 }, 432 { 433 "title": "SecAlign: Defending against prompt injection with preference optimization", 434 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"], 435 "year": 2025, 436 "relevance": "DPO-based alignment defense against prompt injection, key prevention baseline showing utility-safety tradeoff." 437 }, 438 { 439 "title": "PromptLocate: Localizing prompt injection attacks", 440 "authors": ["Yuting Jia", "Yi Liu", "Zhibo Shao", "Jinyuan Jia", "Neil Zhenqiang Gong"], 441 "year": 2025, 442 "arxiv_id": "2510.12252", 443 "relevance": "Detection-based defense using iterative binary search for fine-grained injection localization; primary detection baseline." 444 }, 445 { 446 "title": "DataSentinel: A game-theoretic detection of prompt injection attacks", 447 "authors": ["Yi Liu", "Yuting Jia", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"], 448 "year": 2025, 449 "relevance": "Game-theoretic approach to detecting prompt injection via missing canary signals; used as classifier in PromptLocate." 450 }, 451 { 452 "title": "PromptArmor: Simple yet effective prompt injection defenses", 453 "authors": ["Tong Shi", "Kairan Zhu", "Zijie Wang", "Yuting Jia", "Wei Cai", "Wenbo Liang", "Han Wang", "Hamza Alzahrani", "Jin Lu", "Kenji Kawaguchi"], 454 "year": 2025, 455 "arxiv_id": "2507.15219", 456 "relevance": "Detection framework using auxiliary LLMs for prompt injection identification; detection baseline." 
457 }, 458 { 459 "title": "Defending against prompt injection with DataFilter", 460 "authors": ["Yupei Wang", "Sizhe Chen", "Reem Alkhudair", "Basel Alomair", "David Wagner"], 461 "year": 2025, 462 "arxiv_id": "2510.19207", 463 "relevance": "Generative defense that rewrites/sanitizes context to remove injections; detection baseline showing hallucination issues." 464 }, 465 { 466 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 467 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], 468 "year": 2023, 469 "relevance": "Foundational work demonstrating indirect prompt injection attacks against real-world LLM applications." 470 }, 471 { 472 "title": "Ignore previous prompt: Attack techniques for language models", 473 "authors": ["Fábio Perez", "Ian Ribeiro"], 474 "year": 2022, 475 "arxiv_id": "2211.09527", 476 "relevance": "Early characterization of prompt injection attack techniques against language models." 477 }, 478 { 479 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 480 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"], 481 "year": 2024, 482 "relevance": "Benchmark for evaluating prompt injection attacks and defenses in autonomous agent workflows; used as evaluation benchmark." 483 }, 484 { 485 "title": "Universal and transferable adversarial attacks on aligned language models", 486 "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"], 487 "year": 2023, 488 "arxiv_id": "2307.15043", 489 "relevance": "Introduced GCG attack method for optimizing adversarial suffixes; used as adaptive attack in RedVisor evaluation." 
490 }, 491 { 492 "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks", 493 "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"], 494 "year": 2024, 495 "relevance": "Learned execution triggers for prompt injection attacks, representing advanced adversarial techniques." 496 }, 497 { 498 "title": "Can LLMs separate instructions from data? And what do we even mean by that?", 499 "authors": ["Egor Zverev", "Sahar Abdelnabi", "Soroush Tabesh", "Mario Fritz", "Christoph H. Lampert"], 500 "year": 2024, 501 "arxiv_id": "2403.06833", 502 "relevance": "Investigates the fundamental instruction-data separation problem in LLMs relevant to prompt injection." 503 }, 504 { 505 "title": "Tensor Trust: Interpretable prompt injection attacks from an online game", 506 "authors": ["Sam Toyer", "Olivia Watkins", "Ethan Adrian Mendes", "Justin Svegliato", "Luke Bailey", "Tiffany Wang", "Isaac Ong", "Karim Elmaaroufi", "Pieter Abbeel", "Trevor Darrell"], 507 "year": 2023, 508 "arxiv_id": "2311.01011", 509 "relevance": "Gamified dataset of interpretable prompt injection attacks from human adversaries." 510 }, 511 { 512 "title": "Automatic and universal prompt injection attacks against large language models", 513 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], 514 "year": 2024, 515 "arxiv_id": "2403.04957", 516 "relevance": "Automated methods for generating universal prompt injection attacks against LLMs." 517 } 518 ], 519 "engagement_factors": { 520 "practical_relevance": { 521 "score": 2, 522 "justification": "Practitioners could deploy the adapter on vLLM-served models, but requires training on domain-specific data and adapter integration." 523 }, 524 "surprise_contrarian": { 525 "score": 1, 526 "justification": "The core insight (models can reject attacks if guided by reasoning) is intuitive rather than surprising; the contribution is engineering, not a paradigm shift." 
527 }, 528 "fear_safety": { 529 "score": 2, 530 "justification": "Addresses the real and growing threat of prompt injection in RAG and agent systems, with demonstrated vulnerabilities in undefended models." 531 }, 532 "drama_conflict": { 533 "score": 0, 534 "justification": "No controversy or conflict; straightforward defense contribution." 535 }, 536 "demo_ability": { 537 "score": 1, 538 "justification": "Anonymous code repository is provided but requires GPU training to reproduce; no live demo or pip-installable tool." 539 }, 540 "brand_recognition": { 541 "score": 0, 542 "justification": "Authors are from Nanyang Technological University; not a well-known AI lab and no famous product involvement." 543 } 544 } 545 }