scan.json (31007B)
1 { 2 "paper": { 3 "title": "Defense Against Indirect Prompt Injection via Tool Result Parsing", 4 "authors": [ 5 "Qiang Yu", 6 "Xinran Cheng", 7 "Chuanyi Liu" 8 ], 9 "year": 2026, 10 "venue": "arXiv.org", 11 "arxiv_id": "2601.04795", 12 "doi": "10.48550/arXiv.2601.04795" 13 }, 14 "scan_version": 3, 15 "active_modules": [ 16 "experimental_rigor", 17 "data_leakage" 18 ], 19 "methodology_tags": [ 20 "benchmark-eval" 21 ], 22 "key_findings": "The paper proposes ParseData and CheckTool, two prompt-based defense modules against indirect prompt injection in LLM agents. ParseData extracts only needed data from tool results using format/logic constraints, while CheckTool detects and removes content that triggers unauthorized tool calls. On the AgentDojo benchmark with three LLMs (gpt-oss-120b, llama-3.1-70b, qwen3-32b), the combined methods achieve average ASR of 0–0.34% and average Risk of 0–1.33%, roughly 1/10 that of the next-best defense (Tool Filter), at the cost of 28–45% reduction in benign utility.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "Abstract states 'Code is available at GitHub' with a link to https://github.com/qiang-yu/agentdojo/tree/tool-result-extract." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The benchmark used is AgentDojo (Debenedetti et al., 2024), which is a publicly available framework. No custom data was collected." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "Only temperature (0) and context length (64KB) are specified. No requirements.txt, Dockerfile, or detailed environment setup listing library versions is provided." 39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions are provided in the paper. Code is released but no README or reproduction guide is described."
44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All results in Tables 1–5 are point estimates with no confidence intervals, error bars, or uncertainty measures." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims its methods 'significantly outperform' baselines (abstract, Section 4.2) but provides no statistical significance tests — comparisons are made by inspecting raw numbers only." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Effect sizes are reported with baseline context: 'our methods yield a risk value of only 0.2%–1%, which is approximately 1/10 to 1/8 that of Tool Filter' (Section 4.2.1). Tables provide absolute values for all methods enabling direct comparison." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "AgentDojo contains 97 user tasks across 4 domains. No justification for why this sample size is adequate for the claims made, and no power analysis." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run numbers. Temperature is set to 0 but this is not discussed as justification for omitting variance." 71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Four baseline defenses are compared: DeBERTa Detector, Repeat User Prompt, Spotlighting with Delimiting, and Tool Filter (Section 4.1)." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "All baselines are from 2024: DeBERTa Detector (ProtectAI.com, 2024), AgentDojo defenses (Debenedetti et al., 2024), Spotlighting (Hines et al., 2024). These represent the current state of the art." 
83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Section 4.3 presents ablation studies showing individual contributions of ParseData and CheckTool (Table 3), and ParseData vs ParseFull (Table 4). Multiple combination orderings are tested." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "Four metrics are used: Benign Utility (BU), Utility under Attack (UA), Attack Success Rate (ASR), and Risk (ASR/UA). Defined in Section 4.1." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation is included. All evaluation is automated through the AgentDojo benchmark. Human evaluation could assess false positives where legitimate data is incorrectly filtered by ParseData." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": false, 102 "justification": "No distinction between development and test sets. The prompts in Appendices B–C were likely designed iteratively while observing AgentDojo results, but no held-out subset was used for final evaluation." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Table 5 provides per-attack-type breakdowns (Direct, Ignore Previous, Important Messages) and per-model breakdowns across all defense methods." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "The Limitations section discusses parameter hijacking as a specific failure mode that bypasses the defense. Section 4.3.1 discusses how CheckTool's reasoning leads to errors. Section 4.2.2 notes that qwen3-32b's deep thinking causes more mistakes with defenses." 
113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "Several negative results are reported: CheckTool decreases BU with stronger reasoning models (Section 4.3.1), ParseFull decreases BU by 32.43% for llama-3.1-70b (Section 4.3.2), and combining modules 'diminishes overall utility compared to their standalone performance' (Section 4.3.1)." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims 'competitive Utility under Attack (UA) while maintaining the lowest Attack Success Rate (ASR) to date.' Table 1 confirms the lowest ASR (0–0.34%). UA is moderate but comparable to DeBERTa Detector (the other low-ASR method), supporting 'competitive' UA relative to ASR-comparable defenses." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": false, 129 "justification": "The conclusion claims 'deeper reasoning in LLMs positively correlates with improved BU, UA and ASR for ParseData' and that 'increased reasoning depth tends to introduce more errors' for CheckTool. These causal claims about reasoning depth are based on comparing only 3 models that differ in many ways beyond reasoning depth, with no controlled manipulation." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title claims 'Defense Against Indirect Prompt Injection' broadly, and the abstract motivates with 'autonomous systems and robotics.' Results are only on AgentDojo's four domains (banking, slack, travel, workspace) with 3 models and 3 attack types. The Limitations section notes English-only but doesn't bound to AgentDojo specifically." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "No discussion of alternative explanations for the results. 
For example, the UA improvement under attack (Section 4.2.2) could have alternative explanations beyond the proposed mechanism, but only one interpretation is offered." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper measures ASR, UA, BU, and Risk on AgentDojo, which directly tests defense against indirect prompt injection — the claimed outcome. The metrics measure exactly what is claimed (task completion rate and attack success rate). The broader framing about robotics is motivation, not a measured claim." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Models are listed as 'gpt-oss-120b', 'Llama-3.1-70b', and 'qwen3-32b' (Section 4.1). No snapshot dates or API versions are provided. 'gpt-oss-120b' matches the name of OpenAI's open-weight gpt-oss-120b release, but no checkpoint revision is given for it or for the other two models." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full prompt text for ParseData (Appendix B) and CheckTool (Appendix C) modules is provided, including the anticipation prompt, data extraction prompt, and trigger removal prompt." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 4.1 states 'Temperature and context length are set to 0 and 64KB respectively.' These are the key LLM generation hyperparameters." 162 }, 163 "scaffolding_described": { 164 "applies": true, 165 "answer": true, 166 "justification": "Figure 1 provides architecture diagrams for both ParseData and CheckTool modules. Sections 3.3–3.4 describe the workflow in detail: anticipation step, data extraction with format/logic constraints, tool trigger detection, and sanitization. Prompt text is in appendices." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "The benchmark (AgentDojo) is used directly.
Section 4.1 describes the benchmark structure: 4 domains, 97 total tasks, attack injection mechanism. No custom preprocessing is applied to the data." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "A dedicated 'Limitations' section is present after the Conclusion, discussing parameter hijacking attacks and English-only evaluation." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "The Limitations section identifies a specific threat: parameter hijacking attacks (e.g., redirecting payments via injected email addresses) bypass the defense because no unauthorized tool is called. This is specific to this study's approach." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "The Limitations section explicitly states what is NOT covered: parameter hijacking attacks and non-English languages. It also notes the lack of benchmarks for parameter hijacking." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "Code is released but no raw experimental logs, model outputs, or per-task results are mentioned as available. Only aggregated results in tables are provided." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 4.1 describes AgentDojo's structure: 4 domains (banking, slack, travel, workspace), 16+21+20+40 tasks, tool-based interactions, injection mechanism, and verification process." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. The study uses the AgentDojo benchmark exclusively." 
206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The experimental pipeline is clear: user task → LLM reasoning → tool call → defense module (ParseData/CheckTool) → sanitized output → LLM response. Metric formulas are explicitly stated in Section 4.1." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information is disclosed anywhere in the paper, and there is no acknowledgments section mentioning grants or sponsors." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are clearly listed: all three authors are from Harbin Institute of Technology with institutional email addresses." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is not the same as confirming no conflicts." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests statement or financial disclosure is present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "The paper tests defense mechanisms against prompt injection, not model knowledge on benchmarks. AgentDojo evaluates agent behavior under attack, not whether models have memorized benchmark solutions." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "The paper tests defenses/tools rather than model knowledge. Contamination of AgentDojo tasks is not relevant to the defense evaluation." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "The paper tests defenses/tools rather than model knowledge.
The benchmark measures whether attacks succeed and tasks complete, not model recall capability." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "The method adds extra LLM calls per tool invocation (ParseData requires 2 additional LLM calls, CheckTool requires 1-2). No inference cost, latency, or token overhead is reported." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "No total computational budget is stated. The paper runs experiments across 3 models, 3 attacks, and 12 defense configurations but does not quantify total API costs or compute time." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No mention of multiple random seeds or seed sensitivity analysis. 
Temperature is set to 0 which reduces but does not eliminate stochasticity in API-based models." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "The number of experimental runs is not stated. Results appear to be from single runs." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search budget is reported. The prompt designs in Appendices B–C were presumably iterated but no search process is documented." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "Multiple configurations are tested (ParseData, ParseFull, CheckTool, and combinations in both orderings) but no justification for how the 'best' configuration was selected. No validation set is used for selection." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "No statistical tests are performed at all, so no correction for multiple comparisons. The paper makes many comparisons across 3 models × 4 attacks × 12 defenses without any statistical framework." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors implement their own defense modules and compare against baselines within the same framework. No acknowledgment of potential bias in implementing and evaluating their own system." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "The proposed method adds multiple extra LLM calls per tool invocation compared to baselines that add zero or one. This compute cost difference is never discussed in relation to performance gains." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "No discussion of whether AgentDojo accurately represents real-world indirect prompt injection threats. 
The benchmark's 4 domains and 97 tasks may not capture the full diversity of real-world agent deployments." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "The paper compares model performance (gpt-oss-120b vs llama-3.1-70b vs qwen3-32b) and attributes differences to 'reasoning depth' without controlling for other model differences. The defense modules interact with each model's capabilities differently but this confound is not addressed." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the models could have seen AgentDojo tasks or similar patterns during training. AgentDojo was published in 2024; models may have been trained on its contents." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of feature leakage. The evaluation setup provides models with full tool definitions and conversation context, but whether this mirrors real-world information availability is not discussed." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of independence between AgentDojo tasks. Tasks within domains may share structural patterns that inflate performance estimates." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No leakage detection or prevention method is used." 
368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "ParseData+CheckTool and CheckTool+ParseData achieve the lowest Attack Success Rate (ASR) to date, with average ASR below 1%.", 374 "evidence": "Table 1 shows average ASR of 0.19%/0.11% (gpt-oss-120b), 0.34%/0.24% (llama-3.1-70b), and 0.11%/0.00% (qwen3-32b) for the two combined methods, compared to next-best Tool Filter at 1.71%/2.32%/2.58%.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "The proposed methods achieve a Risk metric of 0.2%–1%, approximately 1/10 to 1/8 that of Tool Filter.", 379 "evidence": "Figure 3 shows Avg Risk values of 0.22–1.33% for the proposed methods vs 2.93–6.28% for Tool Filter across three models.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Deeper LLM reasoning positively correlates with improved ParseData performance.", 384 "evidence": "Section 4.3.1 compares three models: on qwen3-32b, ParseData achieves BU 51.22% higher and ASR 19.26% lower than CheckTool, attributed to stronger reasoning. However, this is based on only 3 models that differ in many dimensions beyond reasoning.", 385 "supported": "weak" 386 }, 387 { 388 "claim": "CheckTool performance degrades with deeper reasoning as the LLM mistakenly identifies normal data as tool triggers.", 389 "evidence": "Section 4.3.1 notes that for qwen3-32b, CheckTool's BU is 42.27% (lowest), attributed to the model's deep thinking introducing errors. Only 3 models compared.", 390 "supported": "weak" 391 }, 392 { 393 "claim": "The proposed defense maintains competitive Utility under Attack (UA) compared to existing methods.", 394 "evidence": "Table 1: for gpt-oss-120b, ParseData+CheckTool achieves 51.84% Avg UA vs 61.46% No Defense. For llama-3.1-70b, 26.64% vs 40.35%.
UA is comparable to DeBERTa Detector (the other low-ASR method) but substantially lower than no-defense and some prompt-based defenses.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Under severe attack (Important Messages), the proposed methods improve utility compared to no-defense conditions, unlike other defense mechanisms which show utility decline.", 399 "evidence": "Table 2: for qwen3-32b, ParseData+CheckTool achieves 47.42% UA under Important Messages attack vs 45.36% BU under no attack — an increase. Section 4.2.2 explains this anomaly: parsing filters attacks that otherwise cause task failures.", 400 "supported": "moderate" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "No uncertainty quantification", 406 "detail": "All results are single-run point estimates with no error bars, confidence intervals, or variance measures across any of the 108+ experimental configurations (3 models × 4 attack conditions × 12 defenses). Claims of 'significantly outperforming' are made without statistical tests." 407 }, 408 { 409 "flag": "Model versions not pinned", 410 "detail": "'gpt-oss-120b' matches the name of OpenAI's open-weight gpt-oss-120b release, but the paper provides no checkpoint revisions, snapshot dates, or API versions for any of the three models. This limits exact reproducibility since readers cannot confirm which model builds were evaluated." 411 }, 412 { 413 "flag": "Significant utility cost downplayed", 414 "detail": "BU drops of 28% (gpt-oss-120b) to 45% (llama-3.1-70b, qwen3-32b) compared to no defense are noted in passing but not prominently discussed as a major limitation. A defense that halves task completion rate has serious practical implications." 415 }, 416 { 417 "flag": "Causal claims from 3 data points", 418 "detail": "The conclusion makes causal claims about 'reasoning depth' affecting ParseData and CheckTool performance based on comparing only 3 models that differ in many dimensions (architecture, training data, size, instruction tuning approach)."
419 }, 420 { 421 "flag": "No cost analysis despite extra LLM calls", 422 "detail": "ParseData adds 2 LLM calls per tool invocation (anticipation + extraction), and CheckTool adds 1-2 more (check + optional sanitization). This potentially triples or quadruples inference cost, but no cost analysis is provided." 423 }, 424 { 425 "flag": "Potential benchmark overfitting", 426 "detail": "The prompts were presumably iteratively designed while testing on AgentDojo, but no held-out set or cross-validation is used. Results may not generalize beyond AgentDojo's 97 tasks in 4 domains." 427 } 428 ], 429 "cited_papers": [ 430 { 431 "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", 432 "authors": [ 433 "Edoardo Debenedetti", 434 "Jie Zhang", 435 "Mislav Balunovic", 436 "Luca Beurer-Kellner", 437 "Marc Fischer", 438 "Florian Tramèr" 439 ], 440 "year": 2024, 441 "relevance": "Primary benchmark used for evaluation; provides standardized framework for testing indirect prompt injection attacks and defenses on LLM agents." 442 }, 443 { 444 "title": "StruQ: Defending Against Prompt Injection with Structured Queries", 445 "authors": [ 446 "Sizhe Chen", 447 "Julien Piet", 448 "Chawin Sitawarin", 449 "David Wagner" 450 ], 451 "year": 2025, 452 "relevance": "Training-based defense approach that fine-tunes LLMs to separate instructions from data using structured queries." 453 }, 454 { 455 "title": "MELON: Provable Defense Against Indirect Prompt Injection Attacks in AI Agents", 456 "authors": [ 457 "Kaijie Zhu", 458 "Xianjun Yang", 459 "Jindong Wang", 460 "Wenbo Guo", 461 "William Yang Wang" 462 ], 463 "year": 2025, 464 "relevance": "Detection-based defense that identifies prompt injections when suspicious tool calls are about to be executed." 
465 }, 466 { 467 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 468 "authors": [ 469 "Kai Greshake", 470 "Sahar Abdelnabi", 471 "Shailesh Mishra", 472 "Christoph Endres", 473 "Thorsten Holz", 474 "Mario Fritz" 475 ], 476 "year": 2023, 477 "relevance": "Foundational work demonstrating indirect prompt injection attacks on LLM-integrated applications including privacy leakage and malware distribution." 478 }, 479 { 480 "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting", 481 "authors": [ 482 "Keegan Hines", 483 "Gary Lopez", 484 "Matthew Hall", 485 "Federico Zarfati", 486 "Yonatan Zunger", 487 "Emre Kiciman" 488 ], 489 "year": 2024, 490 "relevance": "Baseline defense method using delimiters to separate data from instructions, evaluated as a comparison in this work." 491 }, 492 { 493 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 494 "authors": [ 495 "Eric Wallace", 496 "Kai Xiao", 497 "Reimar Leike", 498 "Lilian Weng", 499 "Johannes Heidecke", 500 "Alex Beutel" 501 ], 502 "year": 2024, 503 "arxiv_id": "2404.13208", 504 "relevance": "Proposes training LLMs to prioritize user instructions over injected content, relevant to the instruction-data separation problem." 505 }, 506 { 507 "title": "The Task Shield: Enforcing Task Alignment to Defend Against Indirect Prompt Injection in LLM Agents", 508 "authors": [ 509 "Feiran Jia", 510 "Tong Wu", 511 "Xin Qin", 512 "Anna Cinzia Squicciarini" 513 ], 514 "year": 2025, 515 "relevance": "Defense approach that evaluates correlation between user instructions and assistant messages to detect workflow compromises." 
516 }, 517 { 518 "title": "Can Indirect Prompt Injection Attacks Be Detected and Removed?", 519 "authors": [ 520 "Yulin Chen", 521 "Haoran Li", 522 "Yuan Sui", 523 "Yufei He", 524 "Yue Liu", 525 "Yangqiu Song", 526 "Bryan Hooi" 527 ], 528 "year": 2025, 529 "relevance": "Trains auxiliary lightweight models to detect and remove indirect prompt injections from external prompts." 530 }, 531 { 532 "title": "Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on LLM Agents", 533 "authors": [ 534 "Qiusi Zhan", 535 "Richard Fang", 536 "Henil Shalin Panchal", 537 "Daniel Kang" 538 ], 539 "year": 2025, 540 "relevance": "Demonstrates that adaptive attacks can circumvent existing IPI defenses, relevant to evaluating defense robustness." 541 }, 542 { 543 "title": "IsolateGPT: An Execution Isolation Architecture for LLM-Based Agentic Systems", 544 "authors": [ 545 "Yuhao Wu", 546 "Franziska Roesner", 547 "Tadayoshi Kohno", 548 "Ning Zhang", 549 "Umar Iqbal" 550 ], 551 "year": 2025, 552 "relevance": "Proposes execution isolation as a defense mechanism for LLM agents, representing the privilege-control approach to IPI defense." 553 }, 554 { 555 "title": "Ignore Previous Prompt: Attack Techniques For Language Models", 556 "authors": [ 557 "Fábio Perez", 558 "Ian Ribeiro" 559 ], 560 "year": 2022, 561 "relevance": "Foundational work on prompt injection attack techniques, including the 'ignore previous instructions' attack pattern used in this paper's evaluation." 562 }, 563 { 564 "title": "A Survey on Trustworthy LLM Agents: Threats and Countermeasures", 565 "authors": [ 566 "Miao Yu", 567 "Fanci Meng", 568 "Xinyun Zhou" 569 ], 570 "year": 2025, 571 "relevance": "Comprehensive survey of threats and countermeasures for LLM agents, providing broader context for the prompt injection defense landscape." 
572 } 573 ], 574 "engagement_factors": { 575 "practical_relevance": { 576 "score": 2, 577 "justification": "Proposes deployable prompt-based defense modules (ParseData/CheckTool) that practitioners building LLM agents could integrate, though the 28–45% utility cost limits immediate adoption." 578 }, 579 "surprise_contrarian": { 580 "score": 1, 581 "justification": "The approach of parsing/filtering tool results rather than detecting injections is a modest reframing, but the core finding that prompt-based defenses can achieve <1% ASR is incrementally better rather than surprising." 582 }, 583 "fear_safety": { 584 "score": 2, 585 "justification": "Directly addresses indirect prompt injection in LLM agents with concrete attack demonstrations and defense benchmarks, a real and growing security concern as agents gain tool-use capabilities." 586 }, 587 "drama_conflict": { 588 "score": 0, 589 "justification": "No controversy or conflict — straightforwardly proposes a defense and compares against baselines without challenging any company or widely-held belief." 590 }, 591 "demo_ability": { 592 "score": 1, 593 "justification": "Code is available on GitHub and uses the AgentDojo benchmark, but requires setting up multiple LLM APIs and the benchmark framework to reproduce." 594 }, 595 "brand_recognition": { 596 "score": 0, 597 "justification": "From Harbin Institute of Technology with no famous-lab cachet; gpt-oss-120b (OpenAI's open-weight model) appears only as one of three evaluation backbones, and the topic lacks association with a household-name product." 598 } 599 } 600 }