scan-v5.json (23491B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Defense Against Indirect Prompt Injection via Tool Result Parsing", 6 "authors": [ 7 "Qiang Yu", 8 "Xinran Cheng", 9 "Chuanyi Liu" 10 ], 11 "year": 2026, 12 "venue": "arXiv.org", 13 "arxiv_id": "2601.04795", 14 "doi": "10.48550/arXiv.2601.04795" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "Abstract claims of lowest ASR (<1%) and competitive UA are supported by Tables 1–3 showing ParseData+CheckTool achieving 0.00–0.35% Avg Risk vs 2.93–28.96% for all baselines across three models.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Causal claims about module contributions are tested via a dedicated ablation (Section 4.3) that isolates ParseData and CheckTool individually and in both combination orders on a controlled benchmark.", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper is scoped to AgentDojo benchmark and English; the Limitations section explicitly acknowledges parameter hijacking attacks and non-English settings are outside the evaluated scope.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "The paper does not consider whether the ASR reduction could partly result from reduced agent task completion (lower UA) rather than genuine defense, nor whether AgentDojo attack patterns are atypically easy to parse.", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "ASR directly measures unauthorized tool execution via AgentDojo's verification logic, and UA directly measures task completion under attack; the Risk metric (ASR/UA) explicitly quantifies the tradeoff with no proxy substitution.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "A dedicated Limitations section appears after the Conclusion, discussing parameter hijacking attacks and English-only evaluation.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "Specific threats are named: parameter hijacking (with concrete email-redirect example that bypasses the defense entirely), and lack of non-English evaluation.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "The paper explicitly states coverage of action hijacking only, not parameter hijacking, and English-language settings only; these are explicit scope statements rather than generic disclaimers.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding source is mentioned anywhere in the paper.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "All three authors list Harbin Institute of Technology affiliation in the paper header with institutional email addresses.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funding is disclosed; criterion is not applicable.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests or financial interests statement appears in the paper.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "LLM Agent is formally defined in Section 3.1 with mathematical notation; IPI attack is defined; all four evaluation metrics (BU, UA, ASR, Risk) are precisely defined with formulas in Table 5.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The contribution—ParseData and CheckTool prompt-based modules for IPI defense without model training—is explicitly stated in both the abstract and introduction.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 2 categorizes prior defenses into model-based and prompt-based, explains limitations of each category, and explicitly positions this work's advantages over both paradigms.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": true, 122 "justification": "A GitHub URL is provided in Abstract footnote 1, though the Ethical Considerations section says 'The source code will be made publicly available,' creating ambiguity; the explicit URL is given and treated as released.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "AgentDojo is a publicly available benchmark; no custom dataset was created.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "No requirements.txt, Dockerfile, or dependency list is provided; only temperature (0) and context length (64KB) are mentioned in Section 4.1.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "Appendices B and C provide verbatim prompts but no end-to-end experimental setup instructions; a reader could not reproduce results without guessing framework integration details.", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "All results are single-run percentages with no confidence intervals or error bars reported anywhere in the paper.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "No statistical significance tests are applied to any comparative claims across defense methods or models.", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Effect sizes are reported as proportional comparisons with baseline context (e.g., '0.2%–1%, approximately 1/10 to 1/8 that of Tool Filter') in Figure 3 and Section 4.2.1.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "AgentDojo's 97 user tasks are used without power analysis or justification of whether this is sufficient to detect reliable differences between defense methods.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "No variance, standard deviation, or multiple-run results are reported; experiments appear to be single deterministic runs (temperature=0).", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Four baselines are included: DeBERTa Detector, Repeat User Prompt, Spotlighting with Delimiting, and Tool Filter.", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "Baselines include 2024–2025 published work (DeBERTa Detector, Spotlighting with Delimiting from Hines 2024, Tool Filter from AgentDojo 2024); these represent current state of the art.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 4.3 presents ablation examining ParseData and CheckTool individually and in both combination orders (ParseData+CheckTool vs CheckTool+ParseData) across all three models.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Four metrics are used: Benign Utility (BU), Utility under Attack (UA), Attack Success Rate (ASR), and Risk (ASR/UA).", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": false, 203 "answer": false, 204 "justification": "Human evaluation is not relevant to this automated security benchmark evaluation.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": false, 209 "answer": false, 210 "justification": "No training is performed; AgentDojo is a benchmark, not a prediction task requiring train/test splits.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Table 5 (Appendix A) provides full breakdowns across 4 attack types (NoAttack, Direct, Ignore Previous, Important Messages) and 3 models for every defense method.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Parameter hijacking is explicitly identified as a concrete failure case with an example (email address substitution) that bypasses the defense entirely.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "Substantial BU decreases of 28–55% relative to no-defense baseline are reported honestly across all models and discussed in Section 4.2.2.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": false, 236 "justification": "'gpt-oss-120b' is not a publicly known model and has no snapshot date; llama-3.1-70b and qwen3-32b lack version snapshot dates or API access dates.", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "Appendices B and C provide the complete ParseData and CheckTool prompts verbatim, including all placeholder variables.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": true, 248 "justification": "Temperature=0 and context length=64KB are specified in Section 4.1; these are the only relevant hyperparameters for inference.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": true, 254 "justification": "Figure 1 and Section 3 describe how ParseData and CheckTool integrate into the agent pipeline step-by-step, including the anticipation/extraction two-phase process.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "Section 4.1 describes AgentDojo benchmark structure, the three attack types selected, and the four domains; no additional preprocessing was performed.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": true, 268 "justification": "Appendix A (Table 5) contains the complete numerical results for all conditions, models, attacks, and metrics.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section 4.1 describes AgentDojo benchmark structure with 16/21/20/40 tasks across 4 domains and how attacks are injected into tool results.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants; recruitment is not applicable.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "Section 3.1 formalizes the full pipeline mathematically and Figure 1 shows exactly where defense modules integrate into tool call execution.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No training cutoffs are stated for any of the three evaluated models; AgentDojo (2024) tasks could plausibly appear in training data.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "No discussion of whether AgentDojo tasks or attack patterns were present in model training corpora.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": true, 305 "answer": false, 306 "justification": "AgentDojo was published in 2024 and could be in training data for 2025/2026 model versions; this is not addressed.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human participants.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human participants.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human participants.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "ParseData and CheckTool both add extra LLM calls per agent step, but no latency, token count, or monetary cost estimates are provided.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "No total compute budget, number of API calls, or experiment runtime is reported.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "ParseData+CheckTool achieves the lowest Attack Success Rate among all evaluated defenses, below 1%.", 373 "evidence": "Table 1: Avg ASR 0.19% (gpt-oss-120b), 0.34% (llama-3.1-70b), 0.11% (qwen3-32b) vs next best Tool Filter at 1.71%, 2.32%, 2.58%.", 374 "supported": "strong" 375 }, 376 { 377 "claim": "The proposed method achieves competitive Utility under Attack compared to baselines.", 378 "evidence": "Table 1 shows UA drops to 51.84% vs 64.02% for Tool Filter (gpt-oss-120b), a 12pp gap; framing as 'competitive' is generous given consistent 10–15pp deficit.", 379 "supported": "weak" 380 }, 381 { 382 "claim": "Risk (ASR/UA) for Parse+Check is approximately 1/10 to 1/8 that of Tool Filter.", 383 "evidence": "Figure 3: CheckTool+ParseData at 0.22–0.76% vs Tool Filter 2.93–6.28% across three models, yielding approximately the claimed ratio.", 384 "supported": "strong" 385 }, 386 { 387 "claim": "Stronger LLM reasoning depth improves ParseData performance but degrades CheckTool performance.", 388 "evidence": "Table 3: qwen3-32b ParseData BU=63.92% vs CheckTool BU=42.27%; gpt-oss-120b shows near parity (54.64% vs 53.61%), consistent with the reasoning-depth hypothesis.", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "Under the most powerful attack (Important Messages), existing defenses have ASR >4% while the proposed method achieves 0.11–0.53%.", 393 "evidence": "Table 2: DeBERTa 4.11–6.32%, Tool Filter 5.69–7.90%, Repeat Prompt 14.75–16.86%, vs ParseData+CheckTool 0.11–0.53%.", 394 "supported": "strong" 395 } 396 ], 397 "methodology_tags": [ 398 "benchmark-eval" 399 ], 400 "key_findings": "ParseData+CheckTool reduces Attack Success Rate to below 1% across three models on AgentDojo—a 3–10x improvement over the next best defense (Tool Filter at 2.93–6.28% risk) and roughly 1/100 the ASR of no defense. The cost is a 28–55% reduction in Benign Utility compared to no defense, making the utility-security tradeoff significant but quantified. A key identified failure mode—parameter hijacking attacks—bypasses the defense entirely and is left for future work. The defense scales with underlying LLM reasoning capability for ParseData but degrades for CheckTool as models reason more aggressively.", 401 "red_flags": [ 402 { 403 "flag": "No statistical significance testing", 404 "detail": "All results are single-run percentages with no confidence intervals, error bars, or significance tests, despite making strong comparative quantitative claims." 405 }, 406 { 407 "flag": "Unknown primary model 'gpt-oss-120b'", 408 "detail": "The primary model is not a publicly known model with a snapshot date or documentation, making independent reproduction impossible and comparison with external benchmarks invalid." 409 }, 410 { 411 "flag": "Utility drop understated", 412 "detail": "BU/UA drops of 28–55% from no-defense baseline are consistently described as 'competitive' in the abstract and framing, despite being substantial practical costs for deployment." 413 }, 414 { 415 "flag": "Inference cost omitted", 416 "detail": "ParseData and CheckTool each add full LLM calls per agent step; no latency, token cost, or overhead analysis is provided despite this being central to practical adoption." 417 }, 418 { 419 "flag": "Single benchmark, English only", 420 "detail": "All experiments use only AgentDojo in English; no other IPI benchmarks or non-English settings are tested, limiting claimed generalizability." 421 }, 422 { 423 "flag": "Code availability contradiction", 424 "detail": "Abstract provides a GitHub URL implying current availability, but Ethical Considerations says 'The source code will be made publicly available,' suggesting it was not released at submission." 425 }, 426 { 427 "flag": "Benchmark contamination unaddressed", 428 "detail": "AgentDojo (2024) could appear in training data for the 2025/2026 model versions evaluated; this is not discussed." 429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", 434 "relevance": "Primary benchmark for all experiments; defines UA, ASR metrics and provides attack scenarios and defense integration framework" 435 }, 436 { 437 "title": "StruQ: Defending Against Prompt Injection with Structured Queries", 438 "relevance": "Competing training-based defense that fine-tunes LLMs to distinguish instructions from data; used as conceptual baseline" 439 }, 440 { 441 "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting", 442 "relevance": "Competing prompt-based defense using delimiters; included as experimental baseline" 443 }, 444 { 445 "title": "Can Indirect Prompt Injection Attacks Be Detected and Removed?", 446 "relevance": "Lightweight model-based detection approach for IPI; conceptual and experimental baseline" 447 }, 448 { 449 "title": "MELON: Provable Defense Against Indirect Prompt Injection Attacks in AI Agents", 450 "relevance": "Defense that monitors for suspicious tool calls at execution time; related to CheckTool's monitoring approach" 451 }, 452 { 453 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 454 "relevance": "Foundational work demonstrating IPI as a practical threat against real deployed LLM applications" 455 }, 456 { 457 "title": "Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on LLM Agents", 458 "relevance": "Shows existing defenses fail against adaptive attackers; motivates the stronger defense approach in this paper" 459 } 460 ], 461 "engagement_factors": { 462 "practical_relevance": { 463 "score": 3, 464 "justification": "Addresses a live deployment threat in LLM agents with a training-free, prompt-based solution that can be integrated into existing agent pipelines without model changes." 465 }, 466 "surprise_contrarian": { 467 "score": 1, 468 "justification": "The core insight (extract only what you need, discard the rest) is intuitive; magnitude of improvement over baselines is notable but the direction is expected." 469 }, 470 "fear_safety": { 471 "score": 2, 472 "justification": "Explicitly frames IPI as an escalating threat as agents gain physical control over autonomous systems and robotics, raising genuine safety stakes." 473 }, 474 "drama_conflict": { 475 "score": 1, 476 "justification": "Participates in an active security arms race between attack and defense research, but the paper itself takes no controversial positions." 477 }, 478 "demo_ability": { 479 "score": 2, 480 "justification": "Code is released on GitHub with AgentDojo integration; anyone with API access to compatible models could run the experiments." 481 }, 482 "brand_recognition": { 483 "score": 0, 484 "justification": "Authors are from Harbin Institute of Technology; no famous AI lab, industry affiliation, or well-known product involved." 485 } 486 }, 487 "hn_data": { 488 "threads": [ 489 { 490 "hn_id": "46624374", 491 "title": "Quantum Automated Theorem Proving", 492 "points": 5, 493 "comments": 0, 494 "url": "https://news.ycombinator.com/item?id=46624374", 495 "created_at": "2026-01-14T22:06:27Z" 496 } 497 ], 498 "top_points": 5, 499 "total_points": 5, 500 "total_comments": 0 501 } 502 }