scan-v4.json (31161B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Defense Against Indirect Prompt Injection via Tool Result Parsing", 6 "authors": [ 7 "Qiang Yu", 8 "Xinran Cheng", 9 "Chuanyi Liu" 10 ], 11 "year": 2026, 12 "venue": "arXiv.org", 13 "arxiv_id": "2601.04795", 14 "doi": "10.48550/arXiv.2601.04795" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "The abstract claims 'competitive Utility under Attack (UA) while maintaining the lowest Attack Success Rate (ASR) to date.' Table 1 confirms the lowest ASR (0–0.34%). UA is moderate but comparable to DeBERTa Detector (the other low-ASR method), supporting 'competitive' UA relative to ASR-comparable defenses.", 22 "source": "opus" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": false, 27 "justification": "The conclusion claims 'deeper reasoning in LLMs positively correlates with improved BU, UA and ASR for ParseData' and that 'increased reasoning depth tends to introduce more errors' for CheckTool. These causal claims about reasoning depth are based on comparing only 3 models that differ in many ways beyond reasoning depth, with no controlled manipulation.", 28 "source": "opus" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "The title claims 'Defense Against Indirect Prompt Injection' broadly, and the abstract motivates with 'autonomous systems and robotics.' Results are only on AgentDojo's four domains (banking, slack, travel, workspace) with 3 models and 3 attack types. The Limitations section notes English-only but doesn't bound to AgentDojo specifically.", 34 "source": "opus" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "No discussion of alternative explanations for the results. 
For example, the UA improvement under attack (Section 4.2.2) could have alternative explanations beyond the proposed mechanism, but only one interpretation is offered.", 40 "source": "opus" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "The paper measures ASR, UA, BU, and Risk on AgentDojo, which directly tests defense against indirect prompt injection — the claimed outcome. The metrics measure exactly what is claimed (task completion rate and attack success rate). The broader framing about robotics is motivation, not a measured claim.", 46 "source": "opus" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": true, 53 "justification": "A dedicated 'Limitations' section is present after the Conclusion, discussing parameter hijacking attacks and English-only evaluation.", 54 "source": "opus" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": true, 59 "justification": "The Limitations section identifies a specific threat: parameter hijacking attacks (e.g., redirecting payments via injected email addresses) bypass the defense because no unauthorized tool is called. This is specific to this study's approach.", 60 "source": "opus" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": true, 65 "justification": "The Limitations section explicitly states what is NOT covered: parameter hijacking attacks and non-English languages. It also notes the lack of benchmarks for parameter hijacking.", 66 "source": "opus" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding information is disclosed anywhere in the paper. 
No acknowledgments section mentioning grants or sponsors.", 74 "source": "opus" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": true, 79 "justification": "Author affiliations are clearly listed: all three authors are from Harbin Institute of Technology with institutional email addresses.", 80 "source": "opus" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": true, 84 "answer": false, 85 "justification": "No funding is disclosed, so independence cannot be assessed. The absence of a funding disclosure is not the same as confirming no conflicts.", 86 "source": "opus" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests statement or financial disclosure is present in the paper.", 92 "source": "opus" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "IPI is defined in Section 1, LLM Agent (A, M, F) is formally defined in Section 3.1 with mathematical notation, and ParseData and CheckTool are precisely described; ASR, UA, BU, and Risk metrics are all defined in Table 5.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "The abstract and introduction clearly state the contribution: a prompt-based defense mechanism using tool result parsing (ParseData module) and trigger sanitization (CheckTool module) to achieve lowest ASR against IPI.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section 2 systematically categorizes prior work into model-based and prompt-based defenses, explains weaknesses of each, and explicitly positions the proposed approach as addressing gaps in prompt-based detection accuracy.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 "empirical": { 118 "artifacts": { 119 "code_released": { 120 
"applies": true, 121 "answer": true, 122 "justification": "Abstract states 'Code is available at GitHub' with a link to https://github.com/qiang-yu/agentdojo/tree/tool-result-extract.", 123 "source": "opus" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": true, 128 "justification": "The benchmark used is AgentDojo (Debenedetti et al., 2024), which is a publicly available framework. No custom data was collected.", 129 "source": "opus" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "Only temperature (0) and context length (64KB) are specified. No requirements.txt, Dockerfile, or detailed environment setup listing library versions is provided.", 135 "source": "opus" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "No step-by-step reproduction instructions are provided in the paper. Code is released but no README or reproduction guide is described.", 141 "source": "opus" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "All results in Tables 1–5 are point estimates with no confidence intervals, error bars, or uncertainty measures.", 149 "source": "opus" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "The paper claims its methods 'significantly outperform' baselines (abstract, Section 4.2) but provides no statistical significance tests — comparisons are made by inspecting raw numbers only.", 155 "source": "opus" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Effect sizes are reported with baseline context: 'our methods yield a risk value of only 0.2%–1%, which is approximately 1/10 to 1/8 that of Tool Filter' (Section 4.2.1). 
Tables provide absolute values for all methods enabling direct comparison.", 161 "source": "opus" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "AgentDojo contains 97 user tasks across 4 domains. No justification for why this sample size is adequate for the claims made, and no power analysis.", 167 "source": "opus" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run numbers. Temperature is set to 0 but this is not discussed as justification for omitting variance.", 173 "source": "opus" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Four baseline defenses are compared: DeBERTa Detector, Repeat User Prompt, Spotlighting with Delimiting, and Tool Filter (Section 4.1).", 181 "source": "opus" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "All baselines are from 2024: DeBERTa Detector (ProtectAI.com, 2024), AgentDojo defenses (Debenedetti et al., 2024), Spotlighting (Hines et al., 2024). These represent the current state of the art.", 187 "source": "opus" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 4.3 presents ablation studies showing individual contributions of ParseData and CheckTool (Table 3), and ParseData vs ParseFull (Table 4). Multiple combination orderings are tested.", 193 "source": "opus" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": true, 198 "justification": "Four metrics are used: Benign Utility (BU), Utility under Attack (UA), Attack Success Rate (ASR), and Risk (ASR/UA). Defined in Section 4.1.", 199 "source": "opus" 200 }, 201 "human_evaluation": { 202 "applies": true, 203 "answer": false, 204 "justification": "No human evaluation is included. 
All evaluation is automated through the AgentDojo benchmark. Human evaluation could assess false positives where legitimate data is incorrectly filtered by ParseData.", 205 "source": "opus" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": false, 210 "justification": "No distinction between development and test sets. The prompts in Appendices B–C were likely designed iteratively while observing AgentDojo results, but no held-out subset was used for final evaluation.", 211 "source": "opus" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": true, 216 "justification": "Table 5 provides per-attack-type breakdowns (Direct, Ignore Previous, Important Messages) and per-model breakdowns across all defense methods.", 217 "source": "opus" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": true, 222 "justification": "The Limitations section discusses parameter hijacking as a specific failure mode that bypasses the defense. Section 4.3.1 discusses how CheckTool's reasoning leads to errors. Section 4.2.2 notes that qwen3-32b's deep thinking causes more mistakes with defenses.", 223 "source": "opus" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "Several negative results are reported: CheckTool decreases BU with stronger reasoning models (Section 4.3.1), ParseFull decreases BU by 32.43% for llama-3.1-70b (Section 4.3.2), and combining modules 'diminishes overall utility compared to their standalone performance' (Section 4.3.1).", 229 "source": "opus" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": false, 236 "justification": "Models are listed as 'gpt-oss-120b', 'Llama-3.1-70b', and 'qwen3-32b' (Section 4.1). No snapshot dates or API versions are provided. 
'gpt-oss-120b' is not a recognized public model identifier.", 237 "source": "opus" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": true, 242 "justification": "Full prompt text for ParseData (Appendix B) and CheckTool (Appendix C) modules is provided, including the anticipation prompt, data extraction prompt, and trigger removal prompt.", 243 "source": "opus" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": true, 248 "justification": "Section 4.1 states 'Temperature and context length are set to 0 and 64KB respectively.' These are the key LLM generation hyperparameters.", 249 "source": "opus" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": true, 254 "justification": "Figure 1 provides architecture diagrams for both ParseData and CheckTool modules. Section 3.3–3.4 describe the workflow in detail: anticipation step, data extraction with format/logic constraints, tool trigger detection, and sanitization. Prompt text is in appendices.", 255 "source": "opus" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": true, 260 "justification": "The benchmark (AgentDojo) is used directly. Section 4.1 describes the benchmark structure: 4 domains, 97 total tasks, attack injection mechanism. No custom preprocessing is applied to the data.", 261 "source": "opus" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": false, 268 "justification": "Code is released but no raw experimental logs, model outputs, or per-task results are mentioned as available. 
Only aggregated results in tables are provided.", 269 "source": "opus" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section 4.1 describes AgentDojo's structure: 4 domains (banking, slack, travel, workspace), 16+21+20+40 tasks, tool-based interactions, injection mechanism, and verification process.", 275 "source": "opus" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants. The study uses the AgentDojo benchmark exclusively.", 281 "source": "opus" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": true, 286 "justification": "The experimental pipeline is clear: user task → LLM reasoning → tool call → defense module (ParseData/CheckTool) → sanitized output → LLM response. Metric formulas are explicitly stated in Table 5.", 287 "source": "opus" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": false, 293 "answer": false, 294 "justification": "The paper tests defense mechanisms against prompt injection, not model knowledge on benchmarks. AgentDojo evaluates agent behavior under attack, not whether models have memorized benchmark solutions.", 295 "source": "opus" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": false, 299 "answer": false, 300 "justification": "The paper tests defenses/tools rather than model knowledge. Contamination of AgentDojo tasks is not relevant to the defense evaluation.", 301 "source": "opus" 302 }, 303 "benchmark_contamination_addressed": { 304 "applies": false, 305 "answer": false, 306 "justification": "The paper tests defenses/tools rather than model knowledge. 
The benchmark measures whether attacks succeed and tasks complete, not model recall capability.", 307 "source": "opus" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "No human participants in this study.", 315 "source": "opus" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "No human participants in this study.", 321 "source": "opus" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "No human participants in this study.", 327 "source": "opus" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "No human participants in this study.", 333 "source": "opus" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "No human participants in this study.", 339 "source": "opus" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "No human participants in this study.", 345 "source": "opus" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "No human participants in this study.", 351 "source": "opus" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "The method adds extra LLM calls per tool invocation (ParseData requires 2 additional LLM calls, CheckTool requires 1-2). No inference cost, latency, or token overhead is reported.", 359 "source": "opus" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "No total computational budget is stated. 
The paper runs experiments across 3 models, 3 attacks, and 12 defense configurations but does not quantify total API costs or compute time.", 365 "source": "opus" 366 } 367 }, 368 "experimental_rigor": { 369 "seed_sensitivity_reported": { 370 "applies": true, 371 "answer": false, 372 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Temperature is set to 0 which reduces but does not eliminate stochasticity in API-based models.", 373 "source": "opus" 374 }, 375 "number_of_runs_stated": { 376 "applies": true, 377 "answer": false, 378 "justification": "The number of experimental runs is not stated. Results appear to be from single runs.", 379 "source": "opus" 380 }, 381 "hyperparameter_search_budget": { 382 "applies": true, 383 "answer": false, 384 "justification": "No hyperparameter search budget is reported. The prompt designs in Appendices B–C were presumably iterated but no search process is documented.", 385 "source": "opus" 386 }, 387 "best_config_selection_justified": { 388 "applies": true, 389 "answer": false, 390 "justification": "Multiple configurations are tested (ParseData, ParseFull, CheckTool, and combinations in both orderings) but no justification for how the 'best' configuration was selected. No validation set is used for selection.", 391 "source": "opus" 392 }, 393 "multiple_comparison_correction": { 394 "applies": true, 395 "answer": false, 396 "justification": "No statistical tests are performed at all, so no correction for multiple comparisons. The paper makes many comparisons across 3 models × 3 attacks × 12 defenses without any statistical framework.", 397 "source": "opus" 398 }, 399 "self_comparison_bias_addressed": { 400 "applies": true, 401 "answer": false, 402 "justification": "The authors implement their own defense modules and compare against baselines within the same framework. 
No acknowledgment of potential bias in implementing and evaluating their own system.", 403 "source": "opus" 404 }, 405 "compute_budget_vs_performance": { 406 "applies": true, 407 "answer": false, 408 "justification": "The proposed method adds multiple extra LLM calls per tool invocation compared to baselines that add zero or one. This compute cost difference is never discussed in relation to performance gains.", 409 "source": "opus" 410 }, 411 "benchmark_construct_validity": { 412 "applies": true, 413 "answer": false, 414 "justification": "No discussion of whether AgentDojo accurately represents real-world indirect prompt injection threats. The benchmark's 4 domains and 97 tasks may not capture the full diversity of real-world agent deployments.", 415 "source": "opus" 416 }, 417 "scaffold_confound_addressed": { 418 "applies": true, 419 "answer": false, 420 "justification": "The paper compares model performance (gpt-oss-120b vs llama-3.1-70b vs qwen3-32b) and attributes differences to 'reasoning depth' without controlling for other model differences. The defense modules interact with each model's capabilities differently but this confound is not addressed.", 421 "source": "opus" 422 } 423 }, 424 "data_leakage": { 425 "temporal_leakage_addressed": { 426 "applies": true, 427 "answer": false, 428 "justification": "No discussion of whether the models could have seen AgentDojo tasks or similar patterns during training. AgentDojo was published in 2024; models may have been trained on its contents.", 429 "source": "opus" 430 }, 431 "feature_leakage_addressed": { 432 "applies": true, 433 "answer": false, 434 "justification": "No discussion of feature leakage. 
The evaluation setup provides models with full tool definitions and conversation context, but whether this mirrors real-world information availability is not discussed.", 435 "source": "opus" 436 }, 437 "non_independence_addressed": { 438 "applies": true, 439 "answer": false, 440 "justification": "No discussion of independence between AgentDojo tasks. Tasks within domains may share structural patterns that inflate performance estimates.", 441 "source": "opus" 442 }, 443 "leakage_detection_method": { 444 "applies": true, 445 "answer": false, 446 "justification": "No leakage detection or prevention method is used.", 447 "source": "opus" 448 } 449 } 450 } 451 }, 452 "claims": [ 453 { 454 "claim": "ParseData+CheckTool and CheckTool+ParseData achieve ASR below 1%, approximately 1/10 that of the best competing defenses (DeBERTa Detector and Tool Filter).", 455 "evidence": "Table 1 shows Avg ASR of 0.19% and 0.11% for gpt-oss-120b, compared to DeBERTa's 1.19% and Tool Filter's 1.71%.", 456 "supported": "strong" 457 }, 458 { 459 "claim": "The proposed method achieves an Avg Risk of 0.2%–1%, approximately 1/10 to 1/8 that of Tool Filter (the next best baseline at 3%–6%).", 460 "evidence": "Figure 3 shows Risk values of 0.22%–1.33% for the proposed methods vs. 
2.93%–6.28% for Tool Filter across all three models.", 461 "supported": "strong" 462 }, 463 { 464 "claim": "Deeper reasoning capability in LLMs positively correlates with ParseData performance but negatively affects CheckTool performance.", 465 "evidence": "Table 3 shows qwen3-32b achieves BU 51.22% higher than CheckTool with ParseData, while CheckTool BU is substantially lower; attributed to deep thinking causing over-identification of triggers.", 466 "supported": "moderate" 467 }, 468 { 469 "claim": "The 'Important Messages' attack achieves consistently the highest ASR across all models and defenses.", 470 "evidence": "Table 5 shows Important Messages ASR is higher than Direct and Ignore Previous for every defense-model combination, consistent with the claim in Debenedetti et al. 2024.", 471 "supported": "strong" 472 }, 473 { 474 "claim": "Existing prompt-based defenses (Repeat User Prompt, Spotlighting with Delimiting) leave ASR above 10% under the Important Messages attack, making them impractical.", 475 "evidence": "Table 2 shows Repeat User Prompt ASR of 14.75%/11.80%/16.86% and Spotlighting ASR of 23.29%/13.17%/25.08% for gpt-oss-120b, llama-3.1-70b, qwen3-32b respectively under Important Messages.", 476 "supported": "strong" 477 }, 478 { 479 "claim": "qwen3-32b achieves 0% ASR under the Important Messages attack with CheckTool+ParseData, but at the cost of substantially reduced benign utility.", 480 "evidence": "Table 2 shows qwen3-32b CheckTool+ParseData achieves 0.00% ASR but only 37.11% BU vs. 83.51% BU for No Defense.", 481 "supported": "strong" 482 } 483 ], 484 "methodology_tags": [ 485 "benchmark-eval" 486 ], 487 "key_findings": "The paper proposes a two-module prompt-based defense against indirect prompt injection: ParseData extracts only the minimal necessary data from tool results using LLM-driven format and logic constraints, while CheckTool detects and sanitizes content that triggers unauthorized tool calls. 
Evaluated on AgentDojo with three LLMs (gpt-oss-120b, llama-3.1-70b, qwen3-32b) and three attack types, the combined ParseData+CheckTool achieves average ASR below 1% across all models—approximately 1/10 that of the next best defenses—while maintaining moderate utility. A key tradeoff is identified: deeper-reasoning models (qwen3-32b) benefit ParseData but are harmed by CheckTool, which over-triggers and removes legitimate data. The defense does not address parameter hijacking attacks, which represent a significant uncovered attack class.", 488 "red_flags": [ 489 { 490 "flag": "Unrecognizable model name", 491 "detail": "'gpt-oss-120b' is not a known public OpenAI model; results on this model cannot be independently verified or reproduced, undermining a third of the experimental evidence." 492 }, 493 { 494 "flag": "No statistical significance tests", 495 "detail": "All comparative claims between defense methods are made without significance tests, confidence intervals, or variance across runs despite the small benchmark size (97 total tasks)." 496 }, 497 { 498 "flag": "Single-run results only", 499 "detail": "Setting temperature=0 reduces run-to-run variance, but no multi-run results or cross-domain breakdown are given; all results are single point estimates with no uncertainty quantification." 500 }, 501 { 502 "flag": "Large benign utility drop unaddressed", 503 "detail": "The proposed methods reduce benign utility by 28–45% relative to No Defense, but the paper does not discuss the practical acceptability of this degradation or compare it to user tolerance thresholds." 504 }, 505 { 506 "flag": "No funding disclosure", 507 "detail": "No funding source is disclosed and there is no acknowledgments section; no conflict-of-interest or competing-interests statement is provided, so potential conflicts cannot be assessed." 508 }, 509 { 510 "flag": "Contamination not discussed", 511 "detail": "Models tested (especially qwen3-32b, released 2025) may have been trained on AgentDojo (NeurIPS 2024); this could affect baseline susceptibility estimates but is not acknowledged." 
512 } 513 ], 514 "cited_papers": [ 515 { 516 "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", 517 "relevance": "Primary benchmark used for all experiments; provides the attack suite and evaluation framework" 518 }, 519 { 520 "title": "StruQ: Defending Against Prompt Injection with Structured Queries", 521 "relevance": "Key baseline fine-tuning approach that separates instructions from data at the model level" 522 }, 523 { 524 "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting", 525 "relevance": "Baseline defense using delimiters to distinguish data from instructions" 526 }, 527 { 528 "title": "MELON: Provable Defense Against Indirect Prompt Injection Attacks in AI Agents", 529 "relevance": "Recent defense using monitoring of suspicious tool calls, directly comparable approach" 530 }, 531 { 532 "title": "Adaptive Attacks Break Defenses Against Indirect Prompt Injection Attacks on LLM Agents", 533 "relevance": "Establishes threat model for adaptive attacks that defenses must handle" 534 }, 535 { 536 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 537 "relevance": "Foundational work on IPI attacks including privacy leakage and fraud scenarios" 538 }, 539 { 540 "title": "Defense Against Prompt Injection Attack by Leveraging Attack Techniques", 541 "relevance": "Baseline prompt-based defense that uses hijacking techniques defensively" 542 }, 543 { 544 "title": "The Task Shield: Enforcing Task Alignment to Defend Against Indirect Prompt Injection in LLM Agents", 545 "relevance": "Alternative defense using correlation between user instructions and assistant messages" 546 } 547 ], 548 "engagement_factors": { 549 "practical_relevance": { 550 "score": 2, 551 "justification": "Proposes deployable prompt-based defense modules (ParseData/CheckTool) that practitioners building LLM agents could integrate, though 
the 28-45% utility cost limits immediate adoption." 552 }, 553 "surprise_contrarian": { 554 "score": 1, 555 "justification": "The approach of parsing/filtering tool results rather than detecting injections is a modest reframing, but the core finding that prompt-based defenses can achieve <1% ASR is incrementally better rather than surprising." 556 }, 557 "fear_safety": { 558 "score": 2, 559 "justification": "Directly addresses indirect prompt injection in LLM agents with concrete attack demonstrations and defense benchmarks, a real and growing security concern as agents gain tool-use capabilities." 560 }, 561 "drama_conflict": { 562 "score": 0, 563 "justification": "No controversy or conflict — straightforwardly proposes a defense and compares against baselines without challenging any company or widely-held belief." 564 }, 565 "demo_ability": { 566 "score": 1, 567 "justification": "Code is available on GitHub and uses the AgentDojo benchmark, but requires setting up multiple LLM APIs and the benchmark framework to reproduce." 568 }, 569 "brand_recognition": { 570 "score": 0, 571 "justification": "From Harbin Institute of Technology with no famous-lab cachet, uses an unrecognized model name (gpt-oss-120b), and the topic lacks association with a household-name product." 572 } 573 }, 574 "hn_data": { 575 "threads": [ 576 { 577 "hn_id": "46624374", 578 "title": "Quantum Automated Theorem Proving", 579 "points": 5, 580 "comments": 0, 581 "url": "https://news.ycombinator.com/item?id=46624374", 582 "created_at": "2026-01-14T22:06:27Z" 583 } 584 ], 585 "top_points": 5, 586 "total_points": 5, 587 "total_comments": 0 588 } 589 }