scan.json (32732B)
1 { 2 "paper": { 3 "title": "ICON: Indirect Prompt Injection Defense for Agents based on Inference-Time Correction", 4 "authors": [ 5 "Che Wang", 6 "Fuyao Zhang", 7 "Jiaming Zhang", 8 "Ziqi Zhang", 9 "Yinghui Wang", 10 "Longtao Huang", 11 "Jianbo Gao", 12 "Zhong Chen", 13 "Wei Yang Bryan Lim" 14 ], 15 "year": 2026, 16 "venue": "arXiv", 17 "arxiv_id": "2602.20708" 18 }, 19 "scan_version": 3, 20 "active_modules": ["experimental_rigor", "data_leakage"], 21 "methodology_tags": ["benchmark-eval"], 22 "key_findings": "ICON achieves 0.4% average attack success rate across three LLM backbones while maintaining substantially higher task utility than existing defenses (10% average UA improvement over baselines). The framework trains in under 2 minutes on 255 samples and generalizes to out-of-distribution benchmarks with >80% detection rate. The approach also extends to multimodal LLMs, achieving 2.9% average ASR with 47.2% utility under attack.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": false, 28 "justification": "No source code repository URL is provided anywhere in the paper. No GitHub, GitLab, or archive link is mentioned." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The evaluation uses publicly available benchmarks: InjectAgent (Zhan et al., 2024), AgentDojo (Debenedetti et al., 2024), TrojanTools (Anonymous, 2025), and Visual Prompt Injection Benchmarks (Wan et al., 2024). However, the 255 synthesized training samples generated by their LLM-as-Optimizer pipeline are not released." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No environment specifications are provided — no requirements.txt, Dockerfile, conda environment, hardware details, or library versions are mentioned." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Table 3 (OOD results) includes ± standard deviations over 5 cross-validation runs, but Table 2 (the main results table with ASR and UA across all models and attacks) reports only point estimates with no uncertainty measures." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "No statistical significance tests are used anywhere. Claims like 'ICON achieves an average 10% improvement in UA' are based solely on comparing point estimates without any tests." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper reports absolute differences with baseline context throughout — e.g., 'ICON provides an average 10% improvement in UA compared to other defense baselines' and 'our method maintains a UA of 53.8%, whereas the next best defense drops below 49%' (Section 5.2). Table 2 provides full baseline numbers for context." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "No justification is provided for the sample sizes. The training set of 255 samples is stated in Table 4 without explanation. The number of evaluation examples from each benchmark is not stated or justified." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "Table 3 reports standard deviations over 5 cross-validation runs for OOD evaluation, but Table 2 (the main results) reports single values with no variance, standard deviation, or spread measure. The main claims rest on Table 2." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Table 2 compares ICON against five baselines: No Defense, Repeat Prompt, Delimiting, MELON, and LLM-Detector (Qwen3Guard). Table 5 adds Gemini as a baseline for multimodal experiments." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Baselines include MELON (Zhu et al., 2025), Qwen3Guard (Zhao et al., 2025), and AgentDojo's Repeat Prompt defense (Debenedetti et al., 2024). These are recent and represent state-of-the-art approaches across template, filter, and fine-tuning defense categories." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": false, 87 "justification": "ICON has two core modules (LSTP for detection and MR for rectification), but no formal ablation study tests each component independently. Table 3 separates ADR (detection) from URR (utility recovery), but there is no experiment showing LSTP-only or MR-only performance to isolate each module's contribution." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper uses ASR (Attack Success Rate), UA (Utility under Attack), ADR (Attack Detection Rate), and URR (Utility Recovery Rate) across different evaluation settings." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation is performed. All evaluation is automated using ASR and UA metrics. Human evaluation could help verify that 'utility recovery' genuinely produces correct and useful agent outputs rather than just non-malicious ones." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "The prober is trained exclusively on TrojanTools and tested on unseen InjectAgent and AgentDojo benchmarks (Section 5.3, Table 3), providing a clear held-out OOD evaluation." 
103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Table 2 breaks results down by attack type (Ignore Instruction, Combined Attacks, TrojanTools) and by backbone model (QWEN, LLAMA, MISTRAL). Table 5 provides per-model breakdowns for multimodal experiments." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": false, 112 "justification": "No failure cases are shown or discussed. The paper does not analyze where ICON fails, what types of attacks evade it, or when utility recovery is incomplete." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": false, 117 "justification": "Every experimental result shows ICON performing well or comparably to the best baseline. No experiments where ICON underperforms, no configurations that failed, and no approaches that were tried and abandoned are reported." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": false, 124 "justification": "The abstract claims 'a competitive 0.4% ASR', which matches Table 2 averages. However, 'over 50% task utility gain' is misleading — this appears to conflate Utility Recovery Rate (URR from Table 3, a percentage of recovered utility) with a 50-point improvement in UA. Actual UA improvements in Table 2 are around 10 percentage points over the next best baseline." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper's main causal claim is that ICON improves defense (lower ASR) while preserving utility (higher UA). The experimental design — comparing multiple defenses on the same attacks, datasets, and models — is a controlled comparison adequate for this causal inference. Section 5.4 provides mechanistic evidence through attention visualization." 
130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title claims a general 'Indirect Prompt Injection Defense for Agents' but all experiments use only 8B-parameter open-weight models. No experiments with larger models (70B+) or commercial APIs are shown. The paper does not discuss whether the attention-based detection mechanism would work on models with different architectures or scales." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "No alternative explanations for the results are discussed. For example, the paper does not consider whether the attention collapse pattern might be an artifact of specific attack strategies rather than a general IPI signature, or whether the utility gains come from the softmax redistribution simply reducing confidence in all tool calls." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper directly measures what it claims: ASR measures whether attacks succeed (binary tool-call matching), and UA measures task completion under attack. These are not proxies — they are the actual outcomes of interest for evaluating a defense mechanism." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Models are listed as 'QWEN-3-8B', 'LLAMA-3.1-8B', and 'MISTRAL-8B' in Table 2. While Qwen3-8B and Llama-3.1-8B identify model families and sizes, no exact checkpoint identifiers or snapshot dates are provided. 'MISTRAL-8B' is particularly ambiguous — no version number is given, and the citation points only to the Mistral docs page." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": false, 156 "justification": "No actual prompts are provided. 
The paper describes using the ReAct framework and mentions an 'LLM-as-Optimizer' for attack synthesis, but the actual prompt text used for agent execution, attack generation, or defense calibration is not shown." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": false, 161 "justification": "The paper introduces hyperparameters τ (steering scope) and γ (steering intensity) as core design parameters but does not report their specific values used in experiments. LLM inference parameters (temperature, top-p, max tokens) are also not reported." 162 }, 163 "scaffolding_described": { 164 "applies": true, 165 "answer": true, 166 "justification": "The ReAct framework is described and cited (Yao et al., 2022). The agent formalization is given in Section 3 (Eq. 3) including trajectory state, tool set, and iterative reasoning-acting loop. The ICON framework itself is described in detail in Section 4 with architecture diagrams (Figure 2)." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": false, 171 "justification": "Section 4.3 describes the adaptive attack data synthesis at a conceptual level (contextual tool alignment, iterative stealthiness optimization) but does not provide concrete preprocessing details. The number of evaluation examples per benchmark, filtering criteria, or how benchmark tasks were prepared for the ReAct framework are not documented." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": false, 178 "justification": "There is no limitations section in the paper. The paper goes directly from Section 5 (Experiments) to Section 6 (Conclusion) with an Impact Statement. The Impact Statement says 'none of which we feel must be specifically highlighted here' — actively declining to discuss limitations." 
179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": false, 183 "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of internal or external validity threats specific to this study." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "No explicit scope boundaries are stated. The paper does not discuss what the results do NOT show — e.g., whether ICON works on larger models, closed-source APIs, or more sophisticated adaptive attacks beyond those tested." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "No raw experimental data is available. Per-example predictions, attention maps, or individual trial outcomes are not released. Only aggregate results in tables are provided." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Evaluation datasets are established benchmarks with citations: InjectAgent (Zhan et al., 2024), AgentDojo (Debenedetti et al., 2024), TrojanTools (Anonymous, 2025), and Visual Prompt Injection Benchmarks (Wan et al., 2024). The training data synthesis process is described in Section 4.3 with two specific dimensions (contextual tool alignment, iterative stealthiness optimization)." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants in this study. All data comes from automated benchmark evaluation." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": false, 210 "justification": "The full pipeline from raw benchmark data to final results is not documented. Key missing details include: how many examples from each benchmark were used, how the ReAct agent was initialized for each task, and how ASR and UA were computed from raw agent outputs." 
211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding information is disclosed. There is no acknowledgments section listing grants or sponsors." 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are listed: Peking University, Nanyang Technological University, Ant Group, and Alibaba. These are clearly stated on the first page." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "No funding is disclosed, but authors from Ant Group and Alibaba — companies that deploy LLM-based agents — have a potential financial interest in demonstrating effective, lightweight defenses. This potential conflict is not acknowledged." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement is present in the paper." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "This paper tests a defense mechanism against prompt injection attacks, not model knowledge on benchmarks. The evaluation measures whether the defense detects and mitigates attacks, which does not depend on whether the model has seen the benchmark tasks in training." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "This paper evaluates a defense tool rather than a pre-trained model's capability on benchmarks. Train/test overlap of the underlying LLM is not the relevant concern." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "This paper evaluates a defense mechanism, not model knowledge. Benchmark contamination of the underlying LLM would affect both defense and no-defense conditions equally." 
250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants in this study." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "Training cost is reported (< 2 minutes, Table 4), but inference cost — the runtime overhead of ICON during agent execution — is not reported. No latency, tokens consumed, or cost-per-example figures are provided for the defense at inference time." 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "Table 4 states training takes 'less than 2 mins' on 255 samples with ~31k parameters, but does not specify hardware (GPU type), nor the total compute budget for running evaluations across 3 models × 3 attack types × 3+ benchmarks." 
299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "Table 3 reports standard deviations over 5 cross-validation runs for OOD evaluation, but the main results in Table 2 (which supports the core claims) do not report multi-seed results." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": false, 310 "justification": "Table 3 notes '5 cross validation' but Table 2 does not state how many runs produced its results. The main claims rest on Table 2." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search budget is reported. The paper introduces τ and γ as key hyperparameters but does not describe how their values were selected or how many configurations were tried." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": false, 320 "justification": "No description of how the final configuration (values of τ, γ, number of selected layers K) was chosen. Only final results are shown." 321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "The paper compares ICON against 5 baselines across 3 models, 3 attack types, and multiple metrics without any correction for multiple comparisons." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors implemented and evaluated their own system (ICON) against baselines, likely using their own implementations of at least some baselines (template-based methods). The self-comparison bias is not acknowledged." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "Table 4 compares training costs but does not analyze performance as a function of compute budget. 
The comparison between ICON (~31k params, 2 min training) and Qwen3Guard (8B params, 10+ hours) does not normalize performance by compute." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "No discussion of whether the benchmarks (InjectAgent, AgentDojo, TrojanTools) actually measure real-world defense effectiveness. The paper does not question whether ASR and UA metrics in these controlled settings translate to production security." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": true, 344 "answer": true, 345 "justification": "All experiments use the same ReAct framework (Section 5.1), ensuring the scaffold is held constant across defense comparisons. The independent variable is the defense method, not the scaffold." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the LLMs were trained on data that includes the benchmark attack patterns or tasks. The TrojanTools dataset (used for training the prober) is 'under review' and may overlap with the models' training data." 353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the evaluation setup leaks information — e.g., whether the attack payloads in the test benchmarks are structurally similar to the training data synthesis patterns." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "While the OOD evaluation (train on TrojanTools, test on InjectAgent/AgentDojo) addresses distribution shift, the paper does not discuss potential structural similarities or shared attack templates between the training and test benchmarks." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No concrete leakage detection or prevention method is used. 
The OOD evaluation design partially mitigates leakage concerns but is not framed or analyzed as a leakage prevention measure." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "ICON achieves a competitive 0.4% average attack success rate, matching commercial-grade detectors.", 374 "evidence": "Table 2 shows average ASR across 3 attack types and 3 LLM backbones: 0.4% for QWEN, 0.3% for LLAMA, 0.3% for MISTRAL, averaging ~0.3%. The LLM-Detector (Qwen3Guard) achieves 0.2% average. The 0.4% figure cited in the abstract appears to be an upper bound rather than an exact average.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "ICON yields over 50% task utility gain compared to baselines.", 379 "evidence": "The abstract claims 'over 50% task utility gain.' Table 3 shows Utility Recovery Rate (URR) of 62.3-69.6% for QWEN and 47.6-55.9% for LLAMA. However, the actual UA improvements in Table 2 are ~10 percentage points over the next-best defense. The '50%' figure conflates utility recovery rate with utility improvement.", 380 "supported": "weak" 381 }, 382 { 383 "claim": "ICON generalizes out-of-distribution, maintaining >97% detection on AgentDojo when trained on TrojanTools.", 384 "evidence": "Table 3 shows ADR of 98.0% ± 1.2 for QWEN and 97.3% ± 1.8 for LLAMA on AgentDojo, trained exclusively on TrojanTools. InjectAgent detection is lower at 80.1-85.4%.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "ICON trains in less than two minutes on only 255 samples with ~31k parameters.", 389 "evidence": "Table 4 states training cost is 'Less than 2 mins' on dataset size 255 with 31,553 parameters. 
No hardware specification is provided, making the timing claim unverifiable across different setups.", 390 "supported": "moderate" 391 }, 392 { 393 "claim": "ICON extends effectively to multimodal agents, achieving 42% average utility recovery.", 394 "evidence": "Table 5 shows ICON on three MLLMs: QWEN3VL (ASR 3.8%, UA 44.9%), INTERNVL (ASR 2.6%, UA 47.8%), MINICPM (ASR 2.4%, UA 48.8%). Average UA is 47.2% vs 28.8% for the LLM-Detector (Gemini).", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "IPI attacks manifest as intrinsic 'attention collapse' patterns detectable in latent space.", 399 "evidence": "Section 4.5 introduces Focus Intensity Score to quantify attention anomalies. Figure 3 visualizes attention maps showing concentrated focus on malicious payload positions under attack versus dispersed attention after mitigation. The mechanism is demonstrated on Head 21 of Layers 22-25 in QWEN-3-8B.", 400 "supported": "moderate" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "No code released", 406 "detail": "The paper provides no source code, making it impossible to verify the LSTP and MR implementations, reproduce the attention analysis, or confirm the reported efficiency claims." 407 }, 408 { 409 "flag": "No limitations section", 410 "detail": "The paper has no limitations or threats-to-validity section. The Impact Statement actively dismisses the need for specific concerns: 'none of which we feel must be specifically highlighted here.' This is unusual for a security paper." 411 }, 412 { 413 "flag": "No variance on main results", 414 "detail": "Table 2, which supports the core claims, reports only point estimates with no error bars, standard deviations, or number of runs. Only the secondary OOD evaluation (Table 3) includes uncertainty." 
415 }, 416 { 417 "flag": "Misleading utility gain claim", 418 "detail": "The abstract claims 'over 50% task utility gain' but this conflates Utility Recovery Rate (a ratio of recovered utility) with actual utility improvement. The actual UA improvements in Table 2 are approximately 10 percentage points, not 50%." 419 }, 420 { 421 "flag": "All results positive — no negative findings", 422 "detail": "Every experiment shows ICON outperforming or matching baselines. No failure modes, attack types that evade detection, or scenarios where utility recovery fails are reported." 423 }, 424 { 425 "flag": "Undisclosed industry conflicts", 426 "detail": "Authors from Ant Group and Alibaba — companies that deploy LLM agents and would benefit from lightweight defense mechanisms — contribute to this paper. No funding source or competing interests are disclosed." 427 }, 428 { 429 "flag": "Only 8B parameter models tested", 430 "detail": "All experiments use 8B-parameter open-weight models. Whether the attention collapse signature generalizes to larger models (70B+) or models with different architectures (e.g., Mixture of Experts) is unknown." 431 }, 432 { 433 "flag": "Missing key hyperparameter values", 434 "detail": "The steering hyperparameters τ (scope) and γ (intensity) are central to the method but their specific values used in experiments are not reported, making reproduction impossible even with the code." 435 } 436 ], 437 "cited_papers": [ 438 { 439 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 440 "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic", "L. Beurer-Kellner", "M. Fischer", "F. Tramèr"], 441 "year": 2024, 442 "relevance": "Primary evaluation benchmark for LLM agent prompt injection defense, directly relevant to agent security evaluation methodology." 443 }, 444 { 445 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 446 "authors": ["Q. 
Zhan", "Z. Liang", "Z. Ying", "D. Kang"], 447 "year": 2024, 448 "relevance": "Key benchmark for evaluating indirect prompt injection attacks in LLM agents with tool integration." 449 }, 450 { 451 "title": "Adaptive attacks break defenses against indirect prompt injection attacks on LLM agents", 452 "authors": ["Q. Zhan", "R. Fang", "H. S. Panchal", "D. Kang"], 453 "year": 2025, 454 "relevance": "Demonstrates that adaptive attacks can bypass existing defenses, motivating the need for latent-space approaches like ICON." 455 }, 456 { 457 "title": "MELON: Provable defense against indirect prompt injection attacks in AI agents", 458 "authors": ["K. Zhu", "X. Yang", "J. Wang", "W. Guo", "W. Y. Wang"], 459 "year": 2025, 460 "arxiv_id": "2502.05174", 461 "relevance": "State-of-the-art tool-filtering defense baseline, directly compared against ICON." 462 }, 463 { 464 "title": "SecAlign: Defending against prompt injection with preference optimization", 465 "authors": ["S. Chen", "A. Zharmagambetov", "S. Mahloujifar", "K. Chaudhuri", "D. Wagner", "C. Guo"], 466 "year": 2025, 467 "relevance": "Fine-tuning-based defense approach that represents the safety fine-tuning paradigm ICON aims to improve upon." 468 }, 469 { 470 "title": "AutoHijacker: Automatic indirect prompt injection against black-box LLM agents", 471 "authors": ["X. Liu", "S. Jha", "P. McDaniel", "B. Li", "C. Xiao"], 472 "year": 2025, 473 "relevance": "Optimization-based attack generation technique relevant to understanding adaptive prompt injection threats." 474 }, 475 { 476 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 477 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], 478 "year": 2023, 479 "relevance": "Foundational work on indirect prompt injection attacks against real-world LLM applications." 
480 }, 481 { 482 "title": "Refusal in language models is mediated by a single direction", 483 "authors": ["A. Arditi", "O. Obeso", "A. Syed", "D. Paleka", "N. Panickssery", "W. Gurnee", "N. Nanda"], 484 "year": 2024, 485 "relevance": "Demonstrates that refusal behavior in LLMs can be manipulated via single directions in activation space — an approach that ICON's rectifier goes beyond." 486 }, 487 { 488 "title": "Attention Tracker: Detecting prompt injection attacks in LLMs", 489 "authors": ["K.-H. Hung", "C.-Y. Ko", "A. Rawat", "I.-H. Chung", "W. H. Hsu", "P.-Y. Chen"], 490 "year": 2025, 491 "relevance": "Closely related attention-based detection method for prompt injection that ICON builds upon and extends." 492 }, 493 { 494 "title": "ReAct: Synergizing reasoning and acting in language models", 495 "authors": ["S. Yao", "J. Zhao", "D. Yu", "N. Du", "I. Shafran", "K. R. Narasimhan", "Y. Cao"], 496 "year": 2022, 497 "relevance": "Core agentic framework used for all ICON experiments; foundational work on LLM agent architecture." 498 }, 499 { 500 "title": "Qwen3Guard technical report", 501 "authors": ["H. Zhao", "C. Yuan", "F. Huang", "X. Hu", "Y. Zhang"], 502 "year": 2025, 503 "arxiv_id": "2510.14276", 504 "relevance": "Fine-tuned safety guardrail model used as the LLM-Detector baseline in ICON's evaluation." 505 }, 506 { 507 "title": "CyberSecEval 3: Advancing the evaluation of cybersecurity risks and capabilities in large language models", 508 "authors": ["S. Wan", "C. Nikolaidis", "D. Song", "D. Molnar"], 509 "year": 2024, 510 "arxiv_id": "2408.01605", 511 "relevance": "Source of the Visual Prompt Injection Benchmarks used for multimodal evaluation." 512 }, 513 { 514 "title": "Dissecting adversarial robustness of multimodal LM agents", 515 "authors": ["C. H. Wu", "R. R. Shah", "J. Y. Koh", "R. Salakhutdinov", "D. Fried", "A. 
Raghunathan"], 516 "year": 2025, 517 "relevance": "Analyzes adversarial robustness of multimodal LLM agents, directly relevant to ICON's multimodal evaluation." 518 } 519 ], 520 "engagement_factors": { 521 "practical_relevance": { 522 "score": 2, 523 "justification": "The defense mechanism could be applied to LLM agents in production, but requires white-box access to model internals (attention weights), limiting applicability to self-hosted models." 524 }, 525 "surprise_contrarian": { 526 "score": 1, 527 "justification": "The attention collapse insight is novel but builds incrementally on known work in model interpretability and attention analysis for safety." 528 }, 529 "fear_safety": { 530 "score": 2, 531 "justification": "Addresses the real and growing threat of indirect prompt injection attacks on LLM agents, with concrete attack demonstrations." 532 }, 533 "drama_conflict": { 534 "score": 0, 535 "justification": "No controversial claims or direct challenges to specific companies or products." 536 }, 537 "demo_ability": { 538 "score": 0, 539 "justification": "No code, demo, or tool is released — the method cannot be tried." 540 }, 541 "brand_recognition": { 542 "score": 1, 543 "justification": "Peking University and NTU are well-regarded; Ant Group and Alibaba are recognizable but not the primary AI research labs driving public attention." 544 } 545 } 546 }