scan.json (27736B)
1 { 2 "scan_version": 2, 3 "active_modules": ["experimental_rigor", "data_leakage"], 4 "paper": { 5 "title": "Prompt Injection Attack to Tool Selection in LLM Agents", 6 "authors": ["Jiawen Shi", "Zenghui Yuan", "Guiyao Tie", "Pan Zhou", "Neil Zhenqiang Gong", "Lichao Sun"], 7 "year": 2025, 8 "venue": "NDSS 2026", 9 "arxiv_id": "2504.19793" 10 }, 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "ToolHijacker achieves high attack success rates (up to 99.6%) against tool selection in LLM agents by injecting a single malicious tool document, even in no-box scenarios where the attacker has no access to the target system. The attack transfers across architecturally different LLMs (e.g., 96.7% ASR from Llama-3.3-70B shadow to GPT-4o target). Existing defenses — both prevention-based (StruQ, SecAlign) and detection-based (known-answer, DataSentinel, PPL) — are insufficient, with gradient-free attacks achieving 99.6% ASR even under StruQ.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "The paper states 'We will release code and data under restricted access — interested parties must request permission' in the Ethics section. This is not a public release." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper uses two public benchmark datasets: MetaTool and ToolBench, both publicly available. The 10 target tasks and 1,000 task descriptions per dataset are described in detail in Appendix C (Figures 14-15)." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment specifications, requirements files, or dependency details are provided in the paper." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided. The paper describes methods but does not include scripts or commands to replicate experiments." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results are reported as point estimates (e.g., '96.7% ASR') without confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper makes comparative claims (e.g., 'outperforms baselines') based solely on comparing percentages without any statistical significance tests." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Effect sizes are reported in context: e.g., 'gradient-free attack achieves a higher ASR by 4.5% when targeting GPT-4o' and 'Llama-3-8B increases the ASR by 15.12% over Llama-2-7B' (Section IV-C). Absolute percentages with baselines provide magnitude context." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper uses 100 target task descriptions per task and 10 tasks per dataset but provides no justification for why these numbers are adequate." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single experimental runs averaged across tasks." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Seven baselines are compared: five manual attacks (naive, escape characters, context ignore, fake completion, combined) and two automated attacks (JudgeDeceiver, PoisonedRAG). Results in Table III." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines include recent automated attacks: JudgeDeceiver (CCS 2024) and PoisonedRAG (2024). Also includes StruQ and SecAlign as defense baselines. These represent current state of the art." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Extensive ablation studies in Section IV-C: impact of R vs S components (Table V), impact of k, k', shadow task descriptions (Figure 6), shadow LLM choice (Tables VI-VII), similarity metrics (Table VIII), loss terms (Table XV), and hyperparameters α and β (Figure 9)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Four metrics used: ACC (accuracy without attack), ASR (attack success rate), HR (hit rate for retrieval), and AHR (attack hit rate). Defined in Section IV-A5." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": true, 87 "justification": "A user study with 6 participants was conducted to evaluate whether humans can detect malicious tool documents (Table XVII in Appendix B)." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "Shadow task descriptions Q' used for optimization are explicitly disjoint from target task descriptions Q (Q ∩ Q' = ∅, Section II-B). The 100 target task descriptions per task are distinct from the 5 shadow descriptions used for optimization." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results broken down per task (Figure 3 shows 10 tasks), per LLM (Table I shows 8 LLMs), per retriever (Table IV shows 4 retrievers), and per dataset (MetaTool vs ToolBench throughout)." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "The paper discusses cases where the attack is less effective: Claude-3-Haiku is 'the least sensitive' (Section IV-B), gradient-based attack with Llama-2-7B shadow drops to 34% ASR on Llama-3-70B (Table VII), and low k values reduce effectiveness (Figure 5)." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "Several negative findings reported: defenses are insufficient (Section V), gradient-based attack has lower transferability with weak shadow LLMs (Table VII), small k' leads to declining ASR (Figure 5), and removing any loss term significantly hurts performance (Table XV)." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims about high ASR, outperforming baselines, and defenses being insufficient are all supported by Tables I, III, and IX-X respectively." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "Causal claims are primarily from ablation studies (e.g., 'removing L3 reduces ASR from 95% to 5%' in Table XV) which use controlled single-variable manipulation. The two-phase design isolates R's effect on retrieval and S's effect on selection (Table V)." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title says 'Prompt Injection Attack to Tool Selection in LLM Agents' broadly, but the evaluation is limited to a specific two-step retrieval+selection pipeline. Many LLM agents use different tool selection mechanisms (e.g., function calling APIs) that are not tested." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "The paper discusses why attacks transfer ('shared alignment objectives and training paradigms make LLMs inherently vulnerable' and 'LLM homogenization'), why gradient-free outperforms gradient-based on closed-source models, and why Claude-3-Haiku is more resistant (Section IV-B)." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper clearly distinguishes what it measures (ASR = selecting the malicious tool name) from the broader security implication (executing harmful tools). The metrics are precisely defined (Section IV-A5) and the paper explicitly discusses the gap between tool selection manipulation and actual harm potential." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Specific model versions are listed: Llama-2-7B-chat, Llama-3-8B-Instruct, Llama-3-70B-Instruct, Llama-3.3-70B-Instruct, Claude-3-Haiku, Claude-3.5-Sonnet, GPT-3.5, GPT-4o (Section IV-A3). However, no snapshot dates for closed-source models." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full prompt texts are provided: the selection prompt (Figure 2), shadow task description generation prompt (Figure 10), shadow tool document generation prompt (Figure 11), attacker LLM system instruction (Figure 13), and initial R and S (Figure 12)." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Hyperparameters reported in Section IV-A4: m'=5, k'=5, Titer=10, B=2, W=10 for gradient-free; α=2.0, β=0.1, R iterations=3, S iterations=400 for gradient-based." 152 }, 153 "scaffolding_described": { 154 "applies": true, 155 "answer": true, 156 "justification": "The two-step tool selection pipeline (retrieval + selection) is formally described in Section II-A with mathematical formulations. The shadow framework construction is detailed in Section III." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "For ToolBench: 'After removing duplicate tools and empty descriptions, the tool library contains 9,650 benign tool documents' from 16,464 originals (Section IV-A1). Task description generation process described with templates." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": false, 168 "justification": "There is no dedicated limitations section. The Conclusion mentions future work directions but does not discuss limitations of the current study." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "No threats to validity are discussed. The paper does not address potential confounds or weaknesses in the experimental design." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what the results do not show. It doesn't acknowledge that the attack is only tested on one specific tool selection architecture (retrieval + selection) and may not apply to other agent frameworks." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "Raw experimental results (individual trial outcomes, per-query results) are not available. Only aggregated percentages are reported." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Data collection is described: MetaTool (21,127 instances, 199 tools from OpenAI Plugins) and ToolBench (126,486 samples, 16,464 tools from RapidAPI). Task descriptions generated via LLM + human evaluation (Section IV-A1)." 191 }, 192 "recruitment_methods_described": { 193 "applies": true, 194 "answer": false, 195 "justification": "The user study mentions '6 participants' but provides no details on how they were recruited, their background, or potential selection bias." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline from dataset selection through shadow framework construction, optimization, and evaluation is documented in Sections III and IV. ToolBench filtering from 16,464 to 9,650 tools is explained." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding information is disclosed in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: Huazhong University of Science and Technology, Duke University, and Lehigh University." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interests statement is included in the paper." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": false, 228 "answer": false, 229 "justification": "This paper tests prompt injection attacks against tool selection, not model knowledge on benchmarks. The LLMs are used as tool selectors, not evaluated for their pre-trained knowledge." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": false, 233 "answer": false, 234 "justification": "Not applicable — the paper tests attack effectiveness on tool selection, not model capability on benchmark tasks." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": false, 238 "answer": false, 239 "justification": "Not applicable — contamination of benchmark solutions in training data is irrelevant to this attack evaluation." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": true, 245 "answer": false, 246 "justification": "The user study with 6 participants evaluating malicious tool documents is not pre-registered." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": true, 250 "answer": false, 251 "justification": "No IRB or ethics approval is mentioned for the user study. The Ethics section discusses responsible disclosure and informed consent but not IRB review." 252 }, 253 "demographics_reported": { 254 "applies": true, 255 "answer": false, 256 "justification": "The 6 participants are not characterized — no information about their expertise, background, or demographics." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": true, 260 "answer": false, 261 "justification": "No inclusion or exclusion criteria for the 6 participants are stated." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "This is not a between-subjects experiment requiring randomization — all participants evaluated the same tool documents." 267 }, 268 "blinding_described": { 269 "applies": true, 270 "answer": false, 271 "justification": "No blinding details are provided for the user study. It's unclear whether participants knew the ratio of malicious to benign tools." 272 }, 273 "attrition_reported": { 274 "applies": true, 275 "answer": false, 276 "justification": "No information about whether all 6 participants completed the study or if any were excluded." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "Appendix B reports attack costs: gradient-free R requires 1 LLM query and S requires ~18 LLM queries; gradient-based R requires ~1 GPU-minute and S requires ~8 GPU-hours on one NVIDIA A800 GPU." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": true, 288 "justification": "Compute budget stated in Appendix B: gradient-based requires ~8 GPU-hours on one NVIDIA A800 GPU for S optimization. Gradient-free requires ~19 LLM queries total." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is not explicitly stated. It is unclear whether results are from single or multiple optimization/evaluation runs." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "While hyperparameter sensitivity is explored (α, β in Figure 9), no search budget is reported for finding the default values (α=2.0, β=0.1)." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": true, 310 "justification": "The paper shows ablation results across multiple α and β values (Figure 9) and reports that ASR remains above 95% for a range of values, justifying the chosen defaults." 311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors implement both their attack and the baselines (JudgeDeceiver, PoisonedRAG) without acknowledging potential bias in re-implementing competitors' methods." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "No comparison of performance at matched compute budgets between ToolHijacker and baselines. The gradient-based method uses 8 GPU-hours while baselines may use significantly less." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "The paper does not discuss whether MetaTool and ToolBench adequately represent real-world tool selection scenarios. No discussion of construct validity." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "The paper evaluates a specific tool selection pipeline, not comparing models across different scaffolds. The pipeline IS the thing being attacked." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "This paper tests attack effectiveness, not model knowledge. Temporal leakage of benchmark solutions is not relevant to measuring whether an LLM selects a malicious tool." 343 }, 344 "feature_leakage_addressed": { 345 "applies": false, 346 "answer": false, 347 "justification": "Not applicable — the evaluation measures attack success on tool selection, not model capability." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": true, 352 "justification": "The paper explicitly ensures independence between shadow and target descriptions (Q ∩ Q' = ∅) and evaluates on target tasks not used during optimization. Table XII shows 0% ASR on non-target tasks." 353 }, 354 "leakage_detection_method": { 355 "applies": false, 356 "answer": false, 357 "justification": "Not applicable — the paper is not evaluating pre-trained model knowledge on benchmarks." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "ToolHijacker achieves high attack success rates across different LLMs, with gradient-free achieving 96.7% ASR on GPT-4o (MetaTool) using Llama-3.3-70B as shadow LLM.", 364 "evidence": "Table I shows ASRs across 8 LLMs and 2 datasets. GPT-4o gradient-free ASR = 96.7% on MetaTool, 88.2% on ToolBench.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "ToolHijacker significantly outperforms existing prompt injection attacks when applied to tool selection.", 369 "evidence": "Table III: gradient-free achieves 96.7% vs best baseline PoisonedRAG at 39.3% on MetaTool with GPT-4o. All 7 baselines are far below ToolHijacker.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Prevention-based defenses (StruQ, SecAlign) fail to defend against ToolHijacker.", 374 "evidence": "Table IX: gradient-free achieves 99.6% ASR under StruQ on MetaTool. SecAlign reduces ASR to 97.5% on MetaTool. Both defenses are insufficient.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Detection-based defenses are insufficient, with known-answer detection and DataSentinel having FNRs exceeding 90%.", 379 "evidence": "Table X: known-answer detection has 100% FNR for both attacks. DataSentinel has 100% FNR for gradient-free, 90% for gradient-based. PPL detects some gradient-based but misses 90%.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "The attack is targeted with minimal impact on non-target tasks.", 384 "evidence": "Table XII: gradient-free achieves 0% ASR and 0.22% AHR on non-target tasks; gradient-based achieves 0.11% ASR and 4% AHR.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Humans struggle to detect malicious tool documents generated by ToolHijacker.", 389 "evidence": "Table XVII: 6 participants failed to detect ≥71% of malicious tools (FNR 71-100%) while incorrectly flagging 5.6-30.35% of benign tools.", 390 "supported": "weak" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "Tiny user study sample", 396 "detail": "The human detection study uses only 6 participants with no reported demographics, recruitment criteria, or statistical analysis. This is insufficient to draw conclusions about human detection ability." 397 }, 398 { 399 "flag": "No variance or uncertainty reporting", 400 "detail": "All results are point estimates with no error bars, confidence intervals, or variance across runs. Given the stochastic nature of LLM inference, results could vary across runs." 401 }, 402 { 403 "flag": "No limitations section", 404 "detail": "The paper has no dedicated limitations section and does not discuss scope boundaries. The attack is only tested on one specific tool selection architecture (retrieval + selection with dual encoders) but is presented broadly." 405 }, 406 { 407 "flag": "Self-comparison bias in baseline implementations", 408 "detail": "The authors implement competitor baselines (JudgeDeceiver, PoisonedRAG) themselves without acknowledging potential bias. These methods were designed for different problem settings and may be disadvantaged." 409 }, 410 { 411 "flag": "Closed-source model versions unspecified", 412 "detail": "GPT-3.5, GPT-4o, Claude-3-Haiku, Claude-3.5-Sonnet are listed without snapshot dates or API versions. Model behavior can change across versions, affecting reproducibility." 413 } 414 ], 415 "cited_papers": [ 416 { 417 "title": "SWE-agent: Agent-computer interfaces enable automated software engineering", 418 "authors": ["J. Yang", "C. E. Jimenez", "A. Wettig"], 419 "year": 2024, 420 "arxiv_id": "2405.15793", 421 "relevance": "Key agent framework for code-level software engineering tasks." 422 }, 423 { 424 "title": "MetaGPT: Meta programming for multi-agent collaborative framework", 425 "authors": ["S. Hong", "X. Zheng", "J. Chen"], 426 "year": 2023, 427 "arxiv_id": "2308.00352", 428 "relevance": "Multi-agent collaborative framework for software development." 429 }, 430 { 431 "title": "Gorilla: Large language model connected with massive APIs", 432 "authors": ["S. G. Patil", "T. Zhang", "X. Wang"], 433 "year": 2023, 434 "arxiv_id": "2305.15334", 435 "relevance": "LLM agent connected with massive APIs for tool use." 436 }, 437 { 438 "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs", 439 "authors": ["Y. Qin", "S. Liang", "Y. Ye"], 440 "year": 2023, 441 "arxiv_id": "2307.16789", 442 "relevance": "Major benchmark for LLM tool use with 16K+ APIs, used as evaluation dataset in this paper." 443 }, 444 { 445 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 446 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra"], 447 "year": 2023, 448 "relevance": "Foundational work on indirect prompt injection in LLM-integrated applications." 449 }, 450 { 451 "title": "PoisonedRAG: Knowledge poisoning attacks to retrieval-augmented generation of large language models", 452 "authors": ["W. Zou", "R. Geng", "B. Wang", "J. Jia"], 453 "year": 2024, 454 "arxiv_id": "2402.07867", 455 "relevance": "Key baseline: adversarial text injection into RAG knowledge bases." 456 }, 457 { 458 "title": "Optimization-based prompt injection attack to LLM-as-a-Judge", 459 "authors": ["J. Shi", "Z. Yuan", "Y. Liu"], 460 "year": 2024, 461 "relevance": "JudgeDeceiver - gradient-optimized prompt injection baseline for LLM judging." 462 }, 463 { 464 "title": "StruQ: Defending against prompt injection with structured queries", 465 "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"], 466 "year": 2024, 467 "arxiv_id": "2402.06363", 468 "relevance": "Prevention-based defense against prompt injection evaluated in this paper." 469 }, 470 { 471 "title": "Aligning LLMs to be robust against prompt injection", 472 "authors": ["S. Chen", "A. Zharmagambetov", "S. Mahloujifar"], 473 "year": 2024, 474 "arxiv_id": "2410.05451", 475 "relevance": "SecAlign defense against prompt injection via preference optimization." 476 }, 477 { 478 "title": "DataSentinel: A game-theoretic detection of prompt injection attacks", 479 "authors": ["Y. Liu", "Y. Jia", "J. Jia", "D. Song", "N. Z. Gong"], 480 "year": 2025, 481 "relevance": "State-of-the-art detection-based defense against prompt injection." 482 }, 483 { 484 "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents", 485 "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic"], 486 "year": 2024, 487 "relevance": "Comprehensive evaluation framework for prompt injection in LLM agents." 488 }, 489 { 490 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 491 "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"], 492 "year": 2024, 493 "arxiv_id": "2403.02691", 494 "relevance": "Benchmark for indirect prompt injection in tool-using LLM agents." 495 }, 496 { 497 "title": "From allies to adversaries: Manipulating LLM tool-calling through adversarial injection", 498 "authors": ["H. Wang", "R. Zhang", "J. Wang"], 499 "year": 2024, 500 "arxiv_id": "2412.10198", 501 "relevance": "Adversarial manipulation of LLM tool calling mechanisms." 502 }, 503 { 504 "title": "A critical evaluation of defenses against prompt injection attacks", 505 "authors": ["Y. Jia", "Z. Shao", "Y. Liu"], 506 "year": 2025, 507 "arxiv_id": "2505.18333", 508 "relevance": "Critical evaluation showing defenses sacrifice general capabilities and remain vulnerable to adaptive attacks." 509 }, 510 { 511 "title": "Defeating prompt injections by design", 512 "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan"], 513 "year": 2025, 514 "arxiv_id": "2503.18813", 515 "relevance": "Security policy enforcement approach to prevent prompt injection in LLM agents." 516 } 517 ] 518 }