scan.json (35365B)
1 { 2 "paper": { 3 "title": "Simple Prompt Injection Attacks Can Leak Personal Data Observed by LLM Agents During Task Execution", 4 "authors": [ 5 "Meysam Alizadeh", 6 "Zeynab Samei", 7 "Daria Stetsenko", 8 "Fabrizio Gilardi" 9 ], 10 "year": 2025, 11 "venue": "arXiv (preprint, under review)", 12 "arxiv_id": "2506.01055", 13 "doi": "10.48550/arXiv.2506.01055" 14 }, 15 "scan_version": 3, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "Prompt injection attacks targeting data exfiltration cause a 15–50% absolute utility drop for most of the 6 LLMs tested on AgentDojo's banking suite (GPT-3.5 Turbo is the exception, performing slightly better under attack), with average attack success rates around 20%. Most LLMs resist leaking passwords due to safety alignment but remain vulnerable to disclosing other personal data; password leakage increases when the password is requested alongside additional sensitive fields. Some defenses (prompt injection detector, prompt sandwiching, i.e., repeating the user prompt) reduce ASR to zero on the original 16 tasks, but no defense fully prevents leakage across an expanded 48-task evaluation.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper states 'All results are based on AgentDojo which has a Github repository' and 'We will release our new synthetic dataset upon acceptance of the paper.' The extended attack code and synthetic dataset are not yet released. A promise of future release counts as NO." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": false, 29 "justification": "The underlying AgentDojo framework is public, but the paper's novel contributions—32 additional synthetic banking tasks, expanded injection tasks, and generalized injection templates—are not yet released. 'We will release our new synthetic dataset upon acceptance of the paper' (NeurIPS checklist, Q5)." 
30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No requirements.txt, Dockerfile, or environment specification is provided. The paper mentions using model APIs but provides no dependency or environment details beyond model names." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions are provided. The paper describes methods at a conceptual level and references AgentDojo's repository, but does not include commands, scripts, or a README for reproducing the experiments." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports 95% confidence intervals in Tables 6 and 7, computed using statsmodels.stats.proportion.proportion_confint (NeurIPS checklist, Q7). Figures 4 and 7 reference these tables for CI values." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "No formal significance tests (t-tests, chi-squared, bootstrap tests) are reported. The paper makes comparative claims (e.g., 'the Important message variant outperforming prior approaches') based on numerical comparison with confidence intervals but no explicit hypothesis tests." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Effect sizes are reported as absolute percentage differences with baseline context throughout: '15%–50% drop in utility' (Section 1), 'additional 2.5% boost' for Max attack (Section 4.3), 'accurate name knowledge increases attack success by 4.1%' (Table 3)." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification is given for the sample sizes. 
The original 16 AgentDojo tasks are inherited from the benchmark, and the 32 additional synthetic tasks are added without power analysis or justification for why 48 total is sufficient." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": false, 66 "justification": "No variance or standard deviation across experimental runs is reported. The 95% CIs in Tables 6-7 are binomial proportion confidence intervals (from proportion_confint), not measures of run-to-run variance. It appears each scenario was run once per model." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "Multiple baselines are included: 4 attack types from prior work (Important message, Ignore previous instruction, InjecAgent, TODO) compared in Figure 5 and Table 1. Defense evaluation compares no-defense baseline against 4 defense strategies (Section 4.2)." 74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "Both models and defense methods are contemporary. Models include GPT-4o, Claude 3.5 Sonnet, and Llama-4 (2024-2025 era). Defenses include recent methods from AgentDojo (2024), ProtectAI's BERT classifier, and TaskShield." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 4.3 presents an ablation study on attack components: comparing 4 attack phrasings, evaluating the effect of attacker knowledge (correct vs. incorrect user/model names), and an adaptive Max strategy. Table 3 quantifies each factor's contribution." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Three evaluation metrics are used (Section 3.4): Benign Utility, Utility Under Attack, and Targeted Attack Success Rate (ASR). These are consistently reported throughout." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": false, 93 "justification": "No human evaluation is included. 
All evaluation is automated through AgentDojo's framework, which programmatically checks whether data was leaked and whether the user task was completed. No human review of model outputs is described." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "No explicit train/test separation is described. The paper notes that 'early experiments revealed that LLMs respond inconsistently depending on the nature of the requested data' (Section 4.1), suggesting iterative design based on observed results without a held-out evaluation set." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Extensive per-category breakdowns are provided: per-model results (Figures 2-3), per-injection-task results (Figure 3), per-defense results (Figure 4), per-task-category results (Figures 6, 14), and per-sensitivity-level results (Figures 13, 16-18)." 104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Failure cases are discussed throughout. Table 2 shows examples of LLM refusals to disclose passwords. The paper discusses when attacks fail (Only Password injection has 0% ASR on most models), when defenses fail (no defense achieves 0% on 48 tasks), and model-specific vulnerabilities." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "Several negative results are reported: the 'Only Password' injection has 0% ASR on most models (Figure 3b); GPT-3.5 Turbo unexpectedly performs better under attack (Figure 2a); defenses that achieved 0% ASR on 16 tasks fail on 48 tasks (Section 4.4); and the discrepancy with AgentDojo's original findings about defense utility improvement." 
114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims are supported: '15%–50% drop in utility' matches Figure 2a; 'average ASR around 20%' matches Figure 2b; 'some defenses reduce ASR to zero' matches Figure 4b; 'average ASR is around 15%' for 48 tasks matches Figure 6b. All quantitative claims in the abstract have corresponding results." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper makes causal claims about prompt injection causing data leakage. The experimental design uses controlled injection of specific prompts into specific tasks, measuring the effect on ASR. The ablation study (Section 4.3) isolates individual attack components. This controlled-manipulation design is adequate for the causal claims made." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": true, 130 "justification": "Claims are generally bounded to the tested setting. The title specifies 'LLM Agents During Task Execution,' and results are reported per-model and per-task-category. The limitations acknowledge the work 'does not capture the full range of adversarial scenarios' and that results are 'sensitive to variations in dataset characteristics and model architectures.'" 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "Alternative explanations for the results are not substantively discussed. The paper attributes password resistance to 'safety alignments' and task-type effects to semantic similarity, but does not systematically consider confounds such as model size, training data composition, RLHF intensity, or prompt formatting effects on the observed ASR differences across models." 
136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper measures data exfiltration directly: ASR checks whether specific personal data was actually leaked via the email tool. The measurement (was data sent to attacker email?) directly matches the claimed outcome (data exfiltration risk). No proxy gap exists." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Models are identified by marketing names only: 'Claude 3.5 Sonnet', 'GPT-3.5 Turbo', 'GPT-4 Turbo', 'GPT-4o', 'Llama-3 (70B)', 'Llama-4 (17B)' (Section 4). No API snapshot dates, version IDs (e.g., gpt-4o-2024-05-13), or access dates are provided. Model behavior changes across versions." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Full system prompts are provided in Appendix A.2: Figure 10 (default prompt), Figure 11 (Claude-specific), Figure 12 (Llama-specific). Injection prompt templates with full text are in Table 1. Synthetic benchmark creation prompts are in Figures 8-9." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": false, 157 "justification": "No hyperparameters are reported for any model API calls. Temperature, top-p, max tokens, and other generation parameters are not mentioned anywhere in the paper." 158 }, 159 "scaffolding_described": { 160 "applies": true, 161 "answer": true, 162 "justification": "The AgentDojo framework is described in Section 3.2, including its environment (tools, state), task structure, and evaluation pipeline. The banking suite's 11 tools and task structure are described. AgentDojo is a public framework with open-source code, enabling full scaffold inspection." 
163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": false, 167 "justification": "The synthetic dataset creation process (Section 3.5) describes a 'multi-step process' using 4 LLMs, with prompts in Figures 8-9. However, the aggregation and curation steps ('These structured outputs were aggregated and curated') lack detail—no filtering criteria, no counts of items generated vs. retained, no inter-annotator agreement on curation decisions." 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "The conclusion contains a substantive limitations paragraph: 'This study has several limitations. While it approximates real-world conditions, it does not capture the full range of adversarial scenarios. Attackers with domain-specific expertise or operating under alternative threat models may exploit vulnerabilities not addressed in this analysis.'" 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": false, 179 "justification": "The limitations are mostly generic rather than specific to this study. 'Does not capture the full range of adversarial scenarios' and 'results are sensitive to variations in dataset characteristics and model architectures' apply to virtually any benchmark study. No specific threats are identified (e.g., AgentDojo's synthetic environment may not reflect real banking agent behavior, different API versions could yield different results)." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "No explicit list of what the results do NOT show. The limitations paragraph mentions general caveats but does not state specific scope boundaries such as: results do not extend to non-English prompts, multi-turn attacks, production deployments, or models beyond those tested. Future work mentions are vague ('other sensitive domains')." 
185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "Raw experimental data (model outputs, tool call traces, per-task results) are not available. Only aggregated results in figures and tables are presented. The synthetic dataset is also not released ('upon acceptance')." 192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Data collection is described: AgentDojo's banking suite provides the base 16 tasks and 11 tools (Section 3.2), synthetic dataset creation uses 4 LLMs with documented prompts (Section 3.5, Figures 8-9), and injection task design is detailed in Tables 1-2 and Figure 15." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. The paper evaluates LLM agents on synthetic tasks. Data sources are AgentDojo (a standard benchmark) and LLM-generated synthetic data." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": false, 206 "justification": "The pipeline from raw LLM-generated outputs to the final 32 synthetic tasks is not fully documented. The paper states outputs were 'aggregated and curated' but does not specify filtering criteria, how many candidate tasks were generated vs. retained, or what curation decisions were made. Table 5 shows the final 48 tasks but the reduction pipeline is opaque." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section listing grants, sponsors, or funding agencies." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: University of Zurich (Alizadeh, Stetsenko, Gilardi) and IPM (Samei). 
The authors are not affiliated with any of the LLM companies whose products are evaluated." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding is disclosed, so independence cannot be assessed. Without a funding statement, this defaults to NO since absence of disclosure is not absence of conflict." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interests statement is present in the paper." 229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": true, 234 "answer": false, 235 "justification": "No training data cutoff dates are stated for any of the 6 models tested. This is relevant because AgentDojo was published in 2024 and could appear in training data for models trained after that date." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": true, 239 "answer": false, 240 "justification": "No discussion of whether AgentDojo's banking tasks, injection templates, or evaluation logic appeared in any model's training data. Given AgentDojo has a public GitHub repository, models trained after mid-2024 could have seen the exact task descriptions and expected behaviors." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": true, 244 "answer": false, 245 "justification": "AgentDojo was published in 2024 (arXiv:2406.13352) with public code and data. Models like GPT-4o and Claude 3.5 Sonnet may have been trained on data including AgentDojo contents. This is not discussed, and could affect both utility and ASR measurements if models learned to handle these specific tasks." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The paper evaluates LLM agents on synthetic benchmark tasks." 
253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants. All data is synthetic and experiments involve only LLM API calls." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants or experimental conditions requiring randomization of human subjects." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants or evaluators requiring blinding." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": true, 289 "justification": "Appendix C provides detailed cost estimates: preliminary evaluation costs ~$10 for GPT models, ~$10 for Claude; defense evaluation ~$10; ablation analysis ~$15; expanded evaluation ~$30 for GPT, ~$30 for Claude. Total costs: ~$127.5 for GPT models, ~$40 for Claude." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": true, 294 "justification": "Appendix C states the total computational budget: approximately $127.5 for all GPT model experiments and $40 for Claude 3.5 Sonnet experiments, covering all task suites including both injection and benign evaluations." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": false, 301 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs per scenario. 
LLM outputs can vary significantly across runs even with identical inputs." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": false, 306 "justification": "The number of experimental runs per scenario is never explicitly stated. The paper does not mention whether each task-injection pair was run once or multiple times." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search is described. Temperature, top-p, and other generation parameters are neither reported nor tuned. The 4 injection variants were designed manually, not through systematic search." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "All attack variants and defense configurations are reported, not just the best. The 'Max' adaptive attack explicitly selects the best per-task attack and is clearly described (Section 4.3). All defense results are shown side-by-side in Figures 4 and 7." 317 }, 318 "multiple_comparison_correction": { 319 "applies": true, 320 "answer": false, 321 "justification": "Many comparisons are made across 6 models × 4 injection types × 4+ defenses × 48 tasks without any multiple comparison correction. No Bonferroni, Holm, or Benjamini-Hochberg correction is applied." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors' own 'Important message' attack outperforms baseline attacks from prior work (Figure 5), but no discussion of self-comparison bias is provided. The authors designed the attack, the evaluation framework extensions, and the synthetic tasks—a setup that could favor their approach." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": false, 330 "answer": false, 331 "justification": "Compute differences across models and defenses are negligible for this type of API-based evaluation. 
The paper is not comparing methods with different computational requirements." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "The paper does not discuss whether AgentDojo's simulated banking environment and synthetic tasks validly represent real-world data exfiltration risk. The gap between a sandboxed evaluation framework with fictional bank data and actual deployed banking agents handling real user data is not examined." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": true, 340 "answer": false, 341 "justification": "Different system prompts are used for different models (Figures 10-12): GPT models get one prompt, Claude gets a different prompt with chain-of-thought tags, and Llama gets a substantially different prompt with XML-style tags. These prompt differences could confound model comparisons but are not discussed as a potential confounder." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "Not discussed. AgentDojo was published in June 2024 and its tasks could appear in training data for models released after that date. No temporal analysis of when models were trained relative to when benchmark data was published." 349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Not discussed. The evaluation setup provides the full system prompt and user task context to the agent, which is realistic, but whether any evaluation artifacts leak answer information is not analyzed." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "Not discussed. The 16 original AgentDojo tasks and 32 synthetic tasks may share structural patterns (all are banking-domain, similar tool-call patterns), which could inflate apparent consistency of results." 
359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference tests, or decontamination procedures are applied to check whether models have seen AgentDojo data." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "LLMs experience a 15–50% drop in absolute utility when under prompt injection attack.", 370 "evidence": "Figure 2a shows benign utility vs. utility under attack across 6 models. Most models fall well below the diagonal, with drops ranging from ~15% (GPT-4o) to ~50% (Claude 3.5 Sonnet). Exception: GPT-3.5 Turbo slightly improves under attack.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Average targeted attack success rate is around 20%, with Llama-4 (17B) at 40% being a notable outlier.", 375 "evidence": "Figure 2b shows ASR vs. benign utility. Most models cluster near 20% ASR. Llama-4 (17B) shows ~40% ASR, substantially higher than other models.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Most LLMs resist leaking passwords due to safety alignment, but remain vulnerable to disclosing other personal data.", 380 "evidence": "Table 2 shows model refusal examples. Figure 3b shows 'Only Password' injection achieves 0% ASR on all models except Llama-4 (17B), while 'No Password' injection achieves up to 93% ASR on Claude 3.5 Sonnet. Figure 13b shows password leakage rates consistently below overall ASR.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Password leakage increases when a password is requested alongside one or two additional personal details.", 385 "evidence": "Figure 13a shows password leakage rates increase from 0% (password-only) to non-zero when combined with other data. 
In 'Password + 2 Sensitive Data', every model except GPT-4 and GPT-3.5 shows some password leakage vulnerability.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Some defense strategies (prompt injection detector, repeat user prompt) can reduce ASR to 0% on the original 16 AgentDojo tasks.", 390 "evidence": "Figure 4b and Table 6 show both prompt injection detector and repeat user prompt achieve 0% ASR. Tool filtering achieves 3.1% ASR. These results are for GPT-4o on 16 tasks.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "No built-in AgentDojo defense fully prevents leakage on the expanded 48-task evaluation.", 395 "evidence": "Figure 7b and Table 7 show all defenses have non-zero ASR on 48 tasks: tool filter 1.0%, PI detector 1.5%, repeat prompt 7.3%, delimiting 10.3%. This contrasts with the 0% ASR achieved by some defenses on only 16 tasks.", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Tasks involving data extraction or authorization workflows exhibit the highest attack success rates.", 400 "evidence": "Figure 14b shows Account Information and Profile & Authentication Management task categories have the highest ASR, while Fund Transfer & Payment and Transactions & Insights show lower susceptibility.", 401 "supported": "moderate" 402 }, 403 { 404 "claim": "The 'Important message' attack outperforms prior injection approaches, and the adaptive Max strategy adds an additional 2.5% improvement.", 405 "evidence": "Figure 5 compares 5 attack types on GPT-4o. Important message achieves the highest single-attack ASR, and Max achieves an additional 2.5% boost. However, the improvement is modest and no significance test is reported.", 406 "supported": "moderate" 407 } 408 ], 409 "red_flags": [ 410 { 411 "flag": "Single-run results without variance", 412 "detail": "Results appear to be from single runs per scenario with no seed sensitivity analysis. 
LLM outputs are stochastic, and the paper's binomial CIs capture sampling uncertainty but not run-to-run variance. This is especially concerning given only 16-48 tasks per evaluation." 413 }, 414 { 415 "flag": "Different prompts per model confound comparisons", 416 "detail": "Three different system prompts are used across model families (Figures 10-12). Claude gets chain-of-thought tags, Llama gets XML-style function-calling instructions, and GPT models get a simpler prompt. These substantial differences in prompting could confound cross-model ASR and utility comparisons, but are not discussed as a potential confounder." 417 }, 418 { 419 "flag": "No hyperparameters reported", 420 "detail": "Temperature, top-p, and other generation parameters are not reported for any model API call. These settings significantly affect output variability and could influence both attack success and defense effectiveness." 421 }, 422 { 423 "flag": "Misleading NeurIPS checklist response on LLM usage", 424 "detail": "Question 16 asks about LLM usage in core methods, and the paper answers NA ('The core method development in this research does not involve LLMs as any important, original, or non-standard components'). However, 4 LLMs were used to generate the synthetic benchmark dataset (Section 3.5), which is a core methodological contribution." 425 }, 426 { 427 "flag": "Benchmark contamination risk unaddressed", 428 "detail": "AgentDojo was published in June 2024 with public code. Models like GPT-4o, Claude 3.5 Sonnet, and Llama-4 may have been trained on data containing AgentDojo's task descriptions and expected behaviors. This could both inflate utility scores (models learned correct behavior) and deflate ASR (models learned to resist known attacks)." 
429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "AgentDojo: A dynamic environment to evaluate attacks and defenses for LLM agents", 434 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"], 435 "year": 2024, 436 "arxiv_id": "2406.13352", 437 "relevance": "Core benchmark framework used in this paper; foundational agentic security evaluation with 97 tasks and 629 security test cases." 438 }, 439 { 440 "title": "Defeating prompt injections by design", 441 "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan"], 442 "year": 2025, 443 "arxiv_id": "2503.18813", 444 "relevance": "CaMeL framework for design-based defenses against prompt injection in LLM agents; identified as future work direction." 445 }, 446 { 447 "title": "SecAlign: Defending against prompt injection with preference optimization", 448 "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"], 449 "year": 2025, 450 "arxiv_id": "2410.05451", 451 "relevance": "Preference-based defense achieving near-zero injection success while preserving utility; relevant to defense evaluation methodology." 452 }, 453 { 454 "title": "StruQ: Defending against prompt injection with structured queries", 455 "authors": ["Siyuan Chen", "Peter Yong Zhong", "Ruiqi Wang"], 456 "year": 2025, 457 "arxiv_id": "2502.08966", 458 "relevance": "Structure-based defense separating control and data channels, cutting injection success to under 2%." 459 }, 460 { 461 "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents", 462 "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"], 463 "year": 2024, 464 "arxiv_id": "2403.02691", 465 "relevance": "Benchmark for indirect prompt injection in tool-augmented LLM agents; provides attack templates used as baselines." 
466 }, 467 { 468 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 469 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], 470 "year": 2023, 471 "relevance": "Foundational work demonstrating indirect prompt injection attacks on LLM-integrated applications, including data exfiltration." 472 }, 473 { 474 "title": "AI agents that matter", 475 "authors": ["Sayash Kapoor", "Benedikt Stroebl", "Zachary S Siegel", "Nitya Nadgir", "Arvind Narayanan"], 476 "year": 2024, 477 "arxiv_id": "2407.01502", 478 "relevance": "Critique of AI agent evaluation practices; relevant to methodological quality in agentic AI research." 479 }, 480 { 481 "title": "Jatmo: Task-specific fine-tuning for robust prompt injection defense", 482 "authors": ["Jonathan Piet", "Yicheng Zhang", "Yang Liu"], 483 "year": 2023, 484 "arxiv_id": "2309.13756", 485 "relevance": "Defense approach using synthetic teacher-generated data to reduce injection rates from 87% to below 0.5%." 486 }, 487 { 488 "title": "TaskShield: Runtime verification for task-aligned LLM agents", 489 "authors": ["Xinyu Jia", "Yicheng Zhang", "Yang Liu"], 490 "year": 2024, 491 "arxiv_id": "2412.16682", 492 "relevance": "Runtime defense strategy enforcing security policies during inference to reduce indirect attacks." 493 }, 494 { 495 "title": "Extracting training data from large language models", 496 "authors": ["Nicholas Carlini", "Florian Tramer", "Eric Wallace"], 497 "year": 2021, 498 "relevance": "Foundational work on training data extraction from LLMs; related to the data exfiltration threat studied in this paper." 
499 }, 500 { 501 "title": "Instruction hierarchy: Enhancing LLM robustness against conflicting instructions", 502 "authors": ["Eric Wallace", "Yicheng Zhang", "Yang Liu"], 503 "year": 2024, 504 "arxiv_id": "2404.13208", 505 "relevance": "Hierarchical instruction model training LLMs to prioritize high-privilege commands; relevant defense strategy." 506 }, 507 { 508 "title": "Can LLMs separate instructions from data? And what do we even mean by that?", 509 "authors": ["Egor Zverev", "Sahar Abdelnabi", "Soroush Tabesh", "Mario Fritz", "Christoph H Lampert"], 510 "year": 2024, 511 "arxiv_id": "2403.06833", 512 "relevance": "Fundamental analysis of the instruction-data separation problem underlying prompt injection vulnerabilities." 513 } 514 ], 515 "engagement_factors": { 516 "practical_relevance": { 517 "score": 2, 518 "justification": "Security practitioners can apply these attack patterns to test their own LLM agents, and the defense evaluations provide actionable guidance for deploying banking agents." 519 }, 520 "surprise_contrarian": { 521 "score": 1, 522 "justification": "That prompt injection works is not surprising, but the finding that passwords are harder to extract alone than when bundled with other data challenges the intuition that requesting more data is harder." 523 }, 524 "fear_safety": { 525 "score": 3, 526 "justification": "Directly demonstrates that simple, unsophisticated attacks can cause LLM agents to exfiltrate personal banking data, with no defense fully preventing leakage in the expanded evaluation." 527 }, 528 "drama_conflict": { 529 "score": 1, 530 "justification": "No major controversy or direct challenge to specific companies, though the finding that all tested defenses fail on expanded tasks could generate discussion." 531 }, 532 "demo_ability": { 533 "score": 1, 534 "justification": "AgentDojo is publicly available and the attack prompts are fully specified, but the extended dataset is not yet released and there is no standalone demo." 
535 }, 536 "brand_recognition": { 537 "score": 2, 538 "justification": "Tests well-known models (GPT-4o, Claude 3.5 Sonnet, Llama) but the research lab (University of Zurich) is not a major AI brand." 539 } 540 } 541 }