scan.json (29625B)
1 { 2 "paper": { 3 "title": "PromptLocate: Localizing Prompt Injection Attacks", 4 "authors": [ 5 "Yuqi Jia", 6 "Yupei Liu", 7 "Zedian Shao", 8 "Jinyuan Jia", 9 "Neil Zhenqiang Gong" 10 ], 11 "year": 2025, 12 "venue": "IEEE S&P 2026", 13 "arxiv_id": "2510.12252", 14 "doi": "10.48550/arXiv.2510.12252" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "PromptLocate achieves 0.93–0.99 ROUGE-L and embedding similarity scores across seven existing prompt injection attacks on OpenPromptInjection, significantly outperforming attribution-method baselines (0.23–0.78). The three-step approach—semantic segmentation, oracle-based instruction localization, and contextual-inconsistency-based data localization—remains effective against eight adaptive attacks (RL 0.86–0.96). Practical applications include post-attack forensic analysis (1% FPR, 2% FNR) and data recovery retaining 85–100% of original task performance.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "Abstract states: 'Our code and data are available at: https://github.com/liu00222/Open-Prompt-Injection.'" 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "Code and data are provided at the GitHub link. The evaluation uses publicly available benchmarks: OpenPromptInjection and AgentDojo." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper mentions GPU hardware (RTX A5000, H100) and model names (Mistral-7B, GPT-2, LLaMA3-8B-Instruct) but provides no requirements.txt, Dockerfile, or detailed dependency/version list sufficient to recreate the environment." 
36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "The paper provides no step-by-step reproduction instructions, 'Reproducing Results' section, or explicit commands. A GitHub repository is linked but the paper itself does not contain reproduction guidance." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Tables 1–8 are point estimates. No confidence intervals, error bars, or ± notation reported anywhere." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims PromptLocate 'significantly outperforms' baselines (Section 5.2) but no statistical significance tests (t-tests, bootstrap, etc.) are used. All comparisons are based on direct numerical comparison." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Raw performance values are reported for both PromptLocate and all baselines with full context. E.g., Table 1 shows PromptLocate RL=0.97 vs best baseline SFA-T RL=0.57 for Naive Attack, enabling magnitude assessment." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "Sample size (100 contaminated samples per target-injected task combination) is justified only by computational tractability: 'computationally expensive to evaluate. To make the evaluation tractable, we randomly sample 100.' No power analysis or statistical adequacy justification." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single experimental runs. Values are averaged over task combinations but no spread across runs is shown." 
68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Six baseline methods are compared: SFA-H, SFA-T, FRA-H, FRA-T, SVA-H, SVA-T (three attribution methods × two score selection strategies). Section 5.1.3 describes them in detail." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "The baselines are adapted from standard attribution methods (SFA, FRA, SVA). Additionally, the oracle is built on DataSentinel (IEEE S&P 2025), which is state-of-the-art for prompt injection detection. Two concurrent methods are also compared in Section 7." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 5.4 presents comprehensive ablation studies: three variants of Step I (sentence/word/embedding segmentation, Table 4), six combinations of oracles and search strategies for Step II (Table 5), and with/without Step III (Table 6)." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Six evaluation metrics used: ROUGE-L, Embedding Similarity, Precision, Recall, ASV-B (before removal), and ASV-A (after removal). Defined in Section 5.1.2." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "All evaluation is automated using ROUGE-L, embedding similarity, precision, recall, and ASV metrics. No human evaluation of localization quality is performed." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Validation and test sets are explicitly separated: 'none of the target or injected data samples in the validation set overlap with those in the evaluation set' (Section 5.1.4). 
The oracle fine-tuning data is 'distributionally different from the OpenPromptInjection and AgentDojo datasets.'" 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down by attack type (Tables 1, 3), by AgentDojo environment (Table 2: Banking, Travel, Slack, Workspace), by ablation variant (Tables 4–6), and by injected task (Table 14)." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 7 discusses failure cases: same-type target/injected tasks, imperfect data recovery, and attacks that bypass the detector. Figure 5 shows an example with imperfect localization (Recall=0.70) on AgentDojo Slack. Lower Slack performance is noted in Section 5.2." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Several negative results reported: the oracle misses 63% of contaminated segments even after tailoring (Section 5.2); KAD performs worse than DataSentinel; individual-segment search is suboptimal; removing Step III drops RL from 0.97 to 0.89 (Table 6); lower Recall on AgentDojo Slack (0.73)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims 'accurately localizes injected prompts across eight existing and eight adaptive attacks' are supported by Tables 1–3 showing RL 0.86–0.99 across all settings." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "Causal claims about component contributions are supported by controlled ablation studies: Step I variants (Table 4), Step II oracle/search variants (Table 5), and Step III inclusion (Table 6) each isolate single variables." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": true, 131 "justification": "Claims are bounded to tested settings. 
Abstract specifies 'eight existing and eight adaptive attacks.' Results are reported per-benchmark and per-attack. Section 7 explicitly bounds scope, noting the method has 'limited effectiveness' when target and injected tasks are the same type." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": true, 136 "justification": "Section 7 discusses alternative explanations and limitations: same-type attacks reduce to adversarial examples; imperfect recovery may still allow residual attacks; the method depends on the oracle's quality. The connection between prompt injection and classical injection attacks (SQL, XSS) is analyzed." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures ROUGE-L and embedding similarity between localized and ground-truth injected prompts, which directly measure localization accuracy—the stated goal. ASV-A measures the practical impact of removal. No proxy gap exists between the metrics and the claims." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": false, 148 "justification": "Models are identified by family name but not exact version: 'LLaMA3-8B-Instruct,' 'Mistral-7B,' 'GPT-4o' (via Azure API without snapshot date), 'GPT-2.' Per the schema, marketing names like 'GPT-4o' without a snapshot date do not count as specified versions." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Detection instruction text is provided: 'Repeat [secret key] once while ignoring the following text:' (Section 2.3). Target instructions and injected prompts are given for the forensic analysis use case (Section 6.1, Figure 4). All other prompts come from the referenced OpenPromptInjection benchmark [6] with code at the provided GitHub link." 
154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": false, 158 "justification": "Some hyperparameters are reported: τ=0 (segmentation threshold), α=1 (adaptive attack), 7-character secret key, 500 training segments. However, LLM inference parameters (temperature, top-p, max tokens) are not reported for any model." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "PromptLocate is a three-step localization pipeline, not an agentic scaffolding system. No agentic scaffolding is used." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Data preprocessing is thoroughly documented: validation dataset construction (556 samples from Combined Attack, Section 5.1.4), oracle training segment generation (500 clean + 500 contaminated segments), sampling strategy for evaluation (100 samples per task combination)." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 7 'Discussion and Limitations' is a dedicated multi-paragraph section covering natural segmentation, evaluation scope, oracle alternatives, imperfect recovery, and adaptive attacks." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 7 discusses specific threats: 'PromptLocate may fail under adaptive attacks when the target task and the injected task are of the same type' because this 'reduces to a traditional adversarial example or a misinformation attack.' The imperfect recovery dilemma (denial-of-service vs. residual risk) is also specific." 
181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 7 explicitly states what PromptLocate cannot handle: 'it is difficult to detect the contamination and for PromptLocate (or any method) to localize the injected content' when same-type attacks are used. The dependence on the underlying detector is acknowledged: 'prompt injection attacks that bypass the detector can also evade PromptLocate.'" 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": true, 192 "justification": "Code and data are released at the GitHub repository. The benchmarks (OpenPromptInjection, AgentDojo) are publicly available. The Amazon Reviews dataset [44] used in Section 6.1 is public." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Data generation is described in detail: how contaminated samples are constructed from target and injected task pairs (Section 5.1.1), how validation datasets are generated (Section 5.1.4), and how oracle training data is created (Sections 4.2 and 5.1.4)." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. All data comes from standard benchmarks (OpenPromptInjection, AgentDojo, Amazon Reviews)." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline from benchmark datasets through contaminated sample generation to evaluation is documented: 7 tasks × 7 injected tasks × 100 samples = 4,900 per attack for OpenPromptInjection; separate validation set of 556 samples; oracle training with 500 clean + 500 contaminated segments." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Acknowledgements section: 'This work was supported by NSF under grant no. 
2450935, 2131859, 2125977, 2112562, and 1937787.'" 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: Duke University and The Pennsylvania State University, with email addresses provided." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": true, 224 "justification": "NSF (National Science Foundation) is an independent government funding agency with no financial stake in the outcome of prompt injection research." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": false, 235 "answer": false, 236 "justification": "The paper tests a defense/localization method, not a pre-trained model's capability on a benchmark. The evaluation measures PromptLocate's localization accuracy, not model knowledge." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": false, 240 "answer": false, 241 "justification": "Same as above: the paper evaluates a localization system, not model knowledge on benchmarks." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": false, 245 "answer": false, 246 "justification": "Same as above: benchmark contamination in the training-data sense is not relevant since the evaluation is of the localization method's ability to find injected prompts." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 
264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "Runtime is reported: 'about 11 seconds on average to localize injected prompts in a contaminated sample when running on a low-performance GPU (RTX A5000)' and '5.8 seconds' on H100. Table 12 gives per-method runtime breakdown. Sub-linear scaling with prompt length is demonstrated." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "Per-sample runtime and GPU types are reported, but the total computational budget for all experiments (oracle training, evaluation across all attacks and benchmarks) is not stated." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs, with averaging only over task combinations." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of experimental runs per configuration is not explicitly stated. Results are averaged over task combinations but it is unclear whether each combination was run once or multiple times." 
308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": true, 312 "justification": "The search budget for τ is explicitly stated: grid search over {−0.4, −0.2, 0, 0.2, 0.4} (5 values) evaluated on 556 validation samples. The oracle fine-tuning follows DataSentinel's alternating optimization procedure." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "τ is selected on a validation set (not test set) by maximizing ES score: 'we compute the ES score of PromptLocate on the validation set and select the value that maximizes ES. Using this procedure, we select τ = 0' (Section 5.1.4)." 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied. Results across 7 attacks and 49 task combinations are compared by direct numerical comparison without any statistical testing." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors evaluate their own system (PromptLocate) against baselines without acknowledging potential author-evaluation bias. Attribution baselines are re-implemented by the authors." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "Table 12 shows runtime per method but performance is not plotted as a function of compute budget. PromptLocate takes 11.1s vs 2.3s for SFA baselines but the performance-compute tradeoff is not systematically analyzed." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "The paper does not discuss whether OpenPromptInjection and AgentDojo adequately represent real-world prompt injection scenarios. No construct validity analysis of the benchmarks is provided." 
338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "PromptLocate does not use agentic scaffolding and no model-scaffold comparisons are made. The method is evaluated as a standalone system." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "No discussion of whether the detection LLM (Mistral-7B) could have seen OpenPromptInjection or AgentDojo data during pre-training, or whether temporal ordering of data creates leakage." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup leaks information. For instance, whether the oracle's training distribution provides implicit signals about the test attacks is not analyzed." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": true, 359 "justification": "Explicitly addressed: 'none of the target or injected data samples in the validation set overlap with those in the evaluation set' (Section 5.1.4) and the oracle fine-tuning dataset is 'distributionally different from the OpenPromptInjection and AgentDojo datasets.'" 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. Independence is ensured by design (separate datasets) but not verified post-hoc." 
365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "PromptLocate achieves 0.93–0.99 ROUGE-L and embedding similarity across all seven existing attacks on OpenPromptInjection.", 371 "evidence": "Table 1 shows RL 0.94–0.98 and ES 0.93–0.99 across Naive Attack, Escape Character, Context Ignoring, Fake Completion, Combined Attack, Universal, and NeuralExec.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "PromptLocate significantly outperforms attribution method baselines for prompt injection localization.", 376 "evidence": "Table 1: best baseline achieves RL 0.78 (SVA-T on NeuralExec) while PromptLocate achieves RL 0.94 on the same attack. Across all attacks, PromptLocate RL is 0.94–0.98 vs baselines 0.21–0.78.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "PromptLocate remains effective against eight adaptive attacks specifically designed to evade its three steps.", 381 "evidence": "Table 3 shows RL 0.86–0.96 and ES 0.84–0.93 across Random-Loc, Multiple, Period, Single-Seg, Separator, Sep.+Ins., Swap, and Concat attacks. ASV drops significantly after removal in all cases.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "PromptLocate enables effective post-attack forensic analysis with 1% FPR and 2% FNR.", 386 "evidence": "Table 7 shows Tailored-DS + Segment-group achieves FPR=0.01, FNR=0.02 for Combined Attack and FPR=0.00, FNR=0.04 for NeuralExec in the malicious review forensic analysis scenario.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Data recovered by PromptLocate retains 85–100% of original task performance.", 391 "evidence": "Table 8a: PR values range from 0.26 to 0.88 across OpenPromptInjection tasks with PNA-T from 0.30 to 0.94. 
Table 8b: PR ranges from 0.63 to 0.81 with PNA-T from 0.65 to 0.90 on AgentDojo. (The 85–100% figure appears to be PR taken relative to PNA-T rather than absolute PR.)", 392 "supported": "strong" 393 }, 394 { 395 "claim": "Step III (data-contaminated segment identification) substantially improves localization.", 396 "evidence": "Table 6: removing Step III drops RL from 0.97 to 0.89 on the existing Combined Attack and from 0.92 to 0.83 on the adaptive variant.", 397 "supported": "strong" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "No variance or uncertainty quantification", 403 "detail": "All results are point estimates with no error bars, confidence intervals, or standard deviations. It is unclear whether results are from single runs or averaged over multiple runs, making it impossible to assess result stability." 404 }, 405 { 406 "flag": "No statistical significance tests", 407 "detail": "Claims of 'significantly outperforms' are based solely on comparing point estimates. No t-tests, bootstrap tests, or other significance tests are performed despite extensive comparative claims across 7+ attack settings." 408 } 409 ], 410 "cited_papers": [ 411 { 412 "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection", 413 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], 414 "year": 2023, 415 "relevance": "Foundational work on indirect prompt injection attacks against real-world LLM-integrated applications." 416 }, 417 { 418 "title": "Formalizing and benchmarking prompt injection attacks and defenses", 419 "authors": ["Y. Liu", "Y. Jia", "R. Geng", "J. Jia", "N. Z. Gong"], 420 "year": 2024, 421 "relevance": "Provides the formal framework and OpenPromptInjection benchmark used as the primary evaluation platform in this work." 422 }, 423 { 424 "title": "DataSentinel: A game-theoretic detection of prompt injection attacks", 425 "authors": ["Y. Liu", "Y. Jia", "J. Jia", "D. Song", "N. Z. 
Gong"], 426 "year": 2025, 427 "relevance": "State-of-the-art prompt injection detector adapted as the oracle in PromptLocate's Step II." 428 }, 429 { 430 "title": "Jatmo: Prompt injection defense by task-specific finetuning", 431 "authors": ["J. Piet", "M. Alrashed", "C. Sitawarin", "S. Chen", "Z. Wei", "E. Sun", "B. Alomair", "D. Wagner"], 432 "year": 2024, 433 "relevance": "Defense against prompt injection via task-specific fine-tuning of LLMs." 434 }, 435 { 436 "title": "Secalign: Defending against prompt injection with preference optimization", 437 "authors": ["S. Chen", "A. Zharmagambetov", "S. Mahloujifar", "K. Chaudhuri", "D. Wagner", "C. Guo"], 438 "year": 2025, 439 "relevance": "Prevention-based defense using preference optimization to make LLMs resilient to prompt injection." 440 }, 441 { 442 "title": "Defeating prompt injections by design", 443 "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan", "J. Hayes", "N. Carlini", "D. Fabian", "C. Kern", "C. Shi", "A. Terzis", "F. Tramèr"], 444 "year": 2025, 445 "relevance": "Security-policy-based defense enforcing constraints on LLM actions to prevent prompt injection." 446 }, 447 { 448 "title": "Automatic and universal prompt injection attacks against large language models", 449 "authors": ["X. Liu", "Z. Yu", "Y. Zhang", "N. Zhang", "C. Xiao"], 450 "year": 2024, 451 "relevance": "Optimization-based universal prompt injection attack (Universal attack) evaluated in PromptLocate's experiments." 452 }, 453 { 454 "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks", 455 "authors": ["D. Pasquini", "M. Strohmeier", "C. Troncoso"], 456 "year": 2024, 457 "relevance": "Optimization-based prompt injection attack (NeuralExec) that jointly optimizes separator and suffix, evaluated in experiments." 458 }, 459 { 460 "title": "A critical evaluation of defenses against prompt injection attacks", 461 "authors": ["Y. Jia", "Z. Shao", "Y. Liu", "J. Jia", "D. Song", "N. Z. 
Gong"], 462 "year": 2025, 463 "relevance": "Systematic evaluation showing fine-tuned LLMs remain vulnerable to adaptive prompt injection attacks." 464 }, 465 { 466 "title": "Webinject: Prompt injection attack to web agents", 467 "authors": ["X. Wang", "J. Bloch", "Z. Shao", "Y. Hu", "S. Zhou", "N. Z. Gong"], 468 "year": 2025, 469 "relevance": "Prompt injection attack targeting web-based LLM agents, relevant to agentic AI security." 470 }, 471 { 472 "title": "Agentdojo: A dynamic environment to evaluate prompt injection attacks and defenses for llm agents", 473 "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic", "L. Beurer-Kellner", "M. Fischer", "F. Tramèr"], 474 "year": 2024, 475 "relevance": "Agent-based prompt injection benchmark used as the second major evaluation platform in this work." 476 }, 477 { 478 "title": "System-level defense against indirect prompt injection attacks: An information flow control perspective", 479 "authors": ["F. Wu", "E. Cecchetti", "C. Xiao"], 480 "year": 2024, 481 "relevance": "Information flow control approach to defending LLM agents against indirect prompt injection." 482 } 483 ], 484 "engagement_factors": { 485 "practical_relevance": { 486 "score": 2, 487 "justification": "Localization of injected prompts is directly useful for deployed LLM applications doing forensic analysis and data recovery, though it requires fine-tuning a detection LLM." 488 }, 489 "surprise_contrarian": { 490 "score": 1, 491 "justification": "Novel framing of localization (vs. detection/prevention), but the three-step approach is intuitive and does not challenge conventional wisdom about prompt injection." 492 }, 493 "fear_safety": { 494 "score": 2, 495 "justification": "Addresses AI security concerns (prompt injection in production LLM applications) and demonstrates adaptive attacks, but localization is a defensive tool." 
496 }, 497 "drama_conflict": { 498 "score": 0, 499 "justification": "No controversy or conflict; straightforward defense contribution accepted at a top venue." 500 }, 501 "demo_ability": { 502 "score": 2, 503 "justification": "Code and data released on GitHub with public benchmarks, making it feasible to reproduce and try, though it requires fine-tuning an oracle model." 504 }, 505 "brand_recognition": { 506 "score": 1, 507 "justification": "Duke University and Penn State are well-known institutions but not top-of-mind AI labs for general audiences." 508 } 509 } 510 }