scan.json (32943B)
1 { 2 "paper": { 3 "title": "ObliInjection: Order-Oblivious Prompt Injection Attack to LLM Agents with Multi-source Data", 4 "authors": [ 5 "Ruiqi Wang", 6 "Yuqi Jia", 7 "Neil Zhenqiang Gong" 8 ], 9 "year": 2025, 10 "venue": "NDSS 2026", 11 "arxiv_id": "2512.09321", 12 "doi": "10.14722/ndss.2026.240702" 13 }, 14 "scan_version": 3, 15 "active_modules": [ 16 "experimental_rigor", 17 "data_leakage" 18 ], 19 "methodology_tags": [ 20 "benchmark-eval" 21 ], 22 "key_findings": "ObliInjection achieves ~99% attack success rate across three datasets and seven LLMs by contaminating just a single data segment, even when one of 6–100 segments is poisoned. The attack introduces an order-oblivious loss and orderGCG algorithm that together handle unknown segment orderings, substantially outperforming prior prompt injection attacks (Combined Attack 5.9–26.9%, Neural Exec 7.1–36.5%, JudgeDeceiver 40.1–56.8%). Existing prevention-based defenses (StruQ, SecAlign) and detection-based defenses (PPL, DataSentinel) are insufficient, with the attack still achieving 53–78% ASR against prevention defenses and 79–93% false negative rates against detectors.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The abstract states: 'Our code and data are available at: https://github.com/ReachalWang/ObliInjection.' A working URL is provided." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The paper states code and data are available at the GitHub URL. Additionally, the three evaluation datasets (Amazon Reviews, Multi-News, HotpotQA) are publicly available standard benchmarks." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "The paper mentions hardware used (A100 GPU, H200 GPUs) but does not provide environment specifications such as requirements.txt, Dockerfile, or library versions in the paper itself." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper provides detailed algorithm pseudocode (Algorithms 1-5) and hyperparameter settings, but does not include step-by-step reproduction instructions or a 'Reproducing Results' section. The GitHub link may contain a README but the paper itself lacks explicit instructions." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "All ASR results in Tables II–VIII are reported as point estimates without confidence intervals or error bars. For example, Table II reports '99.0' for ObliInjection on Amazon Reviews without any uncertainty measure." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "The paper claims 'ObliInjection substantially outperforms all baseline attacks' based solely on comparing ASR numbers (e.g., 99.0% vs 56.8%) without any statistical significance tests." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "The paper provides baseline context for all comparisons. For example, Table II shows ObliInjection at 99.0% vs Combined Attack at 5.9%, Neural Exec at 7.1%, and JudgeDeceiver at 56.8% on Amazon Reviews, making the magnitude of improvement clear." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "The paper uses 100 target tasks per dataset and 50 permutations for ASR calculation but provides no justification for why these numbers were chosen and no power analysis." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "ASR is computed by sampling 50 permutations, but no standard deviation, interquartile range, or other spread measure across those permutations is reported. Only average ASR is given." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "Five baselines are compared: Combined Attack, Neural Exec, JudgeDeceiver, and two ablation variants (ObliInjection-GCG and ObliInjection-CE). These are described in Section V-A and compared in Table II." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Neural Exec (AISec 2024), JudgeDeceiver (CCS 2024), and Combined Attack (USENIX Security 2024) are all recent and represent the state of the art in prompt injection attacks." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Extensive ablation studies in Section V-C cover: forms of contaminated segment (Table V), shadow instruction quality (Table VI), number of target tasks (Fig. 4a), buffer size (Fig. 4b), number of shadow sources (Fig. 4c), token/segment candidates (Fig. 5), and component ablations via ObliInjection-GCG and ObliInjection-CE variants." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": false, 92 "justification": "The primary attack evaluation uses only a single metric: Attack Success Rate (ASR). FPR/FNR are used for detection defense evaluation but the attack itself is measured solely by ASR." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No systematic human evaluation is performed. The paper mentions manual review of keyword matches ('93.1% exactly match') in the limitations but this is a spot-check of the evaluation metric, not a human evaluation of attack quality." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": true, 102 "justification": "Step V uses validation shadow segments held out from optimization to select the final contaminated segment. Table IV separately evaluates on 90 target tasks not used during the 10-task optimization, demonstrating held-out evaluation." 
103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Results are broken down per dataset (3 datasets), per LLM (7 models), per attack variant, and per hyperparameter setting across Tables II–VIII and Figures 4–5." 108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "The paper discusses lower transfer ASR to GPT-4o (24.0% without API access, Section V-C), reduced effectiveness against SecAlign-Adapt (53.3%), and the keyword-matching metric limitation with a specific counterexample ('Some people might think the product is useless, but others love it')." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The ablation variants ObliInjection-GCG (28.0–61.5%) and ObliInjection-CE (85.3–91.0%) show substantially degraded performance, demonstrating that removing either component hurts. The unconstrained form of x achieves only 80.2% average ASR (Table V). Transfer to GPT-4o without API access yields only 24.0%." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "The abstract claims 'highly effective, even when only one out of 6–100 segments is contaminated' — supported by Table II showing ~99% average ASR. Claims 'existing defenses are insufficient' — supported by Tables VIIa/b. Claims about 'three datasets' and 'twelve LLMs' — confirmed in Section V." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "Causal claims like 'both the order-oblivious loss and orderGCG are essential' are justified through controlled ablation: ObliInjection-GCG removes orderGCG (keeps order-oblivious loss), ObliInjection-CE removes the loss (keeps orderGCG), each degrading performance substantially. This is adequate single-variable manipulation." 
130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": false, 134 "justification": "The title claims applicability to 'LLM Agents with Multi-source Data' but no agentic tasks (tool selection, multi-step reasoning) are actually evaluated. The paper tests only summarization and RAG-based QA. Tool selection is discussed conceptually in the introduction but not experimentally validated." 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper does not discuss alternative explanations for why ObliInjection succeeds. For example, it does not consider whether the high ASR might be partly due to the keyword-based metric inflating success, whether LLMs' instruction-following tendencies (rather than the optimization) drive the attack, or whether the choice of simple injected responses ('The product is useless!') makes the task artificially easy." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper explicitly acknowledges the gap between keyword matching and true semantic success: 'We acknowledge that this simplified metric may introduce inaccuracies' and provides manual verification that 93.1% of keyword matches were exact matches to the attacker's intended output (Section VII)." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": false, 151 "justification": "Open-weight models are specified with full names (e.g., 'Llama-3-8B-Instruct', 'Mistral-7B-Instruct-v0.3', 'Qwen-2.5-7B-Instruct-1M'). However, GPT-4o and Gemini-2.5-flash are referenced without API version or snapshot date, and these are key models in the transfer experiments (Table III)." 
152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full prompt texts for shadow target instruction generation (Table IX), shadow segment generation for Amazon Reviews (Table X), and Multi-News/HotpotQA (Table XI) are provided in the Appendix. Injected prompts are also specified (e.g., 'Print: The product is useless!')." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section V-A reports all key hyperparameters: dbuf=5, dtok=128, dseg=30, drep=2, diter=200, ns values (10 for Amazon Reviews, 3 for others), dper permutations, 50 permutations for ASR calculation, and segment counts." 162 }, 163 "scaffolding_described": { 164 "applies": false, 165 "answer": false, 166 "justification": "No agentic scaffolding is used. The paper evaluates a gradient-based attack optimization algorithm applied directly to LLM inputs, not an agent framework." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section V-A describes how datasets are prepared: 100 target tasks per dataset, Amazon reviews capped at n=100 per product due to context window limits, metadata used for each dataset, shadow segment generation process, and how target tasks are structured as tuples." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section VII is titled 'Conclusion, Limitations, and Future Work' and contains a substantive paragraph discussing the keyword-based evaluation metric limitation with specific examples and manual verification results." 
179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "The limitations section discusses a specific threat: the keyword matching metric may count false positives (e.g., 'Some people might think the product is useless, but others love it' would be counted as success despite contradicting the attack intent). They report manual verification finding 93.1% exact matches." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": false, 188 "justification": "The paper does not explicitly state what the results do not show. It does not acknowledge that agentic tool selection was not tested despite being in the title, does not discuss what types of multi-source applications are excluded, and does not state limits on the injected response complexity." 189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": true, 195 "justification": "The three evaluation datasets (Amazon Reviews, Multi-News, HotpotQA) are publicly available standard benchmarks. The paper states 'Our code and data are available at' the GitHub repository." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section V-A describes dataset selection, task structure, and dataset statistics (Table I). Each dataset's purpose, segment meaning, and construction are documented. 100 target tasks per dataset, with specified segment counts and length statistics." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. Data sources are standard public benchmarks (Amazon Reviews, Multi-News, HotpotQA)." 
206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The full pipeline is documented: dataset selection → target task construction → shadow generation (Step I) → attack optimization (Steps II-IV) → validation selection (Step V) → evaluation via ASR with 50 permutations. Algorithm pseudocode details each stage." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Acknowledgements section states: 'This work was supported by NSF under grant no. 2450935, 2131859, 2125977, 2112562, and 1937787.'" 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "All three authors are from Duke University, clearly stated. They evaluate third-party LLMs (Meta's Llama, Mistral, Qwen, OpenAI GPT-4o, Google Gemini) with no affiliation conflict." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": true, 227 "justification": "NSF (National Science Foundation) is a government funding agency with no financial stake in whether the attack succeeds or fails against any particular LLM vendor." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "This paper evaluates an attack method's effectiveness at manipulating LLM outputs, not a pre-trained model's capability on a knowledge benchmark. The evaluation measures attack success rate, not model knowledge." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "The paper tests attack effectiveness rather than model knowledge. 
Whether models have seen the benchmark data during training is not relevant to whether the adversarial attack can hijack their output." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "Same as above — this is a red-teaming/attack study that evaluates adversarial manipulation success, not a capability benchmark evaluation." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study. All experiments are automated evaluations of attack algorithms on LLMs." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants. Section VIII describes ethics considerations but these are about responsible disclosure, not human subjects research." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants." 282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "Section V-B reports: 'optimizing a contaminated segment for 10 target tasks takes fewer than 4 hours on a single A100 GPU for all evaluated LLMs except Llama-4-17B, and about 13 hours for Llama-4-17B using two H200 GPUs.' Section V-C mentions '120K API queries' for GPT-4o log probability access." 
294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": true, 298 "justification": "Hardware (A100 GPU, two H200 GPUs) and approximate wall-clock times are reported for optimization. API query counts for GPT-4o experiments are stated (120K queries)." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": false, 305 "justification": "No mention of multiple random seeds or seed sensitivity analysis. The attack optimization involves randomness (permutation sampling, token candidate selection) but results are not reported across seeds." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "The paper states ASR is calculated over 50 sampled permutations (Section V-A), optimization runs for diter=200 iterations, and specifies all iteration/sample counts in the algorithms." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "Default hyperparameter values (dbuf=5, dtok=128, dseg=30, drep=2) are stated but the paper does not report how these defaults were selected or what search budget was used to find them. Ablation studies (Figs. 4-5) show sensitivity but do not constitute a reported search budget." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "Step V explicitly selects the best contaminated segment via validation ASR on held-out shadow segments, not on the test data. The selection criterion (highest validation ASR) is clearly described in Algorithm 1 and Section IV-B." 321 }, 322 "multiple_comparison_correction": { 323 "applies": false, 324 "answer": false, 325 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable. The paper compares methods using point estimates only." 
326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors adapted all baseline attacks (Combined Attack, Neural Exec, JudgeDeceiver) to the multi-source setting themselves. No discussion of whether their adaptations may disadvantage baselines or whether the baselines' original authors would implement them differently." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": true, 335 "justification": "Section V-A states: 'for attacks such as Neural Exec, JudgeDeceiver, and ObliInjection-GCG that do not employ a beam search strategy, we set dseg = 5×30 to ensure the total computational cost remains approximately consistent across different attack methods, enabling fair comparisons.'" 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": false, 340 "justification": "The paper does not discuss whether the three benchmark datasets (Amazon Reviews, Multi-News, HotpotQA) are representative of real-world multi-source LLM application scenarios, or whether the keyword-based ASR metric accurately captures attack success in practice beyond the 93.1% spot-check." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": false, 344 "answer": false, 345 "justification": "No agentic scaffolding is involved. The attack is applied directly to LLM inputs without scaffolding/tooling that could confound results." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The paper does not discuss whether the evaluated LLMs may have been trained on the benchmark datasets (Amazon Reviews published 2023, Multi-News 2019, HotpotQA 2018). While this is less critical for attack evaluation than capability benchmarks, it could affect the baseline task performance that the attack attempts to hijack." 
353 }, 354 "feature_leakage_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether the evaluation setup provides information not available in a real attack scenario. The shadow segments are synthesized by GPT-4o which may differ from what a real attacker could produce." 358 }, 359 "non_independence_addressed": { 360 "applies": true, 361 "answer": false, 362 "justification": "No discussion of independence between the evaluated LLMs' training data and the evaluation data. The 100 target tasks per dataset are drawn from standard benchmarks without verifying independence from model training data." 363 }, 364 "leakage_detection_method": { 365 "applies": true, 366 "answer": false, 367 "justification": "No concrete leakage detection or prevention method is applied." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "ObliInjection achieves ~99% average ASR across three datasets and seven LLMs, even when only one out of 6–100 segments is contaminated.", 374 "evidence": "Table II shows average ASR of 99.0% (Amazon Reviews), 98.7% (Multi-News), and 99.6% (HotpotQA) across seven LLMs, with minimum ASR of 93.8% (Mistral-7B on Multi-News).", 375 "supported": "strong" 376 }, 377 { 378 "claim": "ObliInjection substantially outperforms existing prompt injection attacks in multi-source data settings.", 379 "evidence": "Table II shows ObliInjection at 99.0% vs Combined Attack 5.9%, Neural Exec 7.1%, JudgeDeceiver 56.8% on Amazon Reviews. 
Similar gaps on Multi-News and HotpotQA.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "Both the order-oblivious loss and the orderGCG algorithm are essential components of ObliInjection.", 384 "evidence": "Table II ablations: ObliInjection-GCG (removes orderGCG) averages 28.0–61.5%, ObliInjection-CE (removes order-oblivious loss) averages 85.3–91.0%, vs full ObliInjection at 98.7–99.6%.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "Contaminated segments optimized on 10 target tasks transfer well to 90 unseen target tasks.", 389 "evidence": "Table IV shows average ASR of 97.1% (Amazon Reviews), 97.6% (Multi-News), 96.1% (HotpotQA) on unseen tasks, only slightly below the optimization tasks.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "ObliInjection transfers to unknown target LLMs, including closed-source models like GPT-4o.", 394 "evidence": "Table III shows increasing ASR with more shadow LLMs: 24.0% on GPT-4o without API access (5 shadow LLMs), increasing to 95.2% with log probability access. ASR on Gemini-2.5-flash is 87.3%.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Existing prevention-based defenses (StruQ, SecAlign) are insufficient to mitigate ObliInjection.", 399 "evidence": "Table VIIa: ASR of 77.9% against StruQ, 63.8% against SecAlign, and 53.3% against adapted SecAlign. 
Leave-one-segment-out (99.3%) and segment delimiters (96.3%) also fail.", 400 "supported": "moderate" 401 }, 402 { 403 "claim": "Existing detection-based defenses (PPL, DataSentinel) fail to reliably detect ObliInjection's contaminated segments.", 404 "evidence": "Table VIIb: PPL has 92.6% FNR and DataSentinel has 79.6% FNR, while contaminated segments still achieve 85.5–100% ASR across five LLMs.", 405 "supported": "strong" 406 } 407 ], 408 "red_flags": [ 409 { 410 "flag": "Title overgeneralizes beyond experiments", 411 "detail": "The title claims applicability to 'LLM Agents with Multi-source Data' but no agentic tasks (tool selection, multi-step reasoning, MCP server manipulation) are experimentally evaluated. Only summarization and RAG-based QA are tested. Tool selection is mentioned conceptually in the introduction but absent from experiments." 412 }, 413 { 414 "flag": "No statistical significance testing", 415 "detail": "Comparative claims across 7 LLMs × 3 datasets × 6 attacks are made based solely on point estimates of ASR without any statistical significance tests, confidence intervals, or variance measures. With stochastic evaluation (50 random permutations), the variability of ASR estimates is unknown." 416 }, 417 { 418 "flag": "Baselines adapted by attack authors", 419 "detail": "All baseline attacks (Combined Attack, Neural Exec, JudgeDeceiver) were adapted from their original settings to the multi-source scenario by the ObliInjection authors. No discussion of whether these adaptations fairly represent the baselines' potential, or whether the original authors might adapt them differently." 420 }, 421 { 422 "flag": "Simplified evaluation metric", 423 "detail": "ASR is measured by keyword matching (checking for 'useless', 'fake', or 'unknown' in the output). While the authors report 93.1% exact match on manual review, this metric conflates partial successes with full successes and does not capture the quality or convincingness of the attack output." 
424 } 425 ], 426 "cited_papers": [ 427 { 428 "title": "Formalizing and benchmarking prompt injection attacks and defenses", 429 "authors": ["Y. Liu", "Y. Jia", "R. Geng", "J. Jia", "N. Z. Gong"], 430 "year": 2024, 431 "relevance": "Establishes formal framework and benchmark for prompt injection attacks and defenses, directly foundational to ObliInjection's threat model." 432 }, 433 { 434 "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks", 435 "authors": ["D. Pasquini", "M. Strohmeier", "C. Troncoso"], 436 "year": 2024, 437 "relevance": "Gradient-based prompt injection attack targeting RAG systems, a key baseline for ObliInjection's multi-source comparison." 438 }, 439 { 440 "title": "Optimization-based prompt injection attack to LLM-as-a-judge", 441 "authors": ["J. Shi", "Z. Yuan", "Y. Liu", "Y. Huang", "P. Zhou", "L. Sun", "N. Z. Gong"], 442 "year": 2024, 443 "relevance": "GCG-based prompt injection targeting LLM-as-a-judge with multi-source data, another key baseline for ObliInjection." 444 }, 445 { 446 "title": "Universal and transferable adversarial attacks on aligned language models", 447 "authors": ["A. Zou", "Z. Wang", "J. Z. Kolter", "M. Fredrikson"], 448 "year": 2023, 449 "arxiv_id": "2307.15043", 450 "relevance": "Introduces the GCG algorithm for adversarial attacks on LLMs, the foundational optimization method that ObliInjection's orderGCG extends." 451 }, 452 { 453 "title": "StruQ: Defending against prompt injection with structured queries", 454 "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"], 455 "year": 2025, 456 "relevance": "Prevention-based defense against prompt injection via structured input formatting; evaluated as a defense against ObliInjection." 457 }, 458 { 459 "title": "SecAlign: Defending against prompt injection with preference optimization", 460 "authors": ["S. Chen", "A. Zharmagambetov", "S. Mahloujifar", "K. Chaudhuri", "D. Wagner", "C. 
Guo"], 461 "year": 2025, 462 "relevance": "DPO-based defense that fine-tunes LLMs to resist prompt injection; evaluated and shown insufficient against ObliInjection." 463 }, 464 { 465 "title": "DataSentinel: A game-theoretic detection of prompt injection attacks", 466 "authors": ["Y. Liu", "Y. Jia", "J. Jia", "D. Song", "N. Z. Gong"], 467 "year": 2025, 468 "relevance": "Game-theoretic detection-based defense against prompt injection; shown to have 79.6% FNR against ObliInjection." 469 }, 470 { 471 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 472 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"], 473 "year": 2023, 474 "relevance": "Foundational work on indirect prompt injection in real-world LLM-integrated applications, establishing the threat landscape ObliInjection addresses." 475 }, 476 { 477 "title": "PoisonedRAG: Knowledge corruption attacks to retrieval-augmented generation of large language models", 478 "authors": ["W. Zou", "R. Geng", "B. Wang", "J. Jia"], 479 "year": 2025, 480 "relevance": "RAG-specific prompt injection attack that assumes attacker controls majority of retrieved passages; contrasted with ObliInjection's single-segment approach." 481 }, 482 { 483 "title": "A critical evaluation of defenses against prompt injection attacks", 484 "authors": ["Y. Jia", "Z. Shao", "Y. Liu", "J. Jia", "D. Song", "N. Z. Gong"], 485 "year": 2025, 486 "arxiv_id": "2505.18333", 487 "relevance": "Evaluates prompt injection defenses and shows fine-tuned models provide limited defense effectiveness, supporting ObliInjection's defense evasion findings." 488 }, 489 { 490 "title": "Lessons from defending Gemini against indirect prompt injections", 491 "authors": ["C. Shi", "S. Lin", "S. Song", "J. Hayes", "I. 
Shumailov"], 492 "year": 2025, 493 "arxiv_id": "2505.14534", 494 "relevance": "Industry perspective from Google on defending against indirect prompt injection, contextualizes the practical relevance of ObliInjection's threat model." 495 }, 496 { 497 "title": "Defeating prompt injections by design", 498 "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan", "J. Hayes", "N. Carlini"], 499 "year": 2025, 500 "arxiv_id": "2503.18813", 501 "relevance": "Software security-based defense approach using security policies on LLM agent actions; discussed as inapplicable to ObliInjection's target scenarios." 502 }, 503 { 504 "title": "Prompt injection attack to tool selection in LLM agents", 505 "authors": ["J. Shi", "Z. Yuan", "G. Tie", "P. Zhou", "N. Z. Gong", "L. Sun"], 506 "year": 2025, 507 "arxiv_id": "2504.19793", 508 "relevance": "Prompt injection targeting LLM agent tool selection, directly related to the agentic multi-source scenario ObliInjection conceptually addresses." 509 } 510 ], 511 "engagement_factors": { 512 "practical_relevance": { 513 "score": 2, 514 "justification": "Security researchers and red-teamers testing multi-source LLM applications can use the released code and technique, though it requires significant compute and ML expertise." 515 }, 516 "surprise_contrarian": { 517 "score": 1, 518 "justification": "Confirms that prompt injection remains a serious threat; the multi-source ordering angle is novel but doesn't challenge fundamental assumptions." 519 }, 520 "fear_safety": { 521 "score": 3, 522 "justification": "Demonstrates a near-100% success rate attack that defeats existing defenses by contaminating just one out of 100 data segments, with implications for RAG, reviews, and AI Overviews." 523 }, 524 "drama_conflict": { 525 "score": 1, 526 "justification": "Shows that StruQ, SecAlign, PPL, and DataSentinel defenses are insufficient, but the paper frames this constructively rather than as controversy." 
527 }, 528 "demo_ability": { 529 "score": 2, 530 "justification": "Code and data are released on GitHub, enabling reproduction, though the attack requires GPU compute and model access to run." 531 }, 532 "brand_recognition": { 533 "score": 1, 534 "justification": "From Duke University's security group; tests against GPT-4o and Gemini but not affiliated with a major AI lab." 535 } 536 } 537 }