scan-v4.json (34328B)
1 { 2 "scan_version": 4, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Defending Against Indirect Prompt Injection Attacks With Spotlighting", 6 "authors": [ 7 "Keegan Hines", 8 "Gary Lopez", 9 "Matthew Hall", 10 "Federico Zarfati", 11 "Yonatan Zunger", 12 "Emre Kıcıman" 13 ], 14 "year": 2024, 15 "venue": "CAMLIS", 16 "arxiv_id": "2403.14720", 17 "doi": "10.48550/arXiv.2403.14720" 18 }, 19 "checklist": { 20 "claims_and_evidence": { 21 "abstract_claims_supported": { 22 "applies": true, 23 "answer": true, 24 "justification": "The abstract claims 'spotlighting reduces the attack success rate from greater than 50% to below 2%' — this is supported by Figures 4-6 showing ASR reductions to 0-3% across models and tasks. The claim of 'minimal impact on task efficacy' is supported by Figure 7 for datamarking, though encoding impacts GPT-3.5 (acknowledged in Section 5.2).", 25 "source": "opus" 26 }, 27 "causal_claims_justified": { 28 "applies": true, 29 "answer": true, 30 "justification": "The causal claim 'spotlighting reduces ASR' is supported by controlled single-variable manipulation: the same model, same dataset, same task, with only the spotlighting transformation varying. This is adequate causal design for the claim.", 31 "source": "opus" 32 }, 33 "generalization_bounded": { 34 "applies": true, 35 "answer": false, 36 "justification": "The title 'Defending Against Indirect Prompt Injection Attacks With Spotlighting' is generic, but experiments use only GPT-family models and only synthetic keyword payload attacks. The abstract says 'Using GPT-family models' but broader claims about XPIA defense are not bounded to this narrow attack type.", 37 "source": "opus" 38 }, 39 "alternative_explanations_discussed": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper does not discuss alternative explanations for why spotlighting works beyond the telecommunications analogy (Section 6). 
It does not consider whether results are specific to keyword attacks, whether the attack corpus has distributional properties that favor the defense, or other confounds.", 43 "source": "opus" 44 }, 45 "proxy_outcome_distinction": { 46 "applies": true, 47 "answer": true, 48 "justification": "The appendix (Section 8.1) explicitly discusses the gap between the keyword-detection proxy (ASR) and actual attack success, distinguishing strict ASR from 'Affected Success Rate' (AffSR) and providing examples of gray-area cases. The proxy nature of the measurement is acknowledged.", 49 "source": "opus" 50 } 51 }, 52 "limitations_and_scope": { 53 "limitations_section_present": { 54 "applies": true, 55 "answer": false, 56 "justification": "There is no dedicated 'Limitations' or 'Threats to Validity' section. Limitation-like content is scattered across Section 5.3 (recommendations), Section 5.4 (adversary considerations), Section 6 (discussion), and the appendix, but no consolidated section exists.", 57 "source": "opus" 58 }, 59 "threats_to_validity_specific": { 60 "applies": true, 61 "answer": true, 62 "justification": "Specific threats are discussed throughout: adversaries can subvert delimiting if they know the system prompt (Section 5.4); encoding severely degrades weaker models (Section 5.2); few-shot examples risk overfitting to known attack patterns (Section 8.2); attacks without whitespace could bypass datamarking (Section 5.4).", 63 "source": "opus" 64 }, 65 "scope_boundaries_stated": { 66 "applies": true, 67 "answer": false, 68 "justification": "The paper does not explicitly state what was NOT tested. 
It doesn't bound claims to keyword attacks only, doesn't acknowledge the absence of testing against sophisticated/adaptive attacks, and doesn't specify which attack types or real-world scenarios are excluded from the evaluation.", 69 "source": "opus" 70 } 71 }, 72 "conflicts_of_interest": { 73 "funding_disclosed": { 74 "applies": true, 75 "answer": false, 76 "justification": "No funding acknowledgment section. All authors are from Microsoft, which implies corporate funding, but no explicit funding disclosure is provided.", 77 "source": "opus" 78 }, 79 "affiliations_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "All authors are listed with 'Microsoft' affiliation on the first page. The affiliation is clear and prominent.", 83 "source": "opus" 84 }, 85 "funder_independent_of_outcome": { 86 "applies": true, 87 "answer": false, 88 "justification": "Microsoft has a major partnership with OpenAI and sells Azure OpenAI services. Effective prompt injection defenses increase the commercial viability of Microsoft's LLM products. 
The employer has a direct financial interest in showing these defenses work.", 89 "source": "opus" 90 }, 91 "financial_interests_declared": { 92 "applies": true, 93 "answer": false, 94 "justification": "No competing interests or financial disclosure statement is included in the paper.", 95 "source": "opus" 96 } 97 }, 98 "scope_and_framing": { 99 "key_terms_defined": { 100 "applies": true, 101 "answer": true, 102 "justification": "Key terms well-defined: indirect prompt injection (section 2.2), spotlighting (section 3.1 with three instantiations), Attack Success Rate (section 4.2 with formal definition and examples).", 103 "source": "haiku" 104 }, 105 "intended_contribution_clear": { 106 "applies": true, 107 "answer": true, 108 "justification": "Contribution explicitly stated: 'spotlighting, a family of prompt engineering techniques' to defend against XPIA via three instantiations (delimiting, datamarking, encoding).", 109 "source": "haiku" 110 }, 111 "engagement_with_prior_work": { 112 "applies": true, 113 "answer": true, 114 "justification": "Background distinguishes XPIA from direct prompt injection (jailbreaking), cites prior XPIA work [2], [14], compares to other safety approaches (alignment tuning, detection systems), and positions this as prompt-engineering defense.", 115 "source": "haiku" 116 } 117 } 118 }, 119 "type_checklist": { 120 "empirical": { 121 "artifacts": { 122 "code_released": { 123 "applies": true, 124 "answer": false, 125 "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper. The techniques are described but no implementation is released.", 126 "source": "opus" 127 }, 128 "data_released": { 129 "applies": true, 130 "answer": false, 131 "justification": "The synthetic corpus of 1000 attack documents is not released. 
Standard NLP benchmarks (SQuAD, IMDB, SuperGLUE) are public, but the custom attack dataset central to the evaluation is not available.", 132 "source": "opus" 133 }, 134 "environment_specified": { 135 "applies": true, 136 "answer": false, 137 "justification": "No environment specifications, requirements files, or dependency information is provided. The paper mentions model names but no software environment details.", 138 "source": "opus" 139 }, 140 "reproduction_instructions": { 141 "applies": true, 142 "answer": false, 143 "justification": "No step-by-step reproduction instructions are included. The system prompts are provided as examples, but there are no scripts, commands, or procedures to replicate the full experimental pipeline.", 144 "source": "opus" 145 } 146 }, 147 "statistical_methodology": { 148 "confidence_intervals_or_error_bars": { 149 "applies": true, 150 "answer": false, 151 "justification": "All results are reported as point estimates (e.g., 'ASR is reduced to 3.10%', 'ASR is reduced to 0.0%'). No confidence intervals, error bars, or uncertainty ranges appear in figures or text.", 152 "source": "opus" 153 }, 154 "significance_tests": { 155 "applies": true, 156 "answer": false, 157 "justification": "Comparative claims like 'datamarking leads a strong reduction in ASR' and 'encoding approach outperforms datamarking' are made without any statistical significance tests — only raw percentage comparisons.", 158 "source": "opus" 159 }, 160 "effect_sizes_reported": { 161 "applies": true, 162 "answer": true, 163 "justification": "Effect sizes are reported with baseline context throughout: 'ASR is reduced from approximately 50% to below 3%' (Section 5.1), 'ASR is reduced to 0.0%' from ~40% (Figure 4), and NLP task performance with/without transformation (Figure 7). 
The reader can assess magnitude.", 164 "source": "opus" 165 }, 166 "sample_size_justified": { 167 "applies": true, 168 "answer": false, 169 "justification": "The corpus size of 1000 documents is stated (Section 4.2) but never justified. No power analysis or rationale for why 1000 is sufficient.", 170 "source": "opus" 171 }, 172 "variance_reported": { 173 "applies": true, 174 "answer": false, 175 "justification": "No variance, standard deviation, or multiple-run results are reported. All experiments appear to be single-run. The paper mentions examining 'the effect of temperature on XPIA susceptibility' but provides no data on run-to-run variance.", 176 "source": "opus" 177 } 178 }, 179 "evaluation_design": { 180 "baselines_included": { 181 "applies": true, 182 "answer": true, 183 "justification": "Multiple baselines are included: no defense (Figure 1), instruction-only defense (Figure 2), delimiting (Figure 3), and progressive spotlighting variants. The progression from no defense through each technique provides a clear comparison structure.", 184 "source": "opus" 185 }, 186 "baselines_contemporary": { 187 "applies": true, 188 "answer": false, 189 "justification": "The baselines are all ablation-like variants of the authors' own approach (no defense, instructions-only, delimiting) or trivial baselines. No comparison against other published XPIA defense methods from the literature, despite citing prior work [2] that explored defense approaches.", 190 "source": "opus" 191 }, 192 "ablation_study": { 193 "applies": true, 194 "answer": true, 195 "justification": "The three spotlighting variants (delimiting → datamarking → encoding) function as a progressive ablation, showing the incremental benefit of each transformation. 
The instruction-only condition isolates the effect of prompt instructions from the spotlighting transformations.", 196 "source": "opus" 197 }, 198 "multiple_metrics": { 199 "applies": true, 200 "answer": true, 201 "justification": "Two distinct metric families are used: Attack Success Rate (ASR) for defense effectiveness, and NLP task performance across four benchmarks (SQuAD Q&A accuracy, IMDB Sentiment, SuperGLUE WIC, SuperGLUE BoolQ) for task impact.", 202 "source": "opus" 203 }, 204 "human_evaluation": { 205 "applies": true, 206 "answer": false, 207 "justification": "No human evaluation is performed. Attack success is determined by automated keyword detection. NLP task performance is measured by automated benchmark metrics only.", 208 "source": "opus" 209 }, 210 "held_out_test_set": { 211 "applies": true, 212 "answer": false, 213 "justification": "For the primary ASR evaluation on the 1000-document synthetic corpus, there is no mention of held-out test splits. The defense techniques are hand-designed rather than tuned, but no explicit separation of development and evaluation data is described.", 214 "source": "opus" 215 }, 216 "per_category_breakdown": { 217 "applies": true, 218 "answer": true, 219 "justification": "Results are broken down by model (text-davinci-003, GPT-3.5-Turbo, GPT-4) and by task (summarization, Q&A) across Figures 1-6. NLP task impact is shown per-benchmark (SQuAD, IMDB, WIC, BoolQ) in Figures 7-8.", 220 "source": "opus" 221 }, 222 "failure_cases_discussed": { 223 "applies": true, 224 "answer": true, 225 "justification": "The appendix (Section 8.1) discusses gray-area cases where the model notices but doesn't fall for the attack. Section 5.2 shows encoding degrades GPT-3.5-Turbo performance significantly. 
Section 5.4 discusses adversary scenarios where defenses could be subverted.", 226 "source": "opus" 227 }, 228 "negative_results_reported": { 229 "applies": true, 230 "answer": true, 231 "justification": "Several negative results are reported: delimiting provides only modest improvement and is not recommended (Section 5.1); encoding severely degrades GPT-3.5-Turbo task performance (Figure 8, bottom); few-shot examples risk overfitting and label leakage (Section 8.2).", 232 "source": "opus" 233 } 234 }, 235 "setup_transparency": { 236 "model_versions_specified": { 237 "applies": true, 238 "answer": true, 239 "justification": "Section 4.1 specifies: 'text-davinci-003, GPT-3.5Turbo (June 2023 version) and GPT-4 (June 2023 version).' Versions are identified by snapshot date.", 240 "source": "opus" 241 }, 242 "prompts_provided": { 243 "applies": true, 244 "answer": true, 245 "justification": "Full system prompt text is provided for delimiting (Section 3.2), datamarking (Section 3.3), encoding (Section 3.4), instruction-only defense (Section 4.2), and few-shot (Section 8.2). These are actual prompt templates with clear placeholder notation.", 246 "source": "opus" 247 }, 248 "hyperparameters_reported": { 249 "applies": true, 250 "answer": true, 251 "justification": "Section 4.1 states: 'All experiments are conducted with temperature set to 1.0. We examined the effect of temperature on XPIA susceptibility and found no notable impact.' Temperature is the critical sampling parameter for these experiments.", 252 "source": "opus" 253 }, 254 "scaffolding_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No agentic scaffolding is used. 
The approach is direct prompt engineering applied to single LLM calls.", 258 "source": "opus" 259 }, 260 "data_preprocessing_documented": { 261 "applies": true, 262 "answer": false, 263 "justification": "The paper states 'we generated a synthetic dataset of 1000 documents that contain prompt injection attacks' (Section 4.2) with 'variations on a simple keyword payload attack' but provides no details on the generation procedure, templates used, document content, or variation methodology.", 264 "source": "opus" 265 } 266 }, 267 "data_integrity": { 268 "raw_data_available": { 269 "applies": true, 270 "answer": false, 271 "justification": "Neither the 1000-document attack corpus nor the raw model responses are available for independent verification.", 272 "source": "opus" 273 }, 274 "data_collection_described": { 275 "applies": true, 276 "answer": false, 277 "justification": "The attack corpus generation is described only as 'variations on a simple keyword payload attack' with the keyword 'canary' (Section 4.2). No details on how the 1000 documents were generated, what variation strategies were used, or what the non-attack document content looks like.", 278 "source": "opus" 279 }, 280 "recruitment_methods_described": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants. Data is synthetic and from standard benchmarks.", 284 "source": "opus" 285 }, 286 "data_pipeline_documented": { 287 "applies": true, 288 "answer": false, 289 "justification": "The pipeline from document generation through spotlighting transformation to ASR measurement is described conceptually but not in reproducible detail. 
The attack document generation process, exact transformation implementations, and response classification logic are not documented.", 290 "source": "opus" 291 } 292 }, 293 "contamination": { 294 "training_cutoff_stated": { 295 "applies": false, 296 "answer": false, 297 "justification": "This paper tests defense techniques against prompt injection, not model knowledge or capability on benchmarks. The benchmark evaluations (SQuAD, IMDB, SuperGLUE) serve only to verify the defense doesn't degrade performance; the comparison is within-model (with vs. without spotlighting), making training contamination equally affecting both conditions.", 298 "source": "opus" 299 }, 300 "train_test_overlap_discussed": { 301 "applies": false, 302 "answer": false, 303 "justification": "The paper evaluates defense effectiveness, not model capability. Any benchmark contamination would affect both the with-spotlighting and without-spotlighting conditions equally, making it irrelevant to the claims.", 304 "source": "opus" 305 }, 306 "benchmark_contamination_addressed": { 307 "applies": false, 308 "answer": false, 309 "justification": "The paper tests defenses rather than model knowledge. The primary evaluation uses a custom synthetic attack corpus, and NLP benchmarks are used only for differential comparison.", 310 "source": "opus" 311 } 312 }, 313 "human_studies": { 314 "pre_registered": { 315 "applies": false, 316 "answer": false, 317 "justification": "No human participants in this study. 
All experiments are automated with synthetic data and benchmark datasets.", 318 "source": "opus" 319 }, 320 "irb_or_ethics_approval": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants.", 324 "source": "opus" 325 }, 326 "demographics_reported": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants.", 330 "source": "opus" 331 }, 332 "inclusion_exclusion_criteria": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants.", 336 "source": "opus" 337 }, 338 "randomization_described": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants.", 342 "source": "opus" 343 }, 344 "blinding_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants.", 348 "source": "opus" 349 }, 350 "attrition_reported": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants.", 354 "source": "opus" 355 } 356 }, 357 "cost_and_practicality": { 358 "inference_cost_reported": { 359 "applies": true, 360 "answer": false, 361 "justification": "No inference costs, API costs, token consumption, or latency overhead is reported for any of the spotlighting techniques despite the encoding approach requiring the model to decode base64, which may incur additional compute.", 362 "source": "opus" 363 }, 364 "compute_budget_stated": { 365 "applies": true, 366 "answer": false, 367 "justification": "No total computational budget, API spend, or hardware details are provided despite running experiments across 3 models, multiple tasks, and 1000-document corpora.", 368 "source": "opus" 369 } 370 }, 371 "experimental_rigor": { 372 "seed_sensitivity_reported": { 373 "applies": true, 374 "answer": false, 375 "justification": "No mention of multiple random seeds or seed sensitivity analysis. 
All results appear to be from single runs despite using temperature=1.0, which introduces stochastic variation.", 376 "source": "opus" 377 }, 378 "number_of_runs_stated": { 379 "applies": true, 380 "answer": false, 381 "justification": "The number of experimental runs is never stated. Results are presented as single point estimates with no indication of how many times each experiment was conducted.", 382 "source": "opus" 383 }, 384 "hyperparameter_search_budget": { 385 "applies": true, 386 "answer": false, 387 "justification": "No hyperparameter search budget is reported. The marking tokens, prompt phrasing, and encoding method appear to be hand-selected, and the paper does not document what alternatives were tried or how selections were made.", 388 "source": "opus" 389 }, 390 "best_config_selection_justified": { 391 "applies": true, 392 "answer": true, 393 "justification": "Section 5.3 provides justified recommendations: encoding is recommended for high-capacity models based on lowest ASR (Figure 6), datamarking for general use based on strong ASR reduction without task impact (Figures 4, 7). The progression of results across configurations justifies the selection.", 394 "source": "opus" 395 }, 396 "multiple_comparison_correction": { 397 "applies": false, 398 "answer": false, 399 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable.", 400 "source": "opus" 401 }, 402 "self_comparison_bias_addressed": { 403 "applies": true, 404 "answer": false, 405 "justification": "The authors evaluate their own proposed techniques without acknowledging the potential bias of self-evaluation. No independent evaluation or acknowledgment of author-evaluation bias is provided.", 406 "source": "opus" 407 }, 408 "compute_budget_vs_performance": { 409 "applies": true, 410 "answer": false, 411 "justification": "No discussion of compute differences between spotlighting approaches. 
Encoding requires the model to decode base64, which uses more tokens and compute, but this cost is never quantified or compared against the defense benefit.", 412 "source": "opus" 413 }, 414 "benchmark_construct_validity": { 415 "applies": true, 416 "answer": true, 417 "justification": "Section 4.2 and Appendix 8.1 discuss construct validity of the ASR metric, distinguishing strict ASR from Affected Success Rate (AffSR), providing concrete examples of gray-area cases, and explaining why keyword detection may not capture all attack outcomes.", 418 "source": "opus" 419 }, 420 "scaffold_confound_addressed": { 421 "applies": false, 422 "answer": false, 423 "justification": "No agentic scaffolding is used. Experiments are direct prompt engineering on single LLM calls.", 424 "source": "opus" 425 } 426 }, 427 "data_leakage": { 428 "temporal_leakage_addressed": { 429 "applies": true, 430 "answer": false, 431 "justification": "No discussion of temporal leakage. The NLP benchmarks (SQuAD 2016, IMDB 2011, SuperGLUE 2020) predate the models' training, but this is not discussed.", 432 "source": "opus" 433 }, 434 "feature_leakage_addressed": { 435 "applies": true, 436 "answer": false, 437 "justification": "No discussion of whether the evaluation setup leaks information. The keyword 'canary' in attack payloads could be a distinctive signal that doesn't generalize to real attacks.", 438 "source": "opus" 439 }, 440 "non_independence_addressed": { 441 "applies": true, 442 "answer": false, 443 "justification": "No discussion of independence. The 1000 attack documents are 'variations on a simple keyword payload attack' — the degree of similarity between documents and whether results are inflated by non-independence is not addressed.", 444 "source": "opus" 445 }, 446 "leakage_detection_method": { 447 "applies": true, 448 "answer": false, 449 "justification": "No leakage detection or prevention methods are applied. 
No canary strings, membership inference, or decontamination pipelines are used.", 450 "source": "opus" 451 } 452 } 453 } 454 }, 455 "claims": [ 456 { 457 "claim": "Spotlighting (particularly encoding) reduces indirect prompt injection attack success rate from baseline 50-60% to near 0-2%", 458 "evidence": "Figures 3-6 showing ASR across models and tasks. Encoding achieves 0% on summarization, 1.8% on Q&A.", 459 "supported": "strong" 460 }, 461 { 462 "claim": "Datamarking and encoding transformations preserve task performance on standard NLP benchmarks", 463 "evidence": "Figure 7 shows no detrimental impact of datamarking across SQuAD, SuperGLUE, IMDB. Figure 8 shows encoding works well for GPT-4 but hurts GPT-3.5-Turbo.", 464 "supported": "moderate" 465 }, 466 { 467 "claim": "Encoding is more effective than datamarking, which is more effective than delimiting", 468 "evidence": "Figures 3-6 show progressive improvement: delimiting ~30-50% reduction, datamarking 92-97% reduction, encoding 98-100% reduction.", 469 "supported": "strong" 470 }, 471 { 472 "claim": "The defense works across different GPT model versions and task types", 473 "evidence": "Results tested on text-davinci-003, GPT-3.5-Turbo, GPT-4 with both summarization and Q&A tasks (Figures 3-6).", 474 "supported": "strong" 475 }, 476 { 477 "claim": "Dynamic marking tokens and randomized interleaving positions can make spotlighting harder to subvert", 478 "evidence": "Section 5.4 adversary considerations discuss dynamic tokens providing N^k possible tokens against 1/N^k guessing probability, but not empirically tested.", 479 "supported": "weak" 480 }, 481 { 482 "claim": "System instructions alone provide minimal defense against indirect prompt injection", 483 "evidence": "Figure 2 shows that adding instructions has negligible benefit for GPT-3.5-Turbo and modest benefit for Text-003.", 484 "supported": "strong" 485 } 486 ], 487 "methodology_tags": [ 488 "benchmark-eval", 489 "case-study" 490 ], 491 
"key_findings": "Spotlighting defenses reduce indirect prompt injection attack success rates from 50-60% baseline to near 0-2% through encoding, 3-8% through datamarking. Datamarking preserves task performance across all tested models, while encoding requires high-capacity models (GPT-4) to maintain utility. The techniques represent a structural solution to the LLM architecture's inability to distinguish between code (instructions) and data (untrusted inputs).", 492 "red_flags": [ 493 { 494 "flag": "Single attack type", 495 "detail": "Only simple keyword-payload attacks tested. No evaluation against semantic attacks, multi-step attacks, or adversarially crafted attacks." 496 }, 497 { 498 "flag": "No statistical significance testing", 499 "detail": "No p-values, confidence intervals, or variance reporting. Cannot assess statistical reliability of ASR improvements." 500 }, 501 { 502 "flag": "Synthetic data only", 503 "detail": "Corpus of 1000 synthetic documents does not represent real-world prompt injection attempts in the wild." 504 }, 505 { 506 "flag": "No code or data release", 507 "detail": "Synthetic corpus and implementation code not released. Reproducibility limited to description of technique." 508 }, 509 { 510 "flag": "Model-specific effectiveness", 511 "detail": "Encoding severely degrades task performance on GPT-3.5-Turbo, limiting the technique to high-capacity models such as GPT-4." 512 }, 513 { 514 "flag": "Security assumption", 515 "detail": "Defense assumes attacker does not know about spotlighting. Paper acknowledges this but does not test robustness against informed attackers." 516 }, 517 { 518 "flag": "No human evaluation", 519 "detail": "Lack of user studies or human assessment of whether spotlighting is practical for real-world deployment." 520 }, 521 { 522 "flag": "Benchmark contamination", 523 "detail": "Standard benchmarks used for task evaluation may be in model training data, introducing an unquantified confound." 
524 } 525 ], 526 "cited_papers": [ 527 { 528 "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models", 529 "authors": "Yi et al.", 530 "year": 2023, 531 "arxiv_id": "2312.14197", 532 "relevance": "Direct prior work on XPIA defense and evaluation methodology. This paper extends the approach with novel spotlighting techniques." 533 }, 534 { 535 "title": "More than you've asked for: A Comprehensive Analysis of Novel Prompt Injection Threats to Application-Integrated Large Language Models", 536 "authors": "Greshake et al.", 537 "year": 2023, 538 "arxiv_id": "2302.12173", 539 "relevance": "Seminal work on comprehensive XPIA taxonomy and threat model. Foundational for understanding indirect vs. direct prompt injection." 540 }, 541 { 542 "title": "How We Broke LLMs: Indirect Prompt Injection", 543 "authors": "Greshake, K.", 544 "year": 2022, 545 "relevance": "Early demonstration of XPIA feasibility in Bing Chat. Established the attack vector this paper defends against." 546 }, 547 { 548 "title": "Language models are few-shot learners", 549 "authors": "Brown et al.", 550 "year": 2020, 551 "arxiv_id": "2005.14165", 552 "relevance": "Foundational GPT-3 work. Establishes in-context learning capability that underpins LLM instruction-following exploited by XPIA." 553 }, 554 { 555 "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models", 556 "authors": "Wei et al.", 557 "year": 2023, 558 "arxiv_id": "2201.11903", 559 "relevance": "Prompt engineering technique foundational to understanding how LLMs respond to complex instructions, relevant to XPIA attack construction." 560 }, 561 { 562 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 563 "authors": "Zou et al.", 564 "year": 2023, 565 "arxiv_id": "2307.15043", 566 "relevance": "Adversarial attack methodology on LLMs. Related to broader prompt injection threat model and attack sophistication." 
567 }, 568 { 569 "title": "SQuAD: 100,000+ Questions for Machine Comprehension of Text", 570 "authors": "Rajpurkar et al.", 571 "year": 2016, 572 "arxiv_id": "1606.05250", 573 "relevance": "Standard Q&A benchmark used for task performance evaluation of spotlighting defenses." 574 } 575 ], 576 "engagement_factors": { 577 "practical_relevance": { 578 "score": 3, 579 "justification": "Spotlighting techniques (especially datamarking) are immediately usable by any developer building LLM applications — full example prompts are provided." 580 }, 581 "surprise_contrarian": { 582 "score": 1, 583 "justification": "Confirms the intuition that marking input data helps LLMs distinguish instructions from data; the telecom analogy is novel framing but the finding is expected." 584 }, 585 "fear_safety": { 586 "score": 2, 587 "justification": "Highlights that baseline XPIA success rates exceed 50% on major models, underscoring a significant AI security vulnerability, though it then proposes a mitigation." 588 }, 589 "drama_conflict": { 590 "score": 0, 591 "justification": "No controversy or conflict; straightforward defense paper with positive results." 592 }, 593 "demo_ability": { 594 "score": 1, 595 "justification": "Prompt templates are provided for manual replication, but no code, tool, or demo is released." 596 }, 597 "brand_recognition": { 598 "score": 2, 599 "justification": "From Microsoft Research, evaluating GPT-3.5/GPT-4 — recognizable lab and models." 
600 } 601 }, 602 "hn_data": { 603 "threads": [ 604 { 605 "hn_id": "22768143", 606 "title": "Deep Molecular Programming", 607 "points": 130, 608 "comments": 11, 609 "url": "https://news.ycombinator.com/item?id=22768143" 610 }, 611 { 612 "hn_id": "39466681", 613 "title": "Coercing LLMs to do and reveal almost anything", 614 "points": 12, 615 "comments": 1, 616 "url": "https://news.ycombinator.com/item?id=39466681" 617 }, 618 { 619 "hn_id": "45489599", 620 "title": "Tutorials for Sandia's Lammps Simulation Package", 621 "points": 8, 622 "comments": 1, 623 "url": "https://news.ycombinator.com/item?id=45489599" 624 }, 625 { 626 "hn_id": "44478832", 627 "title": "CodingGenie: A Proactive LLM-Powered Programming Assistant", 628 "points": 5, 629 "comments": 0, 630 "url": "https://news.ycombinator.com/item?id=44478832" 631 }, 632 { 633 "hn_id": "23363404", 634 "title": "“Periodic table” for protons in the nucleus", 635 "points": 4, 636 "comments": 0, 637 "url": "https://news.ycombinator.com/item?id=23363404" 638 }, 639 { 640 "hn_id": "44415220", 641 "title": "Storm – Help LLMs to write very long articles", 642 "points": 2, 643 "comments": 0, 644 "url": "https://news.ycombinator.com/item?id=44415220" 645 }, 646 { 647 "hn_id": "43540243", 648 "title": "AttentionRAG: Attention-Guided Context Pruning in Retrieval-Augmented Generation", 649 "points": 2, 650 "comments": 0, 651 "url": "https://news.ycombinator.com/item?id=43540243" 652 }, 653 { 654 "hn_id": "41125541", 655 "title": "Solving the Traveling Salesman Problem Using a Single Qubit", 656 "points": 2, 657 "comments": 0, 658 "url": "https://news.ycombinator.com/item?id=41125541" 659 }, 660 { 661 "hn_id": "41066825", 662 "title": "Solving the Travelling Salesman Problem Using a Single Qubit", 663 "points": 2, 664 "comments": 0, 665 "url": "https://news.ycombinator.com/item?id=41066825" 666 }, 667 { 668 "hn_id": "40822524", 669 "title": "Do LLMs Have Distinct and Consistent Personality?", 670 "points": 2, 671 "comments": 0, 
672 "url": "https://news.ycombinator.com/item?id=40822524" 673 } 674 ], 675 "top_points": 130, 676 "total_points": 169, 677 "total_comments": 13 678 } 679 }