ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30688B)


      1 {
      2   "paper": {
      3     "title": "Defense Against Prompt Injection Attack by Leveraging Attack Techniques",
      4     "authors": [
      5       "Yulin Chen",
      6       "Haoran Li",
      7       "Zihao Zheng",
      8       "Dekai Wu",
      9       "Yangqiu Song",
     10       "Bryan Hooi"
     11     ],
     12     "year": 2024,
     13     "venue": "Annual Meeting of the Association for Computational Linguistics",
     14     "arxiv_id": "2411.00459",
     15     "doi": "10.48550/arXiv.2411.00459"
     16   },
     17   "scan_version": 3,
     18   "active_modules": [
     19     "experimental_rigor",
     20     "data_leakage"
     21   ],
     22   "methodology_tags": [
     23     "benchmark-eval"
     24   ],
     25   "key_findings": "The paper inverts prompt injection attack techniques to create novel training-free defense methods. The Fake Completion with Template (Fakecom-t) defense reduces ASR to near zero in many scenarios across both direct and indirect attacks. The approach outperforms existing training-free defenses (Sandwich, Instructional, Reminder, Isolation, Spotlight) and is comparable to fine-tuning-based methods like StruQ. The authors observe that stronger attack techniques tend to produce stronger defense methods when repurposed.",
     26   "checklist": {
     27     "artifacts": {
     28       "code_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "Code is publicly available at https://github.com/LukeChen-go/pia-defense-by-attack, stated in footnote 1 of the abstract."
     32       },
     33       "data_released": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "The paper uses publicly available datasets: 208 samples from AlpacaFarm (Dubois et al., 2024), QA dataset filtered by Li et al. (2023b) with 2000 samples, and SST2 (Socher et al., 2013). All are standard public benchmarks."
     37       },
     38       "environment_specified": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Appendix A.1 mentions PyTorch 2.1.0 and a single NVIDIA A100 GPU, but provides no requirements.txt, Dockerfile, or complete list of library versions needed to recreate the environment."
     42       },
     43       "reproduction_instructions": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released, the paper does not describe a README or specific commands to replicate experiments."
     47       }
     48     },
     49     "statistical_methodology": {
     50       "confidence_intervals_or_error_bars": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "All results in Tables 1-12 are reported as point estimates (e.g., ASR percentages, accuracy percentages) with no confidence intervals or error bars."
     54       },
     55       "significance_tests": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "Claims that defense methods 'outperform' baselines are based solely on comparing ASR numbers. No statistical significance tests (t-tests, bootstrap, etc.) are used anywhere in the paper."
     59       },
     60       "effect_sizes_reported": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "ASR values are reported for both baselines and proposed methods with full context. For example, in Table 2, indirect attack ASR drops from 86.00% (no defense) to 0.05% (Fakecom-t) for Combined attack on Llama3, providing clear magnitude of improvement."
     64       },
     65       "sample_size_justified": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No justification for the 208-sample AlpacaFarm subset for direct attacks or the 2000-sample QA dataset for indirect attacks. No power analysis is mentioned."
     69       },
     70       "variance_reported": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "Results are single-run numbers with no variance, standard deviation, or spread measures reported. While generation uses do_sample=false (deterministic), this is not stated as justification for omitting variance."
     74       }
     75     },
     76     "evaluation_design": {
     77       "baselines_included": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Five training-free defense baselines are compared: Sandwich, Instructional, Reminder, Isolation, and Spotlight (Section 5.2.2). The fine-tuning-based StruQ method is also compared in Table 7."
     81       },
     82       "baselines_contemporary": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Baselines are from 2023-2024: Sandwich (2023), Instructional (2023), Reminder (Yi et al., 2023), Isolation (Willison, 2023), Spotlight (Hines et al., 2024), StruQ (Chen et al., 2024). These represent recent work in the field."
     86       },
     87       "ablation_study": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Section 5.4 contains extensive ablation studies: testing on closed-source models, gradient-based attacks, fake completion attack with template, attack-defense strength relationship, comparison with fine-tuning, long input handling, and impact of deleting data content."
     91       },
     92       "multiple_metrics": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Two metrics are used: Attack Success Rate (ASR) for security evaluation (Tables 1-2) and accuracy for model utility evaluation (Tables 3, 11). Time cost per item is also reported (Table 8)."
     96       },
     97       "human_evaluation": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "All evaluation is automated. ASR checks whether the injected instruction's answer appears in the response via string matching. Accuracy checks whether the golden answer appears. No human evaluation of response quality is performed."
    101       },
    102       "held_out_test_set": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The methods are training-free (prompt engineering), so no dev/validation set is used for tuning. All evaluation datasets (AlpacaFarm, QA, SST2) are used purely for testing, with no selection decisions made on them."
    106       },
    107       "per_category_breakdown": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Results are broken down by attack type (Naive, Ignore, Escape, Fakecom, Combined), by victim model (Llama3, Qwen2, Llama3.1, GPT-3.5-Turbo, GPT-4o-Latest), and by attack scenario (direct vs indirect)."
    111       },
    112       "failure_cases_discussed": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Section 5.5 provides a case study showing where attacks succeed and defenses fail. Figure 16 shows examples where the model executes both the original and injected instructions despite defense attempts."
    116       },
    117       "negative_results_reported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The Escape defense shows notably worse performance than other proposed methods in several settings (e.g., Table 1, Ours-Escape on Fakecom attack for Qwen2: 70.19% ASR). The paper also notes that 'Ours-Ignore' and 'Ours-Fakecom' methods result in 'only a limited decrease in ASR' against the fake completion attack with template (Section 5.4)."
    121       }
    122     },
    123     "claims_and_evidence": {
    124       "abstract_claims_supported": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The abstract claims 'outperform existing defense approaches' and 'ASR approaching zero in some scenarios.' Tables 1-2 support both: Fakecom-t achieves 0.0% ASR in multiple settings, and all proposed methods outperform baselines across models and attack types."
    128       },
    129       "causal_claims_justified": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The main causal claims ('our method outperforms baselines') are supported by controlled experiments that vary only the defense method while holding the attack, model, and dataset constant. Ablation studies in Section 5.4 systematically test individual components."
    133       },
    134       "generalization_bounded": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The title 'Defense Against Prompt Injection Attack' and the abstract's claim of 'state-of-the-art results' are not bounded to the tested settings. Evaluation uses only three 7-8B open-source models and two closed-source models, two attack datasets (AlpacaFarm and the filtered QA set), and specific attack templates. The paper does not discuss whether results generalize to other model sizes, languages, or attack types."
    138       },
    139       "alternative_explanations_discussed": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper provides an intuitive explanation (attack and defense share similar goals) but does not consider alternative explanations such as position bias (defense prompts appear last), recency effects in LLMs, or whether the defense works by confusing the model rather than actually 'ignoring' injected content."
    143       },
    144       "proxy_outcome_distinction": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "ASR (checking if injected instruction's answer appears in response) is a direct measure of defense effectiveness, and accuracy (checking if golden answer appears) directly measures utility. The measurements match the claimed outcomes without proxy gaps."
    148       }
    149     },
    150     "setup_transparency": {
    151       "model_versions_specified": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "Open-source models are specified with identifiers (Llama3-8b-Instruct, Qwen2-7b-Instruct, Llama3.1-8b-Instruct). However, closed-source models are listed as 'GPT-3.5-Turbo' and 'GPT-4o-Latest' without snapshot dates or API versions, which the schema requires."
    155       },
    156       "prompts_provided": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Full defense prompt structures are shown in Figure 2 (a-d) and Figures 4-8 for baselines. Attack prompt structures are shown in Figures 9-15. The exact prompt templates including shield prompts and instruction placement are provided. The only variable part is the user instruction from the dataset."
    160       },
    161       "hyperparameters_reported": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Appendix A.1 reports: do_sample=false, max_new_tokens=256, max_length=8192. These are the key generation parameters."
    165       },
    166       "scaffolding_described": {
    167         "applies": false,
    168         "answer": false,
    169         "justification": "No agentic scaffolding is used. The defense methods are prompt engineering techniques applied directly to LLM inputs."
    170       },
    171       "data_preprocessing_documented": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Section 5.1 describes the evaluation setup: 208 samples from AlpacaFarm following Chen et al. (2024) for direct attacks, 2000 samples from the QA dataset filtered by Li et al. (2023b) for indirect attacks, and SST2 for utility. The attack injection process is described in Section 3."
    175       }
    176     },
    177     "limitations_and_scope": {
    178       "limitations_section_present": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "A dedicated 'Limitations' section discusses three specific limitations: inability to test long-query truncation, not using gradient-based attacks as defense, and lack of mathematical proof."
    182       },
    183       "threats_to_validity_specific": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The limitations are specific to this study: 'a benchmark of long queries for prompt injection research has not yet been established,' 'we do not employ gradient-based attack methods as defense methods, as previous studies have shown that their performance is not satisfactory,' and the absence of mathematical proof for why prompt-engineering-based methods work."
    187       },
    188       "scope_boundaries_stated": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The Limitations section explicitly states what was NOT tested: long-query scenarios, gradient-based attacks as defense methods, and theoretical/mathematical analysis. These bound the scope of the findings."
    192       }
    193     },
    194     "data_integrity": {
    195       "raw_data_available": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No raw experimental outputs (model responses, per-sample ASR results) are released. Only aggregate statistics in tables are provided."
    199       },
    200       "data_collection_described": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Data sources are clearly described: AlpacaFarm (208 samples, Dubois et al. 2024), filtered QA dataset (2000 samples, Li et al. 2023b), SST2 (Socher et al. 2013). The evaluation protocol references Chen et al. (2024)."
    204       },
    205       "recruitment_methods_described": {
    206         "applies": false,
    207         "answer": false,
    208         "justification": "No human participants. All data sources are standard public benchmarks (AlpacaFarm, QA dataset, SST2)."
    209       },
    210       "data_pipeline_documented": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "The pipeline is documented: take benchmark dataset → inject attack content into data → apply defense method → run LLM with configured parameters → check output for injected answer (ASR) or golden answer (accuracy). Sections 3-5 describe each step."
    214       }
    215     },
    216     "conflicts_of_interest": {
    217       "funding_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "The Acknowledgment section states: 'The work described in this paper was conducted in full or in part by Dr. Haoran Li, JC STEM Early Career Research Fellow, supported by The Hong Kong Jockey Club Charities Trust.'"
    221       },
    222       "affiliations_disclosed": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "All author affiliations are listed: National University of Singapore, HKUST, and Harbin Institute of Technology, Shenzhen. Authors are from academic institutions and are not evaluating their own commercial products."
    226       },
    227       "funder_independent_of_outcome": {
    228         "applies": true,
    229         "answer": true,
    230         "justification": "The Hong Kong Jockey Club Charities Trust is a charitable foundation with no financial stake in prompt injection defense outcomes."
    231       },
    232       "financial_interests_declared": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No competing interests or financial interests statement is included in the paper. While an Ethical Consideration section is present, it does not address financial conflicts."
    236       }
    237     },
    238     "contamination": {
    239       "training_cutoff_stated": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "This paper tests defense methods against prompt injection attacks, not model knowledge or capability on benchmarks. Contamination of training data with benchmark answers would not affect ASR (whether injected instructions are followed)."
    243       },
    244       "train_test_overlap_discussed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "The paper evaluates defense effectiveness, not model knowledge. Train/test overlap is not relevant to whether a defense prompt prevents instruction injection."
    248       },
    249       "benchmark_contamination_addressed": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "The evaluation measures defense effectiveness (ASR), not model capability. Benchmark contamination does not affect the core evaluation metric."
    253       }
    254     },
    255     "human_studies": {
    256       "pre_registered": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study. All experiments are automated using LLMs and benchmark datasets."
    260       },
    261       "irb_or_ethics_approval": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants. The Ethical Consideration section acknowledges the ACM Code of Ethics but no IRB is needed."
    265       },
    266       "demographics_reported": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in the study."
    270       },
    271       "inclusion_exclusion_criteria": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in the study."
    275       },
    276       "randomization_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in the study."
    280       },
    281       "blinding_described": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in the study."
    285       },
    286       "attrition_reported": {
    287         "applies": false,
    288         "answer": false,
    289         "justification": "No human participants in the study."
    290       }
    291     },
    292     "cost_and_practicality": {
    293       "inference_cost_reported": {
    294         "applies": true,
    295         "answer": true,
    296         "justification": "Table 8 reports time cost per item (in seconds) for all defense methods across three models. For example, Ours-Fakecom-t costs 0.766 seconds/item on Llama3 vs. 0.645 for no defense."
    297       },
    298       "compute_budget_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "Appendix A.1 mentions 'a single NVIDIA A100 GPU' but does not state total GPU hours or total compute time for all experiments. Only per-item timing is provided in Table 8."
    302       }
    303     },
    304     "experimental_rigor": {
    305       "seed_sensitivity_reported": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "No seed sensitivity analysis is reported. While do_sample=false makes generation deterministic, the paper does not state this as justification or test whether other sources of variance exist."
    309       },
    310       "number_of_runs_stated": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The number of experimental runs is never explicitly stated. Results appear to be from a single deterministic run (do_sample=false) but this is not documented as such."
    314       },
    315       "hyperparameter_search_budget": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No hyperparameter search is described. Generation parameters (do_sample=false, max_new_tokens=256, max_length=8192) appear to have been fixed without any search, and the paper gives neither a search budget nor a justification for the chosen values."
    319       },
    320       "best_config_selection_justified": {
    321         "applies": true,
    322         "answer": true,
    323         "justification": "All four proposed defense methods (Ignore, Escape, Fakecom, Fakecom-t) are reported in every experiment, not just the best one. No cherry-picking of configurations is apparent."
    324       },
    325       "multiple_comparison_correction": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The paper makes dozens of comparative claims across 4 defense methods × 5 attack methods × 3-5 models without any statistical tests or multiple comparison corrections."
    329       },
    330       "self_comparison_bias_addressed": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The authors implement both their defense methods and the baselines without acknowledging potential bias in their implementation of baseline methods. The paper thanks Chen et al. (2024) for 'providing the baseline code' but does not discuss whether all baselines were implemented with equal care."
    334       },
    335       "compute_budget_vs_performance": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "Table 8 reports time cost per item for all defense methods, allowing comparison of computational overhead versus defense performance. The overhead is shown to be negligible (all methods within 0.6-1.6 seconds/item)."
    339       },
    340       "benchmark_construct_validity": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The paper does not discuss whether the ASR metric (checking if injected answer appears in response) truly captures defense effectiveness. String matching could produce false positives/negatives. No analysis of construct validity for either AlpacaFarm or the QA dataset as prompt injection evaluation benchmarks."
    344       },
    345       "scaffold_confound_addressed": {
    346         "applies": false,
    347         "answer": false,
    348         "justification": "No agentic scaffolding is used. Defense methods are prompt engineering techniques applied directly to LLM inputs."
    349       }
    350     },
    351     "data_leakage": {
    352       "temporal_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the LLMs' training data includes information from AlpacaFarm, the QA dataset, or SST2 that could affect the utility evaluation results."
    356       },
    357       "feature_leakage_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether the evaluation setup leaks information that would not be present in real deployment scenarios."
    361       },
    362       "non_independence_addressed": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No discussion of whether test examples share structural similarities or are drawn from overlapping distributions."
    366       },
    367       "leakage_detection_method": {
    368         "applies": true,
    369         "answer": false,
    370         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination pipelines are used."
    371       }
    372     }
    373   },
    374   "claims": [
    375     {
    376       "claim": "Defense methods based on attack techniques outperform existing training-free defense baselines across multiple attack types and models.",
    377       "evidence": "Tables 1 and 2 show consistent ASR reductions. In indirect attacks (Table 2), Fakecom-t achieves 0.05% ASR on Combined attack for Llama3, compared to 21.25% for the best baseline (Sandwich). Results hold across all three victim models and five attack types.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "The Fakecom-t defense achieves near-zero ASR in certain scenarios.",
    382       "evidence": "Table 1 shows 0.0% ASR for Fakecom-t against Fakecom attack on Llama3. Table 2 shows 0.05% ASR in 8 out of 15 model-attack combinations for Fakecom-t in indirect attacks.",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Stronger attack methods lead to stronger defense methods when repurposed.",
    387       "evidence": "Figure 3 shows the relationship between attack ASR (averaged across defenses) and defense ASR (averaged across attacks), demonstrating a positive correlation with one exception on Qwen2/Llama3.1.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Defense methods do not significantly degrade model utility.",
    392       "evidence": "Table 3 shows QA accuracy is maintained or slightly improved (e.g., Llama3: 78.05% baseline vs. 80.75% with Ours-Fakecom). Table 11 shows sentiment analysis accuracy is similarly preserved across all defense methods.",
    393       "supported": "strong"
    394     },
    395     {
    396       "claim": "Methods are effective on closed-source models (GPT-3.5-Turbo, GPT-4o-Latest).",
    397       "evidence": "Table 4 shows Ours-Ignore achieves 0.0-4.32% ASR on GPT-3.5-Turbo and 0.0-0.90% on GPT-4o-Latest across attack types, outperforming all baselines.",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "Methods generalize to defend against gradient-based attacks (GCG, AutoDAN).",
    402       "evidence": "Table 5 shows Fakecom-t reduces ASR from 87.01% to 9.61% against GCG and from 68.75% to 10.57% against AutoDAN on Llama3, outperforming all baselines.",
    403       "supported": "moderate"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "No error bars or statistical tests",
    409       "detail": "All results across 12 tables are reported as point estimates without confidence intervals, error bars, or statistical significance tests. Claims of outperformance rely entirely on comparing raw numbers."
    410     },
    411     {
    412       "flag": "Simple string-matching evaluation metric",
    413       "detail": "ASR is measured by checking if the injected instruction's answer appears in the model's response. This binary string-matching approach could produce false positives (partial matches) or false negatives (paraphrased outputs). No discussion of metric validity."
    414     },
    415     {
    416       "flag": "Unbounded generalization claims",
    417       "detail": "The paper claims 'state-of-the-art results' for prompt injection defense but tests only on 7-8B parameter open-source models and two closed-source models, with specific attack templates. No discussion of how results generalize to different model families, sizes, or novel attack types."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Struq: Defending against prompt injection with structured queries",
    423       "authors": [
    424         "Sizhe Chen",
    425         "Julien Piet",
    426         "Chawin Sitawarin",
    427         "David Wagner"
    428       ],
    429       "year": 2024,
    430       "arxiv_id": "2402.06363",
    431       "relevance": "Fine-tuning-based defense against prompt injection attacks, used as a key baseline comparison in this paper."
    432     },
    433     {
    434       "title": "Ignore previous prompt: Attack techniques for language models",
    435       "authors": [
    436         "Fábio Perez",
    437         "Ian Ribeiro"
    438       ],
    439       "year": 2022,
    440       "arxiv_id": "2211.09527",
    441       "relevance": "Foundational work on prompt injection attack techniques that forms the basis for the defense methods proposed in this paper."
    442     },
    443     {
    444       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    445       "authors": [
    446         "Kai Greshake",
    447         "Sahar Abdelnabi",
    448         "Shailesh Mishra",
    449         "Christoph Endres",
    450         "Thorsten Holz",
    451         "Mario Fritz"
    452       ],
    453       "year": 2023,
    454       "relevance": "Seminal work on indirect prompt injection attacks against LLM-integrated applications."
    455     },
    456     {
    457       "title": "Defending against indirect prompt injection attacks with spotlighting",
    458       "authors": [
    459         "Keegan Hines",
    460         "Gary Lopez",
    461         "Matthew Hall",
    462         "Federico Zarfati",
    463         "Yonatan Zunger",
    464         "Emre Kiciman"
    465       ],
    466       "year": 2024,
    467       "arxiv_id": "2403.14720",
    468       "relevance": "Training-free defense method using special tokens to mark data content, used as a baseline in this paper."
    469     },
    470     {
    471       "title": "The instruction hierarchy: Training LLMs to prioritize privileged instructions",
    472       "authors": [
    473         "Eric Wallace",
    474         "Kai Xiao",
    475         "Reimar Leike",
    476         "Lilian Weng",
    477         "Johannes Heidecke",
    478         "Alex Beutel"
    479       ],
    480       "year": 2024,
    481       "arxiv_id": "2404.13208",
    482       "relevance": "Fine-tuning approach granting privileged status to authorized instructions for prompt injection defense."
    483     },
    484     {
    485       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    486       "authors": [
    487         "Yupei Liu",
    488         "Yuqi Jia",
    489         "Runpeng Geng",
    490         "Jinyuan Jia",
    491         "Neil Zhenqiang Gong"
    492       ],
    493       "year": 2024,
    494       "relevance": "Formalizes prompt injection attack/defense framework including combined attack methods used in this paper."
    495     },
    496     {
    497       "title": "Universal and transferable adversarial attacks on aligned language models",
    498       "authors": [
    499         "Andy Zou",
    500         "Zifan Wang",
    501         "Nicholas Carlini",
    502         "Milad Nasr",
    503         "J Zico Kolter",
    504         "Matt Fredrikson"
    505       ],
    506       "year": 2023,
    507       "arxiv_id": "2307.15043",
    508       "relevance": "GCG attack method used to evaluate defense transferability against gradient-based attacks."
    509     },
    510     {
    511       "title": "Evaluating the instruction-following robustness of large language models to prompt injection",
    512       "authors": [
    513         "Zekun Li",
    514         "Baolin Peng",
    515         "Pengcheng He",
    516         "Xifeng Yan"
    517       ],
    518       "year": 2023,
    519       "relevance": "Provides the filtered QA evaluation dataset and evaluation protocol used in this paper's indirect attack experiments."
    520     },
    521     {
    522       "title": "Automatic and universal prompt injection attacks against large language models",
    523       "authors": [
    524         "Xiaogeng Liu",
    525         "Zhiyuan Yu",
    526         "Yizhe Zhang",
    527         "Ning Zhang",
    528         "Chaowei Xiao"
    529       ],
    530       "year": 2024,
    531       "arxiv_id": "2403.04957",
    532       "relevance": "Automatic prompt injection attack methods relevant to LLM security evaluation."
    533     },
    534     {
    535       "title": "Jatmo: Prompt injection defense by task-specific finetuning",
    536       "authors": [
    537         "Julien Piet",
    538         "Maha Alrashed",
    539         "Chawin Sitawarin",
    540         "Sizhe Chen",
    541         "Zeming Wei",
    542         "Elizabeth Sun",
    543         "Basel Alomair",
    544         "David Wagner"
    545       ],
    546       "year": 2023,
    547       "arxiv_id": "2312.17673",
    548       "relevance": "Fine-tuning defense that trains models for specific tasks to prevent malicious instruction following."
    549     },
    550     {
    551       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    552       "authors": [
    553         "Jingwei Yi",
    554         "Yueqi Xie",
    555         "Bin Zhu",
    556         "Keegan Hines",
    557         "Emre Kiciman",
    558         "Guangzhong Sun",
    559         "Xing Xie",
    560         "Fangzhao Wu"
    561       ],
    562       "year": 2023,
    563       "arxiv_id": "2312.14197",
    564       "relevance": "Benchmarking framework for indirect prompt injection with Reminder defense baseline used in this paper."
    565     }
    566   ],
    567   "engagement_factors": {
    568     "practical_relevance": {
    569       "score": 2,
    570       "justification": "Provides training-free, immediately deployable prompt injection defense techniques with open-source code that developers building LLM applications can integrate."
    571     },
    572     "surprise_contrarian": {
    573       "score": 2,
    574       "justification": "The core insight that attack techniques can be directly repurposed as defenses is counterintuitive and elegantly simple."
    575     },
    576     "fear_safety": {
    577       "score": 2,
    578       "justification": "Prompt injection is OWASP's #1 LLM security risk and the paper demonstrates both attack vectors and concrete defenses."
    579     },
    580     "drama_conflict": {
    581       "score": 1,
    582       "justification": "Implicitly challenges existing defense methods as inadequate but doesn't call out specific companies or create controversy."
    583     },
    584     "demo_ability": {
    585       "score": 1,
    586       "justification": "Code is publicly available on GitHub but requires setting up local LLM inference with specific models and datasets."
    587     },
    588     "brand_recognition": {
    589       "score": 1,
    590       "justification": "Authors from NUS and HKUST are recognized institutions but not household names; the paper tests on GPT-4o but isn't from a major AI lab."
    591     }
    592   }
    593 }

Impressum · Datenschutz