ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28743B)


      1 {
      2   "paper": {
      3     "title": "The Attacker Moves Second: Stronger Adaptive Attacks Bypass Defenses Against LLM Jailbreaks and Prompt Injections",
      4     "authors": [
      5       "Milad Nasr",
      6       "Nicholas Carlini",
      7       "Chawin Sitawarin",
      8       "Sander V. Schulhoff",
      9       "Jamie Hayes",
     10       "Michael Ilie",
     11       "Juliette Pluto",
     12       "Shuang Song",
     13       "Harsh Chaudhari",
     14       "Ilia Shumailov",
     15       "Abhradeep Thakurta",
     16       "Kai Yuanqing Xiao",
     17       "Andreas Terzis",
     18       "Florian Tramèr"
     19     ],
     20     "year": 2025,
     21     "venue": "Preprint (arXiv, under review)",
     22     "arxiv_id": "2510.09023"
     23   },
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No code repository URL is provided in the paper. The paper describes attack methods in detail (Appendix A-D) but does not link to any public implementation."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper uses publicly available benchmarks: HarmBench, AgentDojo, OpenPromptInject, and the Alpaca/Davinci dataset (with explicit HuggingFace URL in Section B.1). These are standard public benchmarks."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No requirements file, Dockerfile, or detailed environment specification is provided. The paper mentions using Gemini-2.5 Pro as LmMutator and various LLM APIs but does not provide library versions or reproducible environment details."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The appendices describe the attack methods at a conceptual level, but there are no step-by-step commands, scripts, or README that would allow a researcher to reproduce the main results without significant guesswork."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Tables 2-7 and Figure 1 report only point estimates of attack success rates (ASR). No confidence intervals, error bars, or standard deviations are reported anywhere in the paper."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The paper compares ASRs between static/weak attacks and adaptive attacks (e.g., 0% vs. 99% for Spotlighting), but no statistical significance tests are performed. Differences are compared numerically without hypothesis testing."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "The paper reports absolute ASR values for both static attacks and adaptive attacks, providing sufficient context for effect size (e.g., 'near-zero ASR' for static vs. '>90% ASR' for adaptive). Figure 1 and Table 7 make magnitude explicit."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper selects 80 samples from AgentDojo (Slack, Travel, Workspace suites) without justifying this sample size. The human red-teaming study uses 500+ participants but does not discuss power or adequacy of the sample for drawing conclusions."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": false,
     71         "justification": "Table 7 reports 'Median Num. Queries' for search attacks but no standard deviation, IQR, or other spread measure. ASR figures are single point estimates across runs with no variance reported."
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Each defense is evaluated against both the original paper's static/weak attack (the baseline) and the authors' adaptive attacks, providing a clear before/after comparison. Figure 1 shows side-by-side ASR for static and adaptive attacks."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "The paper evaluates 12 defenses published in 2024-2025 using their own evaluation protocols. The GCG attack used as one baseline dates from 2023 and is still considered a standard reference attack. The authors note that the original defense evaluations relied on generic existing attacks such as GCG, which is precisely the weakness the paper sets out to demonstrate."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "There is no ablation study on the attack components themselves. The paper describes four attack families (gradient, RL, search, human) and uses them separately, but does not systematically ablate individual components of the search or RL attack to measure each component's contribution."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "The evaluation uses multiple metrics: attack success rate (ASR), number of queries until first success (Table 7, Figure 6), utility of the defended system, and qualitative analysis of successful attack examples."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper runs a human red-teaming competition with 500+ participants and $20,000 in prizes (Section E). Human attacks are compared directly against automated attacks across 29 scenarios, making human evaluation a central component."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper evaluates on fixed benchmark datasets (HarmBench, AgentDojo, OpenPromptInject) that are separate from anything used to design the attacks. The attacks are adaptive by design rather than tuned to a specific held-out set."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down by defense type (Prompting, Training, Filtering Model, Secret Knowledge) and by individual defense within each category. Table 7 further breaks down by model and defense combination."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper discusses reward hacking as a failure mode (Appendix C.1, Figure 5) where the RL attacker finds spurious strategies that score well but do not constitute real attacks. PIGuard's lower ASR (71%) relative to other defenses is acknowledged."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": true,
    118         "justification": "The paper reports that gradient-based attacks are 'generally unreliable' and recommends attacks in the text space instead (Section 4). The paper also notes in Appendix F that search attacks succeed on only 69% of scenarios vs. 100% for human red-teamers, acknowledging limitations of automated attacks."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims that 12 defenses are bypassed, with ASR above 90% for most. This is supported by Figure 1 and Tables 1-7: some defenses (PIGuard at 71%, MELON human at 89%) fall below the 90% threshold, but the abstract accurately qualifies its claim with 'above 90% for most.'"
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper's causal claim is that adaptive attackers cause high ASR compared to static attacks. This is supported by directly applying adaptive attacks to defended systems and measuring ASR. The controlled comparison (same defense, different attack strength) provides adequate causal identification."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper explicitly states 'robustness numbers are not necessarily comparable across defenses' and 'the goal of this section is not to provide a full evaluation of defenses across all attacks' (Section 5). Conclusions are bounded to the 12 tested defenses and specific benchmarks."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": true,
    140         "justification": "Appendix F.1 discusses challenges in interpreting results, including that human ASR may be artificially lower due to inexperienced participants, and that comparison between human and automated attacks is complicated by differing query budgets and participant skill variance."
    141       }
    142     },
    143     "setup_transparency": {
    144       "model_versions_specified": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper mentions 'Gemini-2.5 Pro', 'GPT-5', 'Llama-3.3-70B', and 'Gemini-2.5 Flash' but does not provide specific API versions or snapshot dates. Marketing names without version identifiers do not satisfy this criterion."
    148       },
    149       "prompts_provided": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The paper shows examples of successful attack triggers (e.g., the RL trigger against RPO in Section 5.1, the search trigger against Prompt Sandwiching), but these are outputs of the attack optimization process, not the prompts used to drive the experiments. The actual experimental prompts — the LmMutator system prompt, the critic LLM prompt, the scoring function prompts — are described only in natural language (e.g., 'consists of sections: broad context, the attacker's task, and other miscellaneous information' in Appendix D). Per the schema, 'If prompts are described only in natural language without the actual text, NO.' The LmMutator system prompt is stated to have been 'generated by another LLM and then manually edited' but its actual text is not provided."
    153       },
    154       "hyperparameters_reported": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "The RL attack uses GRPO with 32 independent sessions and 5 rounds per session (Appendix A.2), but key hyperparameters like learning rate, batch size, temperature, and reward weights are not fully specified. Search attack uses Gemini-2.5 Pro with 'maximum thinking budget' (not a numerical value)."
    158       },
    159       "scaffolding_described": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 4 and Appendices A-D describe the attack scaffolding in detail: the four-step PSSU loop (Propose, Score, Select, Update), the MAP Elites controller, LmMutator design, and scoring functions for each defense (Appendix C)."
    163       },
    164       "data_preprocessing_documented": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "Section B.1 describes the benchmarks used. For AgentDojo, the paper specifies using 80 samples from Slack, Travel, and Workspace suites and explains why Banking was excluded. For OpenPromptInject, the source dataset and task pairs are described."
    168       }
    169     },
    170     "limitations_and_scope": {
    171       "limitations_section_present": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "There is no dedicated limitations or threats-to-validity section. Section 6 ('Lessons and Discussion') and Appendix F.1 discuss challenges, but there is no section explicitly titled 'Limitations' or equivalent."
    175       },
    176       "threats_to_validity_specific": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Appendix F.1 provides specific discussion of validity threats: human skill heterogeneity affecting ASR comparisons, differing query costs between human and automated attacks, and the fact that human collective success may not reflect individual performance."
    180       },
    181       "scope_boundaries_stated": {
    182         "applies": true,
    183         "answer": true,
    184         "justification": "Section 5 explicitly states 'the goal of this section is not to provide a full evaluation of defenses across all attacks or to compare the effectiveness of multiple defenses.' The paper is scoped to demonstrating that adaptive attacks exist, not ranking defenses."
    185       }
    186     },
    187     "data_integrity": {
    188       "raw_data_available": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "The raw attack outputs, successful trigger strings, and model response logs are not publicly available. Successful prompt examples are shown selectively in the paper but the full dataset of attack attempts is not released."
    192       },
    193       "data_collection_described": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The human red-teaming competition setup is described in detail in Appendix E: online platform, 500+ participants, $20,000 prize structure, AgentDojo evaluation environment, scoring mechanism, and human judge adjudication for edge cases."
    197       },
    198       "recruitment_methods_described": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The paper states 'we run a human AI red-teaming competition' with 500+ participants but does not describe how participants were recruited, what channels were used, or whether the participant pool introduces selection bias toward more skilled attackers."
    202       },
    203       "data_pipeline_documented": {
    204         "applies": true,
    205         "answer": true,
    206         "justification": "Appendix B describes the benchmark setup, Appendix C describes scoring functions for each defense, and Appendix E.2 describes how human submissions are evaluated (automatic + human adjudication for appeals). The pipeline from attack submission to ASR is clear."
    207       }
    208     },
    209     "conflicts_of_interest": {
    210       "funding_disclosed": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "There is no acknowledgments section disclosing grants, funding agencies, or corporate sponsors. The human competition offered $20,000 in prizes but the funding source for this is not disclosed."
    214       },
    215       "affiliations_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Authors are affiliated with OpenAI, Anthropic, Google DeepMind, HackAPrompt, Northeastern University, ETH Zürich, MATS, and AI Security Company. These affiliations are listed on the first page."
    219       },
    220       "funder_independent_of_outcome": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "Authors from Anthropic and Google DeepMind run the attacks against the defenses while using Gemini-2.5 Pro (Google's model) both as the LmMutator and as the base model for some evaluations. OpenAI employees are also on the team, and 'GPT-5' models are among those evaluated. These relationships are not independent of the outcome."
    224       },
    225       "financial_interests_declared": {
    226         "applies": true,
    227         "answer": false,
    228         "justification": "No competing interests statement or financial conflict-of-interest declaration is present in the paper. Authors at AI companies (OpenAI, Anthropic, Google) have potential financial interests but these are not formally declared."
    229       }
    230     },
    231     "contamination": {
    232       "training_cutoff_stated": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The paper does not evaluate pre-trained model knowledge on a benchmark — it evaluates security defenses against adversarial attacks. Contamination of training data is not relevant to measuring defense robustness to adaptive attacks."
    236       },
    237       "train_test_overlap_discussed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "Same as above: the paper tests whether attacks can bypass defenses, not whether models have memorized test examples. Contamination is not applicable to this evaluation design."
    241       },
    242       "benchmark_contamination_addressed": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "Same as above: benchmark contamination is not relevant to adaptive attack evaluations of security defenses."
    246       }
    247     },
    248     "human_studies": {
    249       "pre_registered": {
    250         "applies": true,
    251         "answer": false,
    252         "justification": "The human red-teaming competition involved human participants making decisions, but no pre-registration link (OSF, AsPredicted, etc.) is provided."
    253       },
    254       "irb_or_ethics_approval": {
    255         "applies": true,
    256         "answer": false,
    257         "justification": "The Ethics Statement (Section after Conclusion) mentions 'all participation was voluntary' and 'no personally identifiable or sensitive data were collected,' but no IRB or ethics board approval is mentioned."
    258       },
    259       "demographics_reported": {
    260         "applies": true,
    261         "answer": false,
    262         "justification": "The paper states '500+ participants' in the competition but provides no demographic information (experience level, geographic distribution, professional background)."
    263       },
    264       "inclusion_exclusion_criteria": {
    265         "applies": true,
    266         "answer": false,
    267         "justification": "No inclusion or exclusion criteria for competition participants are stated. The paper does not describe any screening or eligibility requirements."
    268       },
    269       "randomization_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "The human red-teaming is not an experimental study with random assignment to conditions — participants self-select which challenges to attempt. Randomization is not applicable."
    273       },
    274       "blinding_described": {
    275         "applies": true,
    276         "answer": true,
    277         "justification": "Appendix E.3 states that 'the name of the model is replaced with a pseudonym, and participants are unaware of defenses potentially being deployed,' constituting partial blinding of participants."
    278       },
    279       "attrition_reported": {
    280         "applies": true,
    281         "answer": false,
    282         "justification": "The paper mentions 500+ total participants but notes 'not every participant attempts every scenario.' The dropout or participation rate per scenario is not systematically reported."
    283       }
    284     },
    285     "cost_and_practicality": {
    286       "inference_cost_reported": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "The paper uses Gemini-2.5 Pro with 'maximum thinking budget' as LmMutator, runs 32 independent sessions per attack instance, and tests across 12 defenses with multiple benchmarks, but no API costs or token counts are reported."
    290       },
    291       "compute_budget_stated": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No total computational budget, GPU hours, or API spend is stated. The competition prizes ($20,000) are mentioned but the infrastructure and compute costs of running the attacks are not quantified."
    295       }
    296     }
    297   },
    298   "claims": [
    299     {
    300       "claim": "Adaptive attacks bypass 12 recent LLM defenses with attack success rate above 90% for most, compared to near-zero rates reported in original papers.",
    301       "evidence": "Figure 1 shows side-by-side ASR for static attacks vs. adaptive attacks across all 12 defenses. Table 7 provides per-defense, per-model breakdown with specific ASR values.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "Human red-teaming succeeds on all tested scenarios (100% ASR) while search-based attacks succeed on 69% of the 29 overlapping scenarios.",
    306       "evidence": "Figure 6 and Appendix F.2 compare human and automated attack success rates on 29 AgentDojo scenarios. The human collective ASR curve reaches 100% while search attack plateaus at 69%.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Prompting-based defenses (Spotlighting, Prompt Sandwiching) can be bypassed with ASR above 95% using search-based adaptive attacks.",
    311       "evidence": "Section 5.1 reports >95% ASR for both Spotlighting and Prompt Sandwiching using search-based attacks on AgentDojo. Human red-teaming generated 265 and 178 successful attacks respectively.",
    312       "supported": "strong"
    313     },
    314     {
    315       "claim": "Training-based defenses (Circuit Breakers, StruQ, MetaSecAlign) offer no meaningful security against adaptive attacks, with RL attacks achieving 96-100% ASR.",
    316       "evidence": "Section 5.2 reports 100% ASR against Circuit Breakers, success on every StruQ test case, and 96% ASR against MetaSecAlign using RL-based attacks.",
    317       "supported": "strong"
    318     },
    319     {
    320       "claim": "Current LLM defense evaluations are flawed because they rely on static attack sets or computationally weak attacks not adapted to the defense.",
    321       "evidence": "Section 1 and Section 6 (Lesson #1) argue that static datasets create a false sense of security; this is supported by the empirical finding that defenses claiming near-zero ASR on static attacks fail under adaptive attacks.",
    322       "supported": "strong"
    323     }
    324   ],
    325   "methodology_tags": [
    326     "benchmark-eval",
    327     "case-study"
    328   ],
    329   "key_findings": "The paper demonstrates that 12 recent LLM defenses against jailbreaks and prompt injections, which reported near-zero attack success rates (ASR) in their original evaluations, are bypassed with ASR above 90% for most of them (exceptions include PIGuard at 71%) when evaluated against adaptive attacks using gradient descent, reinforcement learning, evolutionary search, and human red-teaming. Human red-teaming in a competition with 500+ participants achieved 100% ASR across all tested scenarios while the best automated search attack reached 69%. The paper argues that defense evaluations must use adaptive attackers that explicitly counter the defense design, and that static benchmark evaluations create a false sense of security in the LLM safety literature.",
    330   "red_flags": [
    331     {
    332       "flag": "No uncertainty quantification",
    333       "detail": "All ASR results are point estimates with no confidence intervals, error bars, or standard deviations reported. Given that ASR is computed over finite sample sizes (e.g., 80 AgentDojo examples), sampling variability could affect conclusions, especially for defenses near the 90% threshold."
    334     },
    335     {
    336       "flag": "Conflict of interest: authors evaluate systems built on their own employers' models",
    337       "detail": "Authors from Anthropic, Google DeepMind, and OpenAI participate in evaluating defenses, and the search attack uses Gemini-2.5 Pro (Google's model). Some of the evaluated defenses are deployed on models built by the same companies that employ the authors, creating a potential non-independence issue."
    338     },
    339     {
    340       "flag": "No code or full data release",
    341       "detail": "The adaptive attack framework described is novel and central to the paper's claims, but no code is released. This limits reproducibility and verification of results."
    342     },
    343     {
    344       "flag": "Human participant recruitment not described",
    345       "detail": "The recruitment method for the 500+ competition participants is not described, so selection bias toward more skilled attackers cannot be ruled out; such bias would inflate the perceived 100% human ASR relative to what an average red-teamer would achieve."
    346     },
    347     {
    348       "flag": "Incomplete model version specification",
    349       "detail": "The paper uses marketing names like 'Gemini-2.5 Pro', 'GPT-5', and 'Llama-3.3-70B' without specific API version dates or snapshot identifiers, making it impossible to know exactly which model versions were evaluated."
    350     },
    351     {
    352       "flag": "Missing limitations section",
    353       "detail": "The paper has no dedicated limitations or threats-to-validity section. While some limitations are discussed in the lessons and appendix, there is no systematic enumeration of what the results do not show."
    354     }
    355   ],
    356   "cited_papers": [
    357     {
    358       "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal",
    359       "authors": [
    360         "Mantas Mazeika",
    361         "Long Phan",
    362         "Xuwang Yin",
    363         "Andy Zou"
    364       ],
    365       "year": 2024,
    366       "relevance": "Core benchmark used for evaluating jailbreak defenses; directly relevant to LLM safety evaluation methodology."
    367     },
    368     {
    369       "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents",
    370       "authors": [
    371         "Edoardo Debenedetti",
    372         "Jie Zhang",
    373         "Mislav Balunovic",
    374         "Luca Beurer-Kellner",
    375         "Marc Fischer",
    376         "Florian Tramèr"
    377       ],
    378       "year": 2024,
    379       "arxiv_id": "2406.13352",
    380       "relevance": "Core benchmark for prompt injection evaluation in agentic settings; evaluates robustness of LLM agent defenses."
    381     },
    382     {
    383       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    384       "authors": [
    385         "Andy Zou",
    386         "Zifan Wang",
    387         "Nicholas Carlini",
    388         "Milad Nasr",
    389         "J. Zico Kolter",
    390         "Matt Fredrikson"
    391       ],
    392       "year": 2023,
    393       "relevance": "GCG attack paper; key baseline attack method used to evaluate LLM defenses."
    394     },
    395     {
    396       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    397       "authors": [
    398         "Kai Greshake",
    399         "Sahar Abdelnabi",
    400         "Shailesh Mishra",
    401         "Christoph Endres",
    402         "Thorsten Holz",
    403         "Mario Fritz"
    404       ],
    405       "year": 2023,
    406       "arxiv_id": "2302.12173",
    407       "relevance": "Foundational work on prompt injection attacks against LLM-integrated applications."
    408     },
    409     {
    410       "title": "Improving Alignment and Robustness with Circuit Breakers",
    411       "authors": [
    412         "Andy Zou",
    413         "Long Phan",
    414         "Justin Wang"
    415       ],
    416       "year": 2024,
    417       "relevance": "One of the 12 defenses evaluated; training-based jailbreak defense using representation engineering."
    418     },
    419     {
    420       "title": "StruQ: Defending against prompt injection with structured queries",
    421       "authors": [
    422         "Sizhe Chen",
    423         "Julien Piet",
    424         "Chawin Sitawarin",
    425         "David Wagner"
    426       ],
    427       "year": 2024,
    428       "arxiv_id": "2402.06363",
    429       "relevance": "One of the 12 defenses evaluated; adversarial training approach for prompt injection defense."
    430     },
    431     {
    432       "title": "Obfuscated Gradients Give a False Sense of Security: Circumventing Defenses to Adversarial Examples",
    433       "authors": [
    434         "Anish Athalye",
    435         "Nicholas Carlini",
    436         "David Wagner"
    437       ],
    438       "year": 2018,
    439       "relevance": "Foundational work on adaptive attacks against adversarial example defenses; the conceptual basis for the paper's argument."
    440     },
    441     {
    442       "title": "Ignore this title and HackAPrompt: Exposing Systemic Vulnerabilities of LLMs through a Global Prompt Hacking Competition",
    443       "authors": [
    444         "Sander Schulhoff",
    445         "Jeremy Pinto",
    446         "Anaum Khan"
    447       ],
    448       "year": 2023,
    449       "relevance": "Previous human red-teaming competition for LLMs; directly relevant to human-based adversarial evaluation."
    450     },
    451     {
    452       "title": "Tensor Trust: Interpretable Prompt Injection Attacks from an Online Game",
    453       "authors": [
    454         "Sam Toyer",
    455         "Olivia Watkins",
    456         "Ethan Adrian Mendes"
    457       ],
    458       "year": 2023,
    459       "arxiv_id": "2311.01011",
    460       "relevance": "Human-based prompt injection evaluation through gamification; relevant to human red-teaming methodology."
    461     },
    462     {
    463       "title": "Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks",
    464       "authors": [
    465         "Maksym Andriushchenko",
    466         "Francesco Croce",
    467         "Nicolas Flammarion"
    468       ],
    469       "year": 2024,
    470       "arxiv_id": "2404.02151",
    471       "relevance": "Prior work on adaptive jailbreak attacks; directly related to the paper's argument for stronger evaluation."
    472     },
    473     {
    474       "title": "DataSentinel: A game-theoretic detection of prompt injection attacks",
    475       "authors": [
    476         "Yupei Liu",
    477         "Yuqi Jia",
    478         "Jinyuan Jia",
    479         "Dawn Song",
    480         "Neil Zhenqiang Gong"
    481       ],
    482       "year": 2025,
    483       "relevance": "One of the 12 defenses evaluated; secret-knowledge based prompt injection defense."
    484     },
    485     {
    486       "title": "MELON: Provable Defense Against Indirect Prompt Injection Attacks in AI Agents",
    487       "authors": [
    488         "Kaijie Zhu",
    489         "Xianjun Yang",
    490         "Jindong Wang",
    491         "Wenbo Guo",
    492         "William Yang Wang"
    493       ],
    494       "year": 2025,
    495       "relevance": "One of the 12 defenses evaluated; dual-run defense for agentic prompt injection."
    496     },
    497     {
    498       "title": "On Adaptive Attacks to Adversarial Example Defenses",
    499       "authors": [
    500         "Florian Tramèr",
    501         "Nicholas Carlini",
    502         "Wieland Brendel",
    503         "Aleksander Madry"
    504       ],
    505       "year": 2020,
    506       "relevance": "Framework for adaptive attack evaluation that this paper extends to the LLM domain."
    507     }
    508   ]
    509 }

Impressum · Datenschutz