ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28685B)


      1 {
      2   "paper": {
      3     "title": "Indirect Prompt Injections: Are Firewalls All You Need, or Stronger Benchmarks?",
      4     "authors": [
      5       "Rishika Bhagwatkar",
      6       "Kevin Kasa",
      7       "Abhay Puri",
      8       "Gabriel Huang",
      9       "Irina Rish",
     10       "Graham W. Taylor",
     11       "Krishnamurthy Dj Dvijotham",
     12       "Alexandre Lacoste"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv preprint (under review)",
     16     "arxiv_id": "2510.05244",
     17     "doi": "10.48550/arXiv.2510.05244"
     18   },
     19   "scan_version": 2,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "Section 10 states 'We will release all code required to reproduce our results upon acceptance.' This is a promise of future release, not an actual release."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "The paper uses four publicly available benchmarks (AgentDojo, ASB, InjecAgent, τ-Bench). The authors mention releasing corrected versions of AgentDojo and ASB, but these are future promises rather than actual releases."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment specifications are provided anywhere in the paper or appendices."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions are provided. Algorithm 1 describes the pipeline conceptually but does not provide runnable commands or scripts."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "All results tables (Tables 1-4, 7-20) report ± values alongside point estimates, e.g., '83.02 ±5.33' in Table 1."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper makes comparative claims (e.g., 'Sanitizer achieves the best ASR-BU and ASR-UA Tradeoff') but relies solely on comparing point estimates with ± ranges. No statistical significance tests (t-tests, bootstrap tests, etc.) are reported."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Effect sizes are provided with baseline context, e.g., 'the ASR drops sharply from 70% to 9.25%, an almost 8× reduction' (Section 6.2) and '4× lower ASR' (Section 5, ASB discussion)."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification for benchmark sizes or number of experimental runs. The benchmarks are used as-is without discussing whether the evaluation sample sizes are adequate for the claimed conclusions."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "All results tables report ± values, indicating variance across runs, e.g., Table 1 shows '67.68 ±3.56' for Sanitizer BU."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Extensive baselines are compared: PI Detector, Repeat prompt, Spotlighting, Toolfilter, Melon, Melon-Aug, CaMeL, Instructional Prevention, Delimiters defense, Sandwich defense (Tables 1-4, Appendix A)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include recent work: CaMeL (Debenedetti et al., 2025), Melon (Zhu et al., 2025), DoomArena framework (Boisvert et al., 2025). These are contemporary and competitive."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper evaluates Minimizer alone, Sanitizer alone, and Combined (both) across benchmarks, constituting an ablation of the two firewall components (Table 1)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Three metrics are used: Benign Utility (BU), Utility under Attack (UA), and Attack Success Rate (ASR), as defined in Section 4.2."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation is performed. All evaluation is automated through benchmark metrics. Human assessment of whether sanitized outputs preserve semantic meaning would strengthen the utility claims."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "The defense prompts were not tuned per-benchmark, but it is unclear whether the prompts were iterated using any of these benchmark results. No explicit held-out vs. development set separation is documented."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by benchmark suite (Banking, Slack, Travel, Workspace for AgentDojo in Tables 7-12), by DH/DS for InjecAgent (Table 19), and by attack type for ASB (Tables 14-15, 17-18)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section 7 and Appendix E present a specific failure case where Braille-encoded injection bypasses the GPT-4o Sanitizer. The Minimizer's utility degradation is also discussed as a failure mode."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The paper reports that the Minimizer degrades utility due to aggressive redaction (Section 5), the Combined defense has lower utility than Sanitizer alone, and the Sanitizer can be bypassed by Braille encoding (Section 7)."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": false,
    122         "justification": "The abstract claims 'perfect security (0% or the lowest possible attack success rate)' but on ASB the Sanitizer achieves 16.33% ASR (Table 2). While the paper explains this as partly a scoring artifact, 'perfect security' language is overstated for a 16.33% attack success rate."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims (e.g., 'the Sanitizer prevents attacks') are supported by controlled experiments comparing agent performance with and without the defense across multiple benchmarks. The ablation of Minimizer vs Sanitizer vs Combined provides adequate causal evidence."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title 'Are Firewalls All You Need' and the abstract's claim of a 'model-agnostic' defense imply broad applicability. The defense was tested on only four models and four benchmarks, and although the paper acknowledges the Sanitizer can be bypassed and Section 8 notes that the benchmarks use weak attacks, the title and framing extend beyond what was tested."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Section 6 extensively discusses alternative explanations for benchmark results (inflated ASR from forced tool injection, brittle utility metrics, missing task-critical content). Section 7 discusses why the Sanitizer might fail (rare token encoding bypasses alignment)."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures ASR, BU, and UA directly. These are the actual quantities of interest — no proxy gap exists between what is measured (attack success rates, task completion rates) and what is claimed."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Specific model versions are given: 'gpt-4o-2024-08-06' (Tables 7-12), 'GPT-4o-2024-05-13' (Appendix E), 'Llama 3.3 70b', 'Qwen3 32b', 'Qwen3 8b'. The GPT-4o versions include snapshot dates."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompts for both the Tool-Input Firewall (Minimizer) and Tool-Output Firewall (Sanitizer) are provided verbatim in Appendix B, including system messages and user message templates."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No temperature, top-p, max tokens, or other API/sampling hyperparameters are reported for any of the LLM calls (agent, Minimizer, or Sanitizer)."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Algorithm 1 describes the full pipeline: user task → agent generates tool call → Minimizer filters inputs → tool executes → Sanitizer filters output → return to agent. Figure 1 provides a visual diagram."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 6 documents specific modifications to AgentDojo and ASB benchmarks with before/after examples (Appendix D). The τ-Bench augmentation with DoomArena attacks is described in Section 4.1."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 7 (Discussion) contains substantive discussion of defense limitations, including how stronger attacks can bypass the Sanitizer. Section 9 (Ethics Statement) discusses risks of overreliance on defenses and additional API call privacy risks."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 7 provides a specific threat: Braille-encoded injections bypass the GPT-4o Sanitizer because rare tokens may be less well-aligned. Section 6 identifies specific benchmark validity threats (forced tool injection, content overwriting, brittle metrics)."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 9 states 'the firewalls we have presented here assume that the user task is fully trusted, and that the tools themselves are secure.' Section 7 states 'it is still possible to bypass them in practice' and calls for 'stronger and more diverse attack strategies.'"
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw experimental data (individual trial results, logs, model outputs) is available. Only aggregated results in tables are provided."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 4.1 describes each of the four benchmarks used, their structure (number of evaluations, attack types, suite compositions), and how they were set up for experiments."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. All data comes from existing public benchmarks."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Algorithm 1 documents the full pipeline from user task through tool calls with firewall processing. Section 6 documents the benchmark modification pipeline with specific before/after examples in Appendix D."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "The acknowledgments thank ServiceNow colleagues for discussions and compute resources, but no explicit funding source (grants, corporate funding) is disclosed."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: ServiceNow Research, Mila - Quebec AI Institute, Université de Montréal, Vector Institute, University of Guelph."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "ServiceNow Research (primary affiliation for 4 of 8 authors) is an enterprise AI company that could commercially benefit from demonstrating effective AI agent security solutions. This potential conflict is not discussed."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is included in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "The paper tests defense mechanisms (firewalls) rather than evaluating model knowledge on benchmarks. The defense effectiveness depends on the LLM following sanitization instructions, not on whether the model has seen benchmark solutions."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "The paper tests defenses against prompt injection attacks, not model capability on knowledge benchmarks. Train/test overlap is not the relevant concern for this paper type."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "The paper evaluates defense mechanisms, not model knowledge. However, the authors do recommend future benchmarks 'add a canary string to detect data contamination' (Section 8), showing awareness of the issue for model evaluation."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants in this study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants in this study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants in this study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants in this study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants in this study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants in this study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "The firewalls add extra LLM calls per tool invocation (up to 2: one for the Minimizer and one for the Sanitizer). Section 9 acknowledges 'the firewall steps may require additional API calls' but no actual cost, latency, or token usage figures are reported."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total computational budget is stated. The paper does not report GPU hours, API costs, or total compute used for the experiments across four benchmarks and multiple models."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "The ± values in tables suggest multiple runs but no explicit mention of random seeds or seed sensitivity analysis. It is unclear whether variance comes from seeds, stochastic sampling, or benchmark subsets."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The paper never explicitly states how many runs produced the reported ± values. No 'averaged over K runs' or equivalent statement found."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The defense prompts are presented as fixed without documenting how they were developed. Section 7 mentions 'experimenting with a wide range of prompting strategies' for the attack but no search budget for the defense design."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The Minimizer and Sanitizer prompts (Appendix B) are presented as the final configuration without explaining how these specific prompts were selected or what alternatives were tried."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Many pairwise comparisons are made across defenses, benchmarks, and models without any correction for multiple comparisons (Bonferroni, etc.)."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors evaluate their own proposed defense against baselines but do not acknowledge the bias of evaluating their own system. CaMeL and Melon results are taken from published papers ('non-replicable baselines'), not independently reproduced."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "The firewall defense adds 1-2 extra LLM calls per tool invocation, significantly increasing compute cost vs. zero-overhead defenses like Spotlighting or Repeat Prompt. This compute disparity is not discussed or controlled for."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": true,
    338         "justification": "Section 6 extensively analyzes construct validity of AgentDojo, ASB, and InjecAgent. The paper identifies forced tool injection inflating ASR, brittle metrics, content overwriting, and missing utility metrics as validity threats. This is a core contribution."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "The defense (firewall) IS the scaffold being tested. The paper compares multiple defenses applied to the same models, so the scaffold confound does not apply in the traditional sense."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether GPT-4o, Llama 3.3, or Qwen3 may have seen AgentDojo, ASB, or InjecAgent tasks during training. Some of these benchmarks may have been published before the training cutoffs of the more recent models (e.g., Qwen3), so exposure cannot be ruled out."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup leaks information. For example, whether the Sanitizer's system prompt gives it unfair advantage by revealing the expected attack format."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of independence between benchmark tasks or whether results on related tasks within the same suite are independent observations."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection methods applied. The paper recommends future benchmarks 'add a canary string to detect data contamination' (Section 8) but does not implement any detection for the current evaluation."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "The Sanitizer firewall achieves 0% or near-0% ASR across four benchmarks while maintaining high task utility.",
    372       "evidence": "Table 1 (AgentDojo): Sanitizer ASR 0.02%; Table 2 (ASB): 16.33%; Table 3 (InjecAgent): 0.3% base, 0.0% enhanced; Table 4 (τ-Bench): 0.0%. Utility maintained comparably to baselines.",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "Forced attack-tool injection in ASB inflates ASR by approximately 8×.",
    377       "evidence": "Table 6 shows ASR drops from 73.58% to 9.25% when forced attack-tool injection is disabled. Specific example in Appendix D.2.1.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "AgentDojo injection vectors overwrite task-critical content, confounding utility evaluation.",
    382       "evidence": "Section 6.1 with specific example in Appendix D.1.1 where bill content is replaced by injection. Table 5 shows utility improving with the fix from 60.87% to 72.19% (+11.32 percentage points, roughly 18.6% relative).",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "The Sanitizer can be bypassed using Braille-encoded injections.",
    387       "evidence": "Appendix E provides a specific example where a Braille-encoded 'important instructions' attack bypasses GPT-4o-2024-05-13 Sanitizer on the AgentDojo Travel suite (UserTask0, InjectionTask0).",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "The firewall defense outperforms CaMeL and Melon on ASR while maintaining higher utility.",
    392       "evidence": "Table 1: Sanitizer achieves 0.02% ASR with 67.68% BU vs CaMeL's 0% ASR with 53.6% BU. Melon achieves 0.95% ASR with 68.04% BU.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "PI Detector defenses reduce ASR but cause severe utility degradation.",
    397       "evidence": "Table 1: PI Detector reduces ASR to 7.95% but drops BU to 41.49%. Table 4: PI Detector achieves 0% ASR but drops BU to 6.90%.",
    398       "supported": "strong"
    399     }
    400   ],
    401   "methodology_tags": ["benchmark-eval"],
    402   "key_findings": "A simple firewall defense consisting of a Tool-Input Minimizer and Tool-Output Sanitizer achieves near-zero attack success rates across four IPI benchmarks (AgentDojo, ASB, InjecAgent, τ-Bench) while maintaining competitive task utility, outperforming more complex defenses like CaMeL and Melon. The paper identifies critical flaws in existing benchmarks: forced tool injection inflating ASR by 8× in ASB, injection vectors overwriting task-critical content in AgentDojo, and missing utility metrics in InjecAgent. Despite strong benchmark results, the Sanitizer can be bypassed using Braille-encoded attacks, demonstrating that existing benchmarks use overly weak attacks and do not adequately stress-test defenses.",
    403   "red_flags": [
    404     {
    405       "flag": "Non-replicable baselines",
    406       "detail": "CaMeL and Melon results are 'directly reported' from published papers (marked with * in Table 1) because authors were 'unable to successfully reproduce their results.' This makes direct comparison unreliable — different hardware, software versions, or evaluation conditions could explain differences."
    407     },
    408     {
    409       "flag": "Abstract overclaims",
    410       "detail": "The abstract claims 'perfect security (0% or the lowest possible attack success rate)' but ASB Sanitizer ASR is 16.33% (Table 2). The paper attributes this to 'scoring artifacts' but the claim of 'perfect security' in the abstract is misleading without this qualification."
    411     },
    412     {
    413       "flag": "Missing cost analysis for proposed defense",
    414       "detail": "The firewall defense adds 1-2 additional LLM API calls per tool invocation — a significant overhead acknowledged in Section 9 ('may require additional API calls') but never quantified. Baselines like Spotlighting and Repeat Prompt add zero LLM calls, making the comparison unfair on cost."
    415     },
    416     {
    417       "flag": "Number of runs and seed sensitivity not reported",
    418       "detail": "All tables show ± values but the paper never states how many runs produced these numbers, what random seeds were used, or whether variance comes from sampling stochasticity, benchmark subsets, or multiple trials."
    419     },
    420     {
    421       "flag": "Defense prompt engineering not documented",
    422       "detail": "The Minimizer and Sanitizer prompts are presented as final versions without documenting how they were designed, what alternatives were tried, or whether they were iterated based on benchmark results. The claim of 'zero tuning' is not verifiable."
    423     }
    424   ],
    425   "cited_papers": [
    426     {
    427       "title": "Agentdojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    428       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    429       "year": 2024,
    430       "relevance": "Primary benchmark used for evaluating IPI defenses for tool-calling agents."
    431     },
    432     {
    433       "title": "Agent security bench (ASB): Formalizing and benchmarking attacks and defenses in LLM-based agents",
    434       "authors": ["Hanrong Zhang", "Jingyuan Huang", "Kai Mei"],
    435       "year": 2025,
    436       "relevance": "Benchmark for evaluating agent robustness against prompt injection in multi-step tool-use settings."
    437     },
    438     {
    439       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    440       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    441       "year": 2024,
    442       "doi": "10.18653/v1/2024.findings-acl.624",
    443       "relevance": "Benchmark for IPI attacks from malicious tools, including direct harm and data stealing attack categories."
    444     },
    445     {
    446       "title": "Defeating prompt injections by design",
    447       "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan"],
    448       "year": 2025,
    449       "arxiv_id": "2503.18813",
    450       "relevance": "CaMeL system-level defense using a custom Python interpreter for policy-based agent constraints — a key comparison baseline."
    451     },
    452     {
    453       "title": "MELON: Provable defense against indirect prompt injection attacks in AI agents",
    454       "authors": ["Kaijie Zhu", "Xianjun Yang", "Jindong Wang"],
    455       "year": 2025,
    456       "relevance": "Defense that detects indirect prompt injection in AI agents via masked re-execution and tool-call comparison — a key baseline compared against in the paper."
    457     },
    458     {
    459       "title": "Firewalls to secure dynamic LLM agentic networks",
    460       "authors": ["Sahar Abdelnabi", "Amr Gomaa", "Eugene Bagdasarian"],
    461       "year": 2025,
    462       "arxiv_id": "2502.01822",
    463       "relevance": "Introduces firewalled agent architecture with input/data/trajectory firewalls for multi-turn agent conversations."
    464     },
    465     {
    466       "title": "Airgapagent: Protecting privacy-conscious conversational agents",
    467       "authors": ["Eugene Bagdasarian", "Ren Yi", "Sahra Ghalebikesabi"],
    468       "year": 2024,
    469       "doi": "10.1145/3658644.3690350",
    470       "relevance": "Pioneering work on isolated LLM for context minimization to protect conversational agents from data exfiltration."
    471     },
    472     {
    473       "title": "The instruction hierarchy: Training llms to prioritize privileged instructions",
    474       "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike"],
    475       "year": 2024,
    476       "arxiv_id": "2404.13208",
    477       "relevance": "LLM retraining approach for prompt injection defense by establishing instruction priority hierarchies."
    478     },
    479     {
    480       "title": "StruQ: Defending against prompt injection with structured queries",
    481       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"],
    482       "year": 2025,
    483       "relevance": "Defense approach using structured queries to separate trusted and untrusted content in LLM inputs."
    484     },
    485     {
    486       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    487       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra"],
    488       "year": 2023,
    489       "relevance": "Foundational work defining indirect prompt injection attacks in LLM-integrated applications."
    490     },
    491     {
    492       "title": "Doomarena: A framework for testing AI agents against evolving security threats",
    493       "authors": ["Léo Boisvert", "Abhay Puri", "Gabriel Huang"],
    494       "year": 2025,
    495       "relevance": "Framework used to augment τ-bench with data stealing attacks for security evaluation."
    496     },
    497     {
    498       "title": "SecAlign: Defending against prompt injection with preference optimization",
    499       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar"],
    500       "year": 2025,
    501       "relevance": "Preference optimization approach for training LLMs to be robust against prompt injection attacks."
    502     }
    503   ]
    504 }

Impressum · Datenschutz