ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28503B)


      1 {
      2   "paper": {
      3     "title": "CausalArmor: Efficient Indirect Prompt Injection Guardrails via Causal Attribution",
      4     "authors": [
      5       "Minbeom Kim",
      6       "Mihir Parmar",
      7       "Phillip Wallis",
      8       "Lesly Miculicich",
      9       "Kyomin Jung",
     10       "Krishnamurthy Dj Dvijotham",
     11       "Long T. Le",
     12       "Tomas Pfister"
     13     ],
     14     "year": 2026,
     15     "venue": "arXiv preprint",
     16     "arxiv_id": "2602.07918"
     17   },
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL or code archive is provided in the paper. The paper references official implementations of baselines (AgentDojo, DRIFT, MELON) but does not release CausalArmor's own code."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available benchmarks: AgentDojo (v1.2.2, with GitHub link https://github.com/ethz-spylab/agentdojo) and DoomArena's TauBench subset. Custom injection templates are provided in full text in Appendix C.2."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "Appendix D.1 specifies the computing environment: '8 NVIDIA A100 GPUs' accessed via Google Cloud Vertex AI, vLLM for proxy model serving. Specific model versions and HuggingFace model IDs are listed for baselines (protectai/deberta-v3-base-prompt-injection-v2, leolee99/PIGuard)."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. While experimental details are thorough in the appendix, there is no code release or runnable reproduction guide."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Table 1 (DoomArena) reports results with confidence intervals (e.g., '73.57 (± 1.32)'). However, the main AgentDojo results in Table 3 report only point estimates without confidence intervals or error bars."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No statistical significance tests (p-values, t-tests, etc.) are reported. Claims of CausalArmor outperforming baselines are based on comparing point estimates. For DoomArena, 5 random seeds are used but no formal significance tests are applied."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "The paper reports absolute performance values with baseline context throughout. For example, Table 3 shows ASR dropping from 32.47% (No Defense) to 0.62% (CausalArmor) on Gemini-2.5-Flash with Important Instructions, allowing readers to compute effect sizes. Table 2 shows the CoT masking effect: average ASR increases by 1.46 percentage points (from 0.29% to 1.75%) when masking is disabled."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "AgentDojo has 629 injection tasks and DoomArena has 115 scenarios, but no justification is provided for why these sample sizes are sufficient for the claims made. DoomArena uses 5 random seeds but no power analysis is discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "Variance is reported only for DoomArena: Table 1 gives the spread across 5 seeds (± notation). The main AgentDojo results in Table 3, which form the bulk of the evaluation, are single-run numbers with no variance across runs."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "The paper compares against multiple baselines across three categories: Prompting (Repeat Prompt), Trained Classifiers (DeBERTa Detector, PiGuard), and System-based defenses (DRIFT, MELON, PromptArmor). No Defense is also included. Results in Table 3."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Baselines include recent 2025 work: DRIFT (Li et al., 2025a), MELON (Zhu et al., 2025), PiGuard (Li et al., 2025b), PromptArmor (Shi et al., 2025b). These represent the current state of the art in IPI defense."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 5.5 presents ablation studies: (1) CoT masking ablation (Table 2) showing ASR increases from 0.29% to 1.75% without it, and (2) proxy model ablation (Figure 5) varying model families and sizes. Section 5.3 and Figure 4 ablate the threshold parameter tau."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper uses five metrics: Benign Utility (BU), Benign Latency (BL), Utility under Attack (UA), Latency under Attack (LA), and Attack Success Rate (ASR). All are reported in Table 3."
     88       },
     89       "human_evaluation": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Human evaluation is not relevant for this work. The evaluation is on automated agent benchmarks (AgentDojo, DoomArena) where success/failure is objectively determined by whether unauthorized actions are executed."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The paper uses established benchmarks (AgentDojo v1.2.2 with its standard evaluation set, DoomArena TauBench subset). Two new attack templates (task_dependency, tool_output_hijack) were introduced alongside the standard template to test robustness to unseen attack vectors."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 3 breaks down results across three different attack templates (Important Instructions, New Attack 1, New Attack 2) and three backbone models (Gemini-2.5-Flash, Claude-4-Sonnet, Gemini-3-Pro). AgentDojo covers four environments (banking, slack, travel, workspace), though per-environment breakdowns are not shown in the main tables."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Appendix E provides detailed case studies including failure modes. Figures 8 and 9 show the poisoned CoT failure where input sanitization alone fails. Appendix A discusses limitations including split-context attacks and sanitizer effectiveness. The paper also discusses cases where overhead can approach always-on sanitization."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper reports that removing CoT masking degrades security (Table 2), that smaller proxy models (<8B parameters) fail to reliably detect attacks (Figure 5), and that in DoomArena's extreme threat model, utility under attack converges to zero for all methods. Appendix A explicitly discusses where the approach may fail."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims CausalArmor 'matches the security of aggressive defenses while improving explainability and preserving utility and latency.' Table 3 supports this: CausalArmor achieves ASR comparable to or lower than system-based defenses while maintaining BU and BL closer to No Defense. The theoretical claim of 'exponentially small upper bound' is proven in Proposition 4.1 and Appendix B."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper uses 'causal' in an explicitly operationalized LOO-ablation sense, not a strict structural causal model sense. This is transparently acknowledged in Appendix A: 'our causal statements should be interpreted under this operational influence measure.' Ablation studies (Table 2, Figure 5) use controlled single-variable manipulation to isolate component contributions."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper evaluates on specific benchmarks (AgentDojo, DoomArena) with specific models (Gemini-2.5-Flash, Claude-4-Sonnet, Gemini-3-Pro) and states limitations about generalization explicitly. Appendix A notes limitations regarding split-context attacks, sanitizer effectiveness, and the operational notion of causality. The title specifically says 'Indirect Prompt Injection' rather than broader claims."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Appendix A discusses multiple alternative explanations and failure modes: that LOO ablation may miss semantic-preserving interventions, that distributed attacks could evade detection, and that sanitizer effectiveness could vary. Section 5.6 explicitly addresses whether theoretical assumptions hold empirically rather than just asserting them."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "The paper uses marketing names without specific version identifiers: 'Gemini-2.5-flash', 'Gemini-3-Pro', 'Claude-4-Sonnet', 'Claude-4.0-Sonnet'. No snapshot dates or API version strings are provided. The proxy model 'Gemma-3-12B-IT' is a model name without a version date. AgentDojo version (v1.2.2) is specified."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The full sanitization system prompt is provided in Appendix D.1. All three injection templates (Important Instructions, Task Dependency, Tool Output Hijack) are provided verbatim in Appendix C.2 with their placeholders documented."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "The key hyperparameter tau is explicitly varied and its effects analyzed (Figure 4, Figure 6). The proxy model size (Gemma-3-12B-IT) and sanitizer model (Gemini-2.5-flash) are specified. DoomArena uses 5 random seeds. However, temperature/sampling settings for the LLM calls are not reported."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The CausalArmor framework is described in detail: Algorithm 1 (simplified overview) and Algorithm 2 (full version) in Appendix C provide pseudocode. The three-stage process (LOO attribution, selective sanitization, retroactive CoT masking) is fully described with mathematical formalization in Sections 3-4."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix C documents the experimental configurations: AgentDojo v1.2.2 with four environments (Section C.1), the three injection templates with full text (Section C.2), DoomArena configuration including the conversational setting and adaptive attacker model (Section C.3). The privileged tool set and proxy model setup are specified."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Appendix A is a dedicated 'Limitations and Future Research Directions' section covering four specific limitation areas: operational notion of causality, robustness to split-context attacks, effectiveness of sanitizer, and the gap between theoretical guarantees and practice."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Appendix A provides specific threats: (1) LOO ablation may miss attacks requiring semantic-preserving interventions or complex multi-span interactions, (2) 'perfectly distributed' split-context attacks could theoretically evade detection, (3) the sanitizer (Gemini-2.5-flash) is not claimed to be robust against all obfuscations, (4) worst-case overhead can approach always-on sanitization."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper explicitly bounds its scope: 'our causal statements should be interpreted under this operational influence measure' (Appendix A), 'it should not be treated as a standalone safety solution under arbitrary distribution shift' (Impact Statement), and recommends using CausalArmor 'as one layer in a defense-in-depth stack.' The theoretical guarantees are explicitly conditional on Assumptions 1-2."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw experimental data (detailed per-task results, attribution scores, log files) is released. Only aggregated results in tables and figures are provided."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The evaluation uses established benchmarks with described configurations. AgentDojo v1.2.2 with 629 injection tasks across four environments is specified (Section 5.1, Appendix C.1). DoomArena's TauBench-Retail with 115 tasks is described (Appendix C.3). Two new injection templates were designed using Gemini-3-Pro."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants are involved. The evaluation uses automated benchmarks (AgentDojo, DoomArena)."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The evaluation pipeline is documented: Appendix C specifies benchmark configurations, Appendix D covers implementation details including computing environment, baseline implementations, and the sanitization prompt. Algorithm 2 provides the full CausalArmor pipeline pseudocode."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper. Multiple authors are from Google Cloud AI Research, Google, and Google DeepMind, but no explicit funding disclosure is made."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Google Cloud AI Research, Seoul National University, Google, and Google DeepMind. The first author's student researcher status at Google Cloud AI Research is noted."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "The majority of authors (6 of 8) are affiliated with Google (Google Cloud AI Research, Google, Google DeepMind). The experiments heavily feature Google products (Gemini-2.5-Flash, Gemini-3-Pro, Gemma-3-12B-IT) and use Google Cloud Vertex AI infrastructure. Google has a financial interest in demonstrating effective safety solutions for its AI products. No independence statement is provided."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper. Several authors are Google employees working on Google AI products that are central to the evaluation."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the models used (Gemini-2.5-Flash, Gemini-3-Pro, Claude-4-Sonnet, Gemma-3-12B-IT). The paper acknowledges contamination risk for standard injection templates and introduces new templates to mitigate this, but does not state cutoff dates."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": true,
    234         "justification": "The paper explicitly discusses contamination risk for the AgentDojo important_instructions template: 'there is a risk that recent frontier models may have already encountered this specific pattern during their safety alignment or instruction tuning phases (data contamination).' This motivated introducing two new attack templates (Appendix C.2)."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": true,
    239         "justification": "The paper addresses benchmark contamination by (1) acknowledging that standard templates may be contaminated, (2) introducing two novel attack templates (task_dependency, tool_output_hijack) designed by Gemini-3-Pro 'to mimic system-level notifications or fake error traces,' and (3) evaluating on both standard and new templates to test robustness."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants involved. The study uses automated benchmarks."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants involved."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants involved."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants involved."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants involved."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants involved."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants involved."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Latency is reported as a primary metric (Benign Latency, relative to No Defense) across all experiments. Table 3 shows latency multipliers (e.g., CausalArmor BL = 1.30x for Gemini-2.5-Flash). Appendix D.1 discusses the computational overhead of LOO attribution (O(1) sequential calls, O(|S_t|) token compute) and identifies sanitization as the main cost."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "The hardware is specified: '8 NVIDIA A100 GPUs' via Google Cloud Vertex AI (Appendix D.1). The proxy model (Gemma-3-12B-IT) is served via vLLM. Relative latency numbers are reported throughout, providing a practical measure of compute requirements."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "CausalArmor achieves near-zero Attack Success Rate while preserving benign utility and latency close to the No Defense baseline.",
    295       "evidence": "Table 3 shows CausalArmor achieves ASR of 0.62%, 0.42%, and 0.40% across three attack templates on Gemini-2.5-Flash (vs. 32.47%, 12.01%, 7.06% for No Defense), while maintaining BU of 56.26% (vs. 53.61% No Defense) and BL of 1.30x.",
    296       "supported": "strong"
    297     },
    298     {
    299       "claim": "CausalArmor resolves the over-defense dilemma by selectively triggering sanitization only when causal attribution indicates dominance by an untrusted span.",
    300       "evidence": "Figure 4 and Table 3 show CausalArmor achieves security comparable to always-on defenses (DRIFT, MELON) with significantly lower latency. For example, on Claude-4-Sonnet, CausalArmor achieves 0.0% ASR with BL=1.27x, while DRIFT achieves 0.82% ASR with BL=5.43x.",
    301       "supported": "strong"
    302     },
    303     {
    304       "claim": "Retroactive CoT masking is necessary to prevent poisoned reasoning traces from re-triggering malicious actions after input sanitization.",
    305       "evidence": "Table 2 shows that removing CoT masking increases average ASR from 0.29% to 1.75% (a 6x increase) while slightly reducing BU from 75.11% to 74.61%.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "The probability of IPI attack success is bounded exponentially by the combined benign baseline advantage and sanitization margin (Proposition 4.1).",
    310       "evidence": "Proposition 4.1 proves Pr(Attack Success) <= T * |Y_mal| * exp(-(beta + gamma)) under Assumptions 1-2. Section 5.6 provides empirical support for both assumptions via benign utility data (Assumption 1) and causal restoration visualization (Assumption 2, Figure 7).",
    311       "supported": "moderate"
    312     },
    313     {
    314       "claim": "Proxy models (>= 12B parameters) can reliably substitute for frontier models in LOO attribution computation for IPI detection.",
    315       "evidence": "Figure 5 shows that Gemma-3-12B, Qwen3-14B, and Ministral-14B all achieve ASR close to the oracle (near-zero) when used as proxy models for CausalArmor with Gemini-3-Pro as the backbone. Models below 8B show degraded detection.",
    316       "supported": "moderate"
    317     },
    318     {
    319       "claim": "CausalArmor is effective against adaptive, privileged attackers in the DoomArena benchmark.",
    320       "evidence": "Table 1 shows CausalArmor reduces ASR from 88.87% (No Defense) to 3.65% on Gemini-3-Pro in DoomArena's adaptive attack setting, while maintaining BU of 70.96% (vs. 73.57% No Defense). This ASR is comparable to that of PiGuard (5.57%), which has significantly worse BU (55.13%).",
    321       "supported": "strong"
    322     }
    323   ],
    324   "methodology_tags": [
    325     "benchmark-eval",
    326     "theoretical"
    327   ],
    328   "key_findings": "CausalArmor proposes a selective defense against Indirect Prompt Injection that detects attacks by computing leave-one-out attribution scores to identify when untrusted content dominates the user request's influence on privileged actions. On AgentDojo and DoomArena benchmarks, CausalArmor achieves near-zero Attack Success Rates (comparable to always-on defenses) while preserving benign utility and latency close to undefended agents, effectively resolving the over-defense dilemma. The framework introduces retroactive Chain-of-Thought masking to prevent poisoned reasoning traces from re-triggering attacks, and demonstrates that efficient proxy models (12B+ parameters) can reliably substitute for frontier models in the attribution computation.",
    329   "red_flags": [
    330     {
    331       "flag": "Company evaluating its own products",
    332       "detail": "Six of eight authors are affiliated with Google (Google Cloud AI Research, Google, Google DeepMind). The evaluation heavily features Google models (Gemini-2.5-Flash, Gemini-3-Pro, Gemma-3-12B-IT) and Google Cloud infrastructure. The proxy model recommended (Gemma-3) is in the same model family as the backbone (Gemini), potentially favoring transferability claims. No conflicts of interest disclosure is provided."
    333     },
    334     {
    335       "flag": "No code release",
    336       "detail": "Despite detailed algorithmic descriptions, no source code is released for CausalArmor. This limits reproducibility and independent verification of the results, especially given the complexity of the multi-component framework (LOO attribution, sanitization, CoT masking)."
    337     },
    338     {
    339       "flag": "Missing model version specifications",
    340       "detail": "All models are referenced by marketing names (Gemini-2.5-Flash, Gemini-3-Pro, Claude-4-Sonnet) without snapshot dates or API version identifiers. Given that model behavior changes across versions, this undermines reproducibility."
    341     },
    342     {
    343       "flag": "AgentDojo results lack variance reporting",
    344       "detail": "The primary evaluation (AgentDojo, Table 3) reports single-run point estimates without error bars, confidence intervals, or multiple seeds. Only the smaller DoomArena evaluation (115 scenarios) uses 5 seeds with variance. Given the stochasticity of LLM-based agents, single-run results may not be stable."
    345     }
    346   ],
    347   "cited_papers": [
    348     {
    349       "title": "Agentdojo: A dynamic environment to evaluate prompt injection attacks and defenses for llm agents",
    350       "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic", "L. Beurer-Kellner", "M. Fischer", "F. Tramer"],
    351       "year": 2024,
    352       "relevance": "Primary evaluation benchmark for IPI defense in tool-using LLM agents."
    353     },
    354     {
    355       "title": "Doomarena: A framework for testing ai agents against evolving security threats",
    356       "authors": ["L. Boisvert", "M. Bansal", "C. K. R. Evuru"],
    357       "year": 2025,
    358       "arxiv_id": "2504.14064",
    359       "relevance": "Second evaluation benchmark featuring adaptive adversarial attackers against AI agents."
    360     },
    361     {
    362       "title": "Defeating prompt injections by design",
    363       "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan", "J. Hayes", "N. Carlini"],
    364       "year": 2025,
    365       "arxiv_id": "2503.18813",
    366       "relevance": "CaMeL system-based defense against prompt injection through instruction/data channel separation."
    367     },
    368     {
    369       "title": "DRIFT: Dynamic rule-based defense with injection isolation for securing LLM agents",
    370       "authors": ["H. Li", "X. Liu", "H.-C. Chiu", "D. Li", "N. Zhang", "C. Xiao"],
    371       "year": 2025,
    372       "arxiv_id": "2506.12104",
    373       "relevance": "System-based IPI defense baseline using dynamic rule generation and always-on sanitization."
    374     },
    375     {
    376       "title": "PiGuard: Prompt injection guardrails via mitigating overdefense for free",
    377       "authors": ["H. Li", "X. Liu", "N. Zhang", "C. Xiao"],
    378       "year": 2025,
    379       "relevance": "Trained classifier baseline for prompt injection detection addressing over-defense."
    380     },
    381     {
    382       "title": "Melon: Provable defense against indirect prompt injection attacks in AI agents",
    383       "authors": ["K. Zhu", "X. Yang", "J. Wang", "W. Guo", "W. Y. Wang"],
    384       "year": 2025,
    385       "arxiv_id": "2502.05174",
    386       "relevance": "System-based defense using policy divergence verification, a key baseline comparison."
    387     },
    388     {
    389       "title": "PromptArmor: Simple yet effective prompt injection defenses",
    390       "authors": ["T. Shi", "K. Zhu", "Z. Wang"],
    391       "year": 2025,
    392       "arxiv_id": "2507.15219",
    393       "relevance": "Sanitization-based defense baseline for prompt injection in LLM agents."
    394     },
    395     {
    396       "title": "The attacker moves second: Stronger adaptive attacks bypass defenses against LLM jailbreaks and prompt injections",
    397       "authors": ["M. Nasr", "N. Carlini", "C. Sitawarin"],
    398       "year": 2025,
    399       "arxiv_id": "2510.09023",
    400       "relevance": "Evaluates robustness of defenses against adaptive attacks; referenced for responsible disclosure considerations."
    401     },
    402     {
    403       "title": "Indirect prompt injections: Are firewalls all you need, or stronger benchmarks?",
    404       "authors": ["R. Bhagwatkar", "K. Kasa", "A. Puri"],
    405       "year": 2025,
    406       "arxiv_id": "2510.05244",
    407       "relevance": "Examines the effectiveness of firewall-based approaches to indirect prompt injection defense."
    408     },
    409     {
    410       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    411       "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"],
    412       "year": 2024,
    413       "arxiv_id": "2403.02691",
    414       "relevance": "Benchmark for evaluating indirect prompt injection attacks in tool-using LLM agents."
    415     },
    416     {
    417       "title": "Agent Security Bench (ASB): Formalizing and benchmarking attacks and defenses in LLM-based agents",
    418       "authors": ["H. Zhang", "J. Huang", "K. Mei"],
    419       "year": 2024,
    420       "arxiv_id": "2410.02644",
    421       "relevance": "Formalizes and benchmarks security attacks and defenses for LLM-based agents."
    422     },
    423     {
    424       "title": "ContextCite: Attributing model generation to context",
    425       "authors": ["B. Cohen-Wang", "H. Shah", "K. Georgiev", "A. Madry"],
    426       "year": 2024,
    427       "relevance": "LOO attribution method for context-dependent generation; foundational for CausalArmor's detection approach."
    428     },
    429     {
    430       "title": "Prompt flow integrity to prevent privilege escalation in LLM agents",
    431       "authors": ["J. Kim", "W. Choi", "B. Lee"],
    432       "year": 2025,
    433       "arxiv_id": "2503.15547",
    434       "relevance": "Defense mechanism against privilege escalation in LLM agents through flow integrity."
    435     }
    436   ]
    437 }

Impressum · Datenschutz