ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27736B)


      1 {
      2   "scan_version": 2,
      3   "active_modules": ["experimental_rigor", "data_leakage"],
      4   "paper": {
      5     "title": "Prompt Injection Attack to Tool Selection in LLM Agents",
      6     "authors": ["Jiawen Shi", "Zenghui Yuan", "Guiyao Tie", "Pan Zhou", "Neil Zhenqiang Gong", "Lichao Sun"],
      7     "year": 2025,
      8     "venue": "NDSS 2026",
      9     "arxiv_id": "2504.19793"
     10   },
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "ToolHijacker achieves high attack success rates (up to 99.6%) against tool selection in LLM agents by injecting a single malicious tool document, even in no-box scenarios where the attacker has no access to the target system. The attack transfers across architecturally different LLMs (e.g., 96.7% ASR from Llama-3.3-70B shadow to GPT-4o target). Existing defenses — both prevention-based (StruQ, SecAlign) and detection-based (known-answer, DataSentinel, PPL) — are insufficient, with gradient-free attacks achieving 99.6% ASR even under StruQ.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "The paper states 'We will release code and data under restricted access — interested parties must request permission' in the Ethics section. This is not a public release."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The paper uses two public benchmark datasets: MetaTool and ToolBench, both publicly available. The 10 target tasks and 1,000 task descriptions per dataset are described in detail in Appendix C (Figures 14-15)."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, requirements files, or dependency details are provided in the paper."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The paper describes methods but does not include scripts or commands to replicate experiments."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "All results are reported as point estimates (e.g., '96.7% ASR') without confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes comparative claims (e.g., 'outperforms baselines') based solely on comparing percentages without any statistical significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes are reported in context: e.g., 'gradient-free attack achieves a higher ASR by 4.5% when targeting GPT-4o' and 'Llama-3-8B increases the ASR by 15.12% over Llama-2-7B' (Section IV-C). Absolute percentages with baselines provide magnitude context."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper uses 100 target task descriptions per task and 10 tasks per dataset but provides no justification for why these numbers are adequate."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single experimental runs averaged across tasks."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Seven baselines are compared: five manual attacks (naive, escape characters, context ignore, fake completion, combined) and two automated attacks (JudgeDeceiver, PoisonedRAG). Results in Table III."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include recent automated attacks: JudgeDeceiver (CCS 2024) and PoisonedRAG (2024). Also includes StruQ and SecAlign as defense baselines. These represent current state of the art."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Extensive ablation studies in Section IV-C: impact of R vs S components (Table V), impact of k, k', shadow task descriptions (Figure 6), shadow LLM choice (Tables VI-VII), similarity metrics (Table VIII), loss terms (Table XV), and hyperparameters α and β (Figure 9)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Four metrics used: ACC (accuracy without attack), ASR (attack success rate), HR (hit rate for retrieval), and AHR (attack hit rate). Defined in Section IV-A5."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "A user study with 6 participants was conducted to evaluate whether humans can detect malicious tool documents (Table XVII in Appendix B)."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Shadow task descriptions Q' used for optimization are explicitly disjoint from target task descriptions Q (Q ∩ Q' = ∅, Section II-B). The 100 target task descriptions per task are distinct from the 5 shadow descriptions used for optimization."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results broken down per task (Figure 3 shows 10 tasks), per LLM (Table I shows 8 LLMs), per retriever (Table IV shows 4 retrievers), and per dataset (MetaTool vs ToolBench throughout)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses cases where the attack is less effective: Claude-3-Haiku is 'the least sensitive' (Section IV-B), gradient-based attack with Llama-2-7B shadow drops to 34% ASR on Llama-3-70B (Table VII), and low k values reduce effectiveness (Figure 5)."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several negative findings reported: defenses are insufficient (Section V), gradient-based attack has lower transferability with weak shadow LLMs (Table VII), small k' leads to declining ASR (Figure 5), and removing any loss term significantly hurts performance (Table XV)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims about high ASR, outperforming baselines, and defenses being insufficient are all supported by Tables I, III, and IX-X respectively."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims are primarily from ablation studies (e.g., 'removing L3 reduces ASR from 95% to 5%' in Table XV) which use controlled single-variable manipulation. The two-phase design isolates R's effect on retrieval and S's effect on selection (Table V)."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title says 'Prompt Injection Attack to Tool Selection in LLM Agents' broadly, but the evaluation is limited to a specific two-step retrieval+selection pipeline. Many LLM agents use different tool selection mechanisms (e.g., function calling APIs) that are not tested."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper discusses why attacks transfer ('shared alignment objectives and training paradigms make LLMs inherently vulnerable' and 'LLM homogenization'), why gradient-free outperforms gradient-based on closed-source models, and why Claude-3-Haiku is more resistant (Section IV-B)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper clearly distinguishes what it measures (ASR = selecting the malicious tool name) from the broader security implication (executing harmful tools). The metrics are precisely defined (Section IV-A5) and the paper explicitly discusses the gap between tool selection manipulation and actual harm potential."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model versions are listed: Llama-2-7B-chat, Llama-3-8B-Instruct, Llama-3-70B-Instruct, Llama-3.3-70B-Instruct, Claude-3-Haiku, Claude-3.5-Sonnet, GPT-3.5, GPT-4o (Section IV-A3). However, no snapshot dates are given for the closed-source models."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompt texts are provided: the selection prompt (Figure 2), shadow task description generation prompt (Figure 10), shadow tool document generation prompt (Figure 11), attacker LLM system instruction (Figure 13), and initial R and S (Figure 12)."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Hyperparameters reported in Section IV-A4: m'=5, k'=5, Titer=10, B=2, W=10 for gradient-free; α=2.0, β=0.1, R iterations=3, S iterations=400 for gradient-based."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The two-step tool selection pipeline (retrieval + selection) is formally described in Section II-A with mathematical formulations. The shadow framework construction is detailed in Section III."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "For ToolBench: 'After removing duplicate tools and empty descriptions, the tool library contains 9,650 benign tool documents' from 16,464 originals (Section IV-A1). Task description generation process described with templates."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. The Conclusion mentions future work directions but does not discuss limitations of the current study."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed. The paper does not address potential confounds or weaknesses in the experimental design."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do not show. It doesn't acknowledge that the attack is only tested on one specific tool selection architecture (retrieval + selection) and may not apply to other agent frameworks."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "Raw experimental results (individual trial outcomes, per-query results) are not available. Only aggregated percentages are reported."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Data collection is described: MetaTool (21,127 instances, 199 tools from OpenAI Plugins) and ToolBench (126,486 samples, 16,464 tools from RapidAPI). Task descriptions generated via LLM + human evaluation (Section IV-A1)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "The user study mentions '6 participants' but provides no details on how they were recruited, their background, or potential selection bias."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline from dataset selection through shadow framework construction, optimization, and evaluation is documented in Sections III and IV. ToolBench filtering from 16,464 to 9,650 tools is explained."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding information is disclosed in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are clearly listed: Huazhong University of Science and Technology, Duke University, and Lehigh University."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is included in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper tests prompt injection attacks against tool selection, not model knowledge on benchmarks. The LLMs are used as tool selectors, not evaluated for their pre-trained knowledge."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not applicable — the paper tests attack effectiveness on tool selection, not model capability on benchmark tasks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not applicable — contamination of benchmark solutions in training data is irrelevant to this attack evaluation."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "The user study with 6 participants evaluating malicious tool documents is not pre-registered."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": true,
    250         "answer": false,
    251         "justification": "No IRB or ethics approval is mentioned for the user study. The Ethics section discusses responsible disclosure and informed consent but not IRB review."
    252       },
    253       "demographics_reported": {
    254         "applies": true,
    255         "answer": false,
    256         "justification": "The 6 participants are not characterized — no information about their expertise, background, or demographics."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": true,
    260         "answer": false,
    261         "justification": "No inclusion or exclusion criteria for the 6 participants are stated."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "This is not a between-subjects experiment requiring randomization — all participants evaluated the same tool documents."
    267       },
    268       "blinding_described": {
    269         "applies": true,
    270         "answer": false,
    271         "justification": "No blinding details are provided for the user study. It's unclear whether participants knew the ratio of malicious to benign tools."
    272       },
    273       "attrition_reported": {
    274         "applies": true,
    275         "answer": false,
    276         "justification": "No information about whether all 6 participants completed the study or if any were excluded."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Appendix B reports attack costs: gradient-free R requires 1 LLM query and S requires ~18 LLM queries; gradient-based R requires ~1 GPU-minute and S requires ~8 GPU-hours on one NVIDIA A800 GPU."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Compute budget stated in Appendix B: gradient-based requires ~8 GPU-hours on one NVIDIA A800 GPU for S optimization. Gradient-free requires ~19 LLM queries total."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single runs."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is not explicitly stated. It is unclear whether results are from single or multiple optimization/evaluation runs."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "While hyperparameter sensitivity is explored (α, β in Figure 9), no search budget is reported for finding the default values (α=2.0, β=0.1)."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "The paper shows ablation results across multiple α and β values (Figure 9) and reports that ASR remains above 95% for a range of values, justifying the chosen defaults."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors implement both their attack and the baselines (JudgeDeceiver, PoisonedRAG) without acknowledging potential bias in re-implementing competitors' methods."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No comparison of performance at matched compute budgets between ToolHijacker and baselines. The gradient-based method uses 8 GPU-hours while baselines may use significantly less."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper does not discuss whether MetaTool and ToolBench adequately represent real-world tool selection scenarios. No discussion of construct validity."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "The paper evaluates a specific tool selection pipeline, not comparing models across different scaffolds. The pipeline IS the thing being attacked."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "This paper tests attack effectiveness, not model knowledge. Temporal leakage of benchmark solutions is not relevant to measuring whether an LLM selects a malicious tool."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "Not applicable — the evaluation measures attack success on tool selection, not model capability."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": true,
    352         "justification": "The paper explicitly ensures independence between shadow and target descriptions (Q ∩ Q' = ∅) and evaluates on target tasks not used during optimization. Table XII shows 0% ASR on non-target tasks."
    353       },
    354       "leakage_detection_method": {
    355         "applies": false,
    356         "answer": false,
    357         "justification": "Not applicable — the paper is not evaluating pre-trained model knowledge on benchmarks."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "ToolHijacker achieves high attack success rates across different LLMs, with gradient-free achieving 96.7% ASR on GPT-4o (MetaTool) using Llama-3.3-70B as shadow LLM.",
    364       "evidence": "Table I shows ASRs across 8 LLMs and 2 datasets. GPT-4o gradient-free ASR = 96.7% on MetaTool, 88.2% on ToolBench.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "ToolHijacker significantly outperforms existing prompt injection attacks when applied to tool selection.",
    369       "evidence": "Table III: gradient-free achieves 96.7% vs best baseline PoisonedRAG at 39.3% on MetaTool with GPT-4o. All 7 baselines are far below ToolHijacker.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Prevention-based defenses (StruQ, SecAlign) fail to defend against ToolHijacker.",
    374       "evidence": "Table IX: gradient-free achieves 99.6% ASR under StruQ on MetaTool. SecAlign reduces ASR to 97.5% on MetaTool. Both defenses are insufficient.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Detection-based defenses are insufficient, with known-answer detection and DataSentinel having FNRs of 90% or higher.",
    379       "evidence": "Table X: known-answer detection has 100% FNR for both attacks. DataSentinel has 100% FNR for gradient-free, 90% for gradient-based. PPL detects some gradient-based attacks but misses 90%.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "The attack is targeted with minimal impact on non-target tasks.",
    384       "evidence": "Table XII: gradient-free achieves 0% ASR and 0.22% AHR on non-target tasks; gradient-based achieves 0.11% ASR and 4% AHR.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "Humans struggle to detect malicious tool documents generated by ToolHijacker.",
    389       "evidence": "Table XVII: 6 participants failed to detect ≥71% of malicious tools (FNR 71-100%) while incorrectly flagging 5.6-30.35% of benign tools.",
    390       "supported": "weak"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Tiny user study sample",
    396       "detail": "The human detection study uses only 6 participants with no reported demographics, recruitment criteria, or statistical analysis. This is insufficient to draw conclusions about human detection ability."
    397     },
    398     {
    399       "flag": "No variance or uncertainty reporting",
    400       "detail": "All results are point estimates with no error bars, confidence intervals, or variance across runs. Given the stochastic nature of LLM inference, results could vary across runs."
    401     },
    402     {
    403       "flag": "No limitations section",
    404       "detail": "The paper has no dedicated limitations section and does not discuss scope boundaries. The attack is only tested on one specific tool selection architecture (retrieval + selection with dual encoders) but is presented broadly."
    405     },
    406     {
    407       "flag": "Self-comparison bias in baseline implementations",
    408       "detail": "The authors implement competitor baselines (JudgeDeceiver, PoisonedRAG) themselves without acknowledging potential bias. These methods were designed for different problem settings and may be disadvantaged."
    409     },
    410     {
    411       "flag": "Closed-source model versions unspecified",
    412       "detail": "GPT-3.5, GPT-4o, Claude-3-Haiku, Claude-3.5-Sonnet are listed without snapshot dates or API versions. Model behavior can change across versions, affecting reproducibility."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "SWE-agent: Agent-computer interfaces enable automated software engineering",
    418       "authors": ["J. Yang", "C. E. Jimenez", "A. Wettig"],
    419       "year": 2024,
    420       "arxiv_id": "2405.15793",
    421       "relevance": "Key agent framework for code-level software engineering tasks."
    422     },
    423     {
    424       "title": "MetaGPT: Meta programming for multi-agent collaborative framework",
    425       "authors": ["S. Hong", "X. Zheng", "J. Chen"],
    426       "year": 2023,
    427       "arxiv_id": "2308.00352",
    428       "relevance": "Multi-agent collaborative framework for software development."
    429     },
    430     {
    431       "title": "Gorilla: Large language model connected with massive APIs",
    432       "authors": ["S. G. Patil", "T. Zhang", "X. Wang"],
    433       "year": 2023,
    434       "arxiv_id": "2305.15334",
    435       "relevance": "LLM agent connected with massive APIs for tool use."
    436     },
    437     {
    438       "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs",
    439       "authors": ["Y. Qin", "S. Liang", "Y. Ye"],
    440       "year": 2023,
    441       "arxiv_id": "2307.16789",
    442       "relevance": "Major benchmark for LLM tool use with 16K+ APIs, used as evaluation dataset in this paper."
    443     },
    444     {
    445       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    446       "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra"],
    447       "year": 2023,
    448       "relevance": "Foundational work on indirect prompt injection in LLM-integrated applications."
    449     },
    450     {
    451       "title": "PoisonedRAG: Knowledge poisoning attacks to retrieval-augmented generation of large language models",
    452       "authors": ["W. Zou", "R. Geng", "B. Wang", "J. Jia"],
    453       "year": 2024,
    454       "arxiv_id": "2402.07867",
    455       "relevance": "Key baseline: adversarial text injection into RAG knowledge bases."
    456     },
    457     {
    458       "title": "Optimization-based prompt injection attack to LLM-as-a-Judge",
    459       "authors": ["J. Shi", "Z. Yuan", "Y. Liu"],
    460       "year": 2024,
    461       "relevance": "JudgeDeceiver - gradient-optimized prompt injection baseline for LLM judging."
    462     },
    463     {
    464       "title": "StruQ: Defending against prompt injection with structured queries",
    465       "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"],
    466       "year": 2024,
    467       "arxiv_id": "2402.06363",
    468       "relevance": "Prevention-based defense against prompt injection evaluated in this paper."
    469     },
    470     {
    471       "title": "Aligning LLMs to be robust against prompt injection",
    472       "authors": ["S. Chen", "A. Zharmagambetov", "S. Mahloujifar"],
    473       "year": 2024,
    474       "arxiv_id": "2410.05451",
    475       "relevance": "SecAlign defense against prompt injection via preference optimization."
    476     },
    477     {
    478       "title": "DataSentinel: A game-theoretic detection of prompt injection attacks",
    479       "authors": ["Y. Liu", "Y. Jia", "J. Jia", "D. Song", "N. Z. Gong"],
    480       "year": 2025,
    481       "relevance": "State-of-the-art detection-based defense against prompt injection."
    482     },
    483     {
    484       "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    485       "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic"],
    486       "year": 2024,
    487       "relevance": "Comprehensive evaluation framework for prompt injection in LLM agents."
    488     },
    489     {
    490       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    491       "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"],
    492       "year": 2024,
    493       "arxiv_id": "2403.02691",
    494       "relevance": "Benchmark for indirect prompt injection in tool-using LLM agents."
    495     },
    496     {
    497       "title": "From allies to adversaries: Manipulating LLM tool-calling through adversarial injection",
    498       "authors": ["H. Wang", "R. Zhang", "J. Wang"],
    499       "year": 2024,
    500       "arxiv_id": "2412.10198",
    501       "relevance": "Adversarial manipulation of LLM tool calling mechanisms."
    502     },
    503     {
    504       "title": "A critical evaluation of defenses against prompt injection attacks",
    505       "authors": ["Y. Jia", "Z. Shao", "Y. Liu"],
    506       "year": 2025,
    507       "arxiv_id": "2505.18333",
    508       "relevance": "Critical evaluation showing defenses sacrifice general capabilities and remain vulnerable to adaptive attacks."
    509     },
    510     {
    511       "title": "Defeating prompt injections by design",
    512       "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan"],
    513       "year": 2025,
    514       "arxiv_id": "2503.18813",
    515       "relevance": "Security policy enforcement approach to prevent prompt injection in LLM agents."
    516     }
    517   ]
    518 }

Impressum · Datenschutz