ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28225B)


      1 {
      2   "paper": {
      3     "title": "IPIGuard: A Novel Tool Dependency Graph-Based Defense Against Indirect Prompt Injection in LLM Agents",
      4     "authors": [
      5       "Hengyu An",
      6       "Jinghuai Zhang",
      7       "Tianyu Du",
      8       "Chunyi Zhou",
      9       "Qingming Li",
     10       "Tao Lin",
     11       "Shouling Ji"
     12     ],
     13     "year": 2025,
     14     "venue": "Conference on Empirical Methods in Natural Language Processing",
     15     "arxiv_id": "2508.15310",
     16     "doi": "10.48550/arXiv.2508.15310"
     17   },
     18   "scan_version": 2,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "IPIGUARD introduces a Tool Dependency Graph (TDG) paradigm that decouples action planning from external data interaction to defend LLM agents against Indirect Prompt Injection attacks. On the AgentDojo benchmark, averaged over four attack types on GPT-4o-mini, IPIGUARD achieves the lowest average attack success rate (0.69%) while maintaining the highest utility under attack (58.77%), outperforming four baseline defenses; attack success rate also stays low across six LLMs under the Important Instruction attack. The ablation study confirms that both Fake Tool Invocation and Node Expansion mechanisms contribute complementary benefits, and using a stronger LLM for planning with a weaker one for execution offers a cost-effective configuration.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "GitHub repository URL provided: https://github.com/Greysahy/ipiguard (footnote 1 in abstract)."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Uses the publicly available AgentDojo benchmark (https://agentdojo.spylab.ai), referenced in Section 4.1."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "No requirements.txt, Dockerfile, or detailed environment setup section found. Appendix C lists model versions and temperature but not software dependencies."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions in the paper. A code repository is provided but the paper itself does not include a 'Reproducing Results' section or commands to run."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "All results in Tables 1-3 and Figures 4-5 are point estimates with no confidence intervals, error bars, or ± notation."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Multiple claims that IPIGUARD outperforms baselines (e.g., 'consistently achieves the lowest ASR') are based on comparing raw numbers without any statistical significance test."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Results provide percentage improvements with baseline context throughout, e.g., IPIGUARD average ASR 0.69% vs no-defense 13.16%, and UA 58.77% vs no-defense 54.30% (Table 1). Enough context for readers to assess magnitude."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "No justification for why 97 tasks with 629 test cases from AgentDojo is sufficient for the claims being made. No power analysis."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "Temperature is fixed to 0 (Appendix C) to reduce variance, but no variance or standard deviation is reported across runs. Single-run results only."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Four baseline defenses compared: Detector, Tool Filter, Spotlight, and Sandwich, plus a no-defense baseline (Section 4.1, Table 1)."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include recent methods: Spotlight (Hines et al., 2024), Detector (ProtectAI.com, 2024), Sandwich (Learn Prompting, 2024), and Tool Filter (Willison, 2023b). All are from 2023-2024, contemporary with this 2025 paper."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Table 3 presents an ablation study removing Fake Tool Invocation (FTI) and Node Expansion (NE) individually and in combination, measuring BU, UA, and ASR."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "Three evaluation metrics used: Benign Utility (BU), Utility under Attack (UA), and Targeted Attack Success Rate (ASR), as defined in Section 4.1."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "Evaluation is entirely automated through the AgentDojo benchmark. No human evaluation of defense quality, task completion adequacy, or user experience."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "AgentDojo provides a standardized benchmark with 629 defined test cases across 97 tasks. The evaluation uses the benchmark's predefined test suite."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results broken down by all four AgentDojo scenarios (Workspace, Slack, Travel, Banking) in Table 1 and Figures 4-5."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 4.2.1 discusses lower Workspace utility due to conservative handling. Section 4.2.2 acknowledges 'ASR is not exactly zero because the fake tool invocation may fail in rare corner cases.' Case studies in Appendix H illustrate both successes and the mechanism's behavior."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "Lower Workspace utility acknowledged (Section 4.2.1). Ablation shows NE alone increases ASR (Table 3). Limitations section identifies three specific shortcomings of the approach."
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "The abstract's claim of a 'superior balance between effectiveness and robustness' is supported by Table 1, which shows the lowest average ASR (0.69%) together with the highest UA (58.77%). The claim of working across 'four attack scenarios and six different LLMs' is supported by Table 1 and Figure 4."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims like 'IPIGUARD significantly reduces unintended tool invocations' are supported by the ablation study (Table 3) which shows controlled single-variable manipulation of FTI and NE components. The ablation design is adequate for causal inference about component contributions."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "Abstract claims 'paving the way for the development of safer agentic systems in dynamic environments' and the broad title extend well beyond the tested setting (one benchmark, four attack types). Results are limited to AgentDojo's four domains and specific attack patterns, but framing suggests general applicability to all IPI defense scenarios."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": false,
    138         "justification": "No substantive discussion of alternative explanations for the results. For example, no discussion of whether AgentDojo's specific task structure favors pre-planning approaches, or whether the improvements come primarily from restricting the action space rather than the TDG mechanism specifically."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "Claims match the granularity of measurements. The paper measures ASR, BU, and UA on AgentDojo and makes claims about defense effectiveness on that benchmark. No broader proxy gap — they don't claim to measure 'security' in general, but specific attack success rates."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Appendix C specifies exact versions: 'GPT-4o (gpt-4o-2024-05-13), GPT-4o-mini (gpt-4o-mini-2024-07-18), Claude 3.5 Sonnet (claude-3-5-sonnet-20241022) and o4-mini (o4-mini-2025-04-16).' Qwen model versions are specified by name (Qwen2.5-7B-Instruct, Qwen3-32B)."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "Full prompt templates provided in Appendix A for TDG Construction, Argument Estimation, and Node Expansion, including the actual instruction text and output format specifications."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Appendix C states: 'we fix the decoding temperature to 0 for all models. Unless otherwise specified, we set the reasoning effort level to medium.'"
    161       },
    162       "scaffolding_described": {
    163         "applies": true,
    164         "answer": true,
    165         "justification": "The TDG framework is described in extensive detail in Section 3: node types (Deterministic, Pending, Query Expanded, Resolved), traversal mechanism, Argument Estimation, Node Expansion, and Fake Tool Invocation are all fully specified with workflow diagrams (Figures 1-3)."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "AgentDojo benchmark is used as-is with its predefined tasks and attack injections. Section 4.1 describes the benchmark setup (97 tasks, 629 test cases, four domains) and how attacks are configured."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "Dedicated 'Limitations' section after the conclusion with three specific limitations discussed."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Limitations are specific to this study: (1) focus on tool-based IPI only, not textual output manipulation, (2) cost constraints limiting the number of models tested (cannot evaluate o3), (3) requires models with strong planning capabilities, limiting applicability to weaker models."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": true,
    187         "justification": "Limitations section explicitly states what the results do NOT show: 'We focus on defending LLM agents against IPI attacks that interfere with tool usage, rather than those that solely manipulate textual outputs.' Also: 'our method requires access to models with reasonably strong planning capabilities.'"
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "No raw experimental data (individual test case results, model outputs, TDG plans) are released. Only aggregated metrics in tables and figures."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Uses AgentDojo benchmark with 97 tasks across four domains, 629 test cases combining user goals with adversarial injections (Section 4.1). Benchmark is well-documented."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Uses standard benchmark (AgentDojo) and LLM APIs."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "Pipeline is clear: user instruction → TDG construction → TDG traversal with argument estimation/node expansion/fake tool invocation → evaluation via AgentDojo metrics. Section 3 details each stage."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": true,
    216         "justification": "Acknowledgements section lists multiple funding sources: National Key R&D Program of China, NSFC-Yeqisun Science Foundation, NSFC, Key R&D Program of Ningbo, China Postdoctoral Science Foundation, and Zhejiang Provincial Postdoctoral Research Project."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations clearly listed: Zhejiang University, University of California Los Angeles, and Westlake University. No affiliation with any LLM provider being evaluated."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": true,
    226         "justification": "All funding sources are government/academic grants (NSFC, national R&D programs, postdoctoral foundations) with no financial interest in the outcome of LLM defense research."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests or financial interests statement found in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "No training data cutoff dates stated for any of the six models used (GPT-4o, GPT-4o-mini, Claude 3.5 Sonnet, Qwen2.5-7B, Qwen3-32B, o4-mini). Task completion utility could be affected by training contamination."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": false,
    243         "justification": "No discussion of whether any models may have been trained on AgentDojo tasks or similar tool-use scenarios that overlap with the benchmark."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": false,
    248         "justification": "AgentDojo was published in 2024. Models trained after that date could have seen it. No discussion of this contamination risk, particularly relevant for utility metrics."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants in this study."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants in this study."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": true,
    292         "justification": "Table 2 reports average token usage (input/output) and task completion time for all defense methods. Table 5 reports estimated dollar costs (EC) for different planner-executor configurations."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": true,
    297         "justification": "Table 5 reports estimated costs for completing all tasks under different configurations (e.g., $6.73 for GPT-4o-mini as both planner and executor). Table 2 provides token usage breakdown."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "Temperature is fixed to 0 (Appendix C) to reduce variance, but no multi-seed or multi-run sensitivity analysis is reported. API models can still produce non-deterministic outputs at temperature 0."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs is never explicitly stated. Results appear to be from single runs."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No mention of hyperparameter search budget. Prompt templates and configuration choices appear fixed without describing how they were selected."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The final IPIGUARD configuration (with both FTI and NE) is presented as the default without explaining how this specific combination of prompt templates and mechanisms was arrived at."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": false,
    323         "answer": false,
    324         "justification": "No statistical tests are performed at all, so multiple comparison correction is moot."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "Authors implement all baseline defenses through the AgentDojo framework but do not acknowledge the bias of evaluating their own system against their own implementations of baselines."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": true,
    334         "justification": "Table 2 explicitly compares compute overhead (token usage, time) against defense performance. Section 4.2.3 discusses the trade-off: 'approximately a twofold increase in token usage' for 'substantial gains in robustness.' Table 5 further explores cost-performance via planner/executor splits."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": false,
    339         "justification": "No discussion of whether AgentDojo adequately represents real-world IPI attack scenarios. The paper uses the benchmark without questioning whether its four domains and attack types capture the breadth of IPI threats in practice."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "IPIGUARD fundamentally changes the agent's execution scaffold (adding TDG planning and constrained execution), but comparisons with baselines that use different scaffolding approaches do not discuss this confound. The performance differences may partly reflect scaffolding differences rather than the TDG mechanism specifically."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": false,
    351         "justification": "No discussion of temporal leakage. AgentDojo tasks may have been seen during model training for newer models like o4-mini."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks information. The TDG planning phase receives full task descriptions and tool descriptions, but whether this provides unrealistic advantages compared to real deployment is not discussed."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of whether the 629 test cases are independent or share structural similarities (e.g., same domains, similar tool chains)."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection or prevention method used."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "IPIGUARD achieves the lowest average Attack Success Rate (0.69%) across all four attacks while maintaining the highest Utility under Attack (58.77%) among all defense methods.",
    373       "evidence": "Table 1 shows averaged results across four attacks on GPT-4o-mini: IPIGUARD ASR 0.69% vs next-best Tool Filter 2.06%, and UA 58.77% vs next-best Spotlight 55.09%.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "IPIGUARD maintains benign utility (67.01%) close to the no-defense upper bound (68.04%).",
    378       "evidence": "Figure 5 and Section 4.2.1 report BU scores across defense methods on GPT-4o-mini.",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "IPIGUARD generalizes across six different LLMs including both reasoning and non-reasoning models.",
    383       "evidence": "Figure 4 shows results under Important Instruction attack for GPT-4o, GPT-4o-mini, Claude 3.5 Sonnet, Qwen2.5-7B, Qwen3-32B, and o4-mini. ASR stays low across all models.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Both Fake Tool Invocation and Node Expansion are necessary and complementary components.",
    388       "evidence": "Table 3 ablation on GPT-4o-mini with Important Instruction: without both BU=52.58%/ASR=3.18%; NE alone raises BU to 64.95% but ASR to 4.77%; FTI alone reduces ASR to 0.32% but BU=51.55%; both together achieve BU=69.07%/ASR=0.64%.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Using a stronger LLM for planning significantly improves performance with only marginal cost increases.",
    393       "evidence": "Table 5 shows replacing Qwen2.5-7B planner with o4-mini raises BU from 35.05% to 51.55% and UA from 33.55% to 49.28%. GPT-4o-mini executor with o4-mini planner costs $7.99 vs $6.73 with GPT-4o-mini planner, but UA jumps from 57.07% to 64.39%.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "IPIGUARD incurs approximately a twofold increase in token usage compared to no defense.",
    398       "evidence": "Table 2: IPIGUARD uses 14,605 input tokens vs 6,165 for no defense, and 560 output tokens vs 179. Time is 13.88s vs 7.13s.",
    399       "supported": "strong"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "No statistical tests for any comparative claims",
    405       "detail": "All claims of superiority ('consistently achieves the lowest ASR', 'highest overall performance') are based on comparing point estimates without any significance test. With a single run and no variance reported, it is impossible to know whether observed differences are reliable."
    406     },
    407     {
    408       "flag": "Single benchmark evaluation",
    409       "detail": "All results are on AgentDojo alone. The paper claims general defense effectiveness against IPI attacks but evaluates on only one benchmark with four predefined domains and four attack types. Generalization to real-world IPI scenarios is untested."
    410     },
    411     {
    412       "flag": "Single-run results with no variance",
    413       "detail": "Despite temperature 0, API models can produce non-deterministic outputs. No multiple runs or variance measures are reported, making result stability unknown."
    414     },
    415     {
    416       "flag": "No contamination analysis",
    417       "detail": "Several evaluated models (especially o4-mini from 2025) may have seen AgentDojo tasks in training data. Utility metrics could be inflated by contamination, but this is never discussed."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    423       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    424       "year": 2024,
    425       "relevance": "Primary evaluation benchmark for IPI defense; defines attack-defense evaluation framework for tool-augmented LLM agents."
    426     },
    427     {
    428       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    429       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    430       "year": 2023,
    431       "relevance": "Foundational paper on indirect prompt injection attacks in LLM applications."
    432     },
    433     {
    434       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    435       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    436       "year": 2024,
    437       "arxiv_id": "2403.02691",
    438       "relevance": "Benchmark and attack method for indirect prompt injection in tool-augmented agents; one of the four attacks evaluated."
    439     },
    440     {
    441       "title": "Defending Against Indirect Prompt Injection Attacks with Spotlighting",
    442       "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"],
    443       "year": 2024,
    444       "arxiv_id": "2403.14720",
    445       "relevance": "Delimiter-based defense baseline for IPI attacks; compared as one of four baseline defenses."
    446     },
    447     {
    448       "title": "Melon: Indirect Prompt Injection Defense via Masked Re-Execution and Tool Comparison",
    449       "authors": ["Kaijie Zhu", "Xianjun Yang", "Jindong Wang", "Wenbo Guo", "William Yang Wang"],
    450       "year": 2025,
    451       "arxiv_id": "2502.05174",
    452       "relevance": "Recent IPI defense method using masked re-execution and tool comparison for detection."
    453     },
    454     {
    455       "title": "The Task Shield: Enforcing Task Alignment to Defend Against Indirect Prompt Injection in LLM Agents",
    456       "authors": ["Feiran Jia", "Tong Wu", "Xin Qin", "Anna Squicciarini"],
    457       "year": 2024,
    458       "arxiv_id": "2412.16682",
    459       "relevance": "LLM-as-a-judge defense paradigm for IPI attacks, monitoring intermediate steps for alignment with user intent."
    460     },
    461     {
    462       "title": "Can Indirect Prompt Injection Attacks Be Detected and Removed?",
    463       "authors": ["Yulin Chen", "Haoran Li", "Yuan Sui", "Yufei He", "Yue Liu", "Yangqiu Song", "Bryan Hooi"],
    464       "year": 2025,
    465       "arxiv_id": "2502.16580",
    466       "relevance": "Detection-based defense against IPI attacks using auxiliary models."
    467     },
    468     {
    469       "title": "Aligning LLMs to Be Robust Against Prompt Injection",
    470       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "Chuan Guo"],
    471       "year": 2024,
    472       "arxiv_id": "2410.05451",
    473       "relevance": "Training-based approach (RLHF/fine-tuning) to make LLMs robust against prompt injection."
    474     },
    475     {
    476       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    477       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R. Narasimhan", "Yuan Cao"],
    478       "year": 2023,
    479       "relevance": "Foundational agent framework for reasoning and acting with tools; represents the standard execution paradigm that IPIGUARD modifies."
    480     },
    481     {
    482       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    483       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì", "Roberta Raileanu", "Maria Lomeli", "Eric Hambro", "Luke Zettlemoyer", "Nicola Cancedda", "Thomas Scialom"],
    484       "year": 2023,
    485       "relevance": "Key work on LLM tool-use capabilities; relevant to understanding tool-augmented agent vulnerabilities."
    486     },
    487     {
    488       "title": "Benchmarking and Defending Against Indirect Prompt Injection Attacks on Large Language Models",
    489       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    490       "year": 2023,
    491       "arxiv_id": "2312.14197",
    492       "relevance": "Early benchmark and defense methods for indirect prompt injection in LLMs."
    493     },
    494     {
    495       "title": "Ignore Previous Prompt: Attack Techniques for Language Models",
    496       "authors": ["Fábio Perez", "Ian Ribeiro"],
    497       "year": 2022,
    498       "arxiv_id": "2211.09527",
    499       "relevance": "Foundational prompt injection attack technique; one of the four attack types evaluated in this paper."
    500     }
    501   ]
    502 }

Impressum · Datenschutz