ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30568B)


      1 {
      2   "paper": {
      3     "title": "DRIFT: Dynamic Rule-Based Defense with Injection Isolation for Securing LLM Agents",
      4     "authors": [
      5       "Hao Li",
      6       "Xiaogeng Liu",
      7       "Hung-Chun Chiu",
      8       "Dianqi Li",
      9       "Ning Zhang",
     10       "Chaowei Xiao"
     11     ],
     12     "year": 2025,
     13     "venue": "NeurIPS 2025",
     14     "arxiv_id": "2506.12104",
     15     "doi": "10.48550/arXiv.2506.12104"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "DRIFT reduces Attack Success Rate on GPT-4o-mini from 30.7% to 1.3% on AgentDojo while maintaining significantly higher utility than static-policy defenses like CaMeL (58.5% vs 38.4% benign utility). Dynamic policies are shown to be critical for complex tasks—static policies degrade sharply at trajectory lengths of 3+, while dynamic policies remain stable. The framework generalizes across 5 LLMs (GPT-4o, GPT-4o-mini, Claude-3.5-sonnet, Claude-3-haiku, Qwen2.5-7B-Instruct), consistently reducing ASR to single digits. Policy training via LoRA on Qwen2.5-7B achieves 0% ASR with improved utility.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": true,
     26         "justification": "Code is released at https://github.com/SaFoLab-WISC/DRIFT, stated in the abstract and confirmed in the NeurIPS checklist."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": true,
     31         "justification": "They use publicly available benchmarks (AgentDojo, ASB) and state 'We will release our DRIFT training dataset' in the NeurIPS checklist. The benchmarks themselves are open-sourced."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. Training details (batch size, optimizer, learning rate) are mentioned in Section 3.1, but no software environment or dependency specifications are given."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper describes implementation details in Section 3.1 but does not provide step-by-step reproduction instructions. Code is released in supplementary materials but the paper lacks a 'Reproducing Results' section."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "All results are point estimates without confidence intervals or error bars. The NeurIPS checklist item 7 explicitly answers 'No' for error bars."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "No statistical significance tests are used. Claims like 'DRIFT outperforms CaMeL by 20.1%' are based on comparing point estimates without any tests."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Effect sizes are given as absolute and relative differences with baseline context: e.g., 'ASR is successfully reduced from 30.7% to 1.3%', 'utility outperforms CaMeL by 20.1% under no attack' (Section 3.2). Both baseline and DRIFT numbers are provided throughout."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification for the sample sizes. AgentDojo has 97 user tasks and 629 security test cases; ASB has 10 scenarios. No power analysis or discussion of whether these are sufficient."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "All results appear to be single-run with no standard deviations, variance, or multi-run analysis. The NeurIPS checklist confirms no error bars are reported."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Multiple baselines compared: repeat_user_prompt, spotlighting_with_delimiting, tool_filter, transformers_pi_detector, CaMeL, Progent on AgentDojo (Figure 3); delimiters_defense, ob_sandwich_defense, instructional_prevention, Progent on ASB (Figure 4)."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include CaMeL (2025) and Progent (2025), both concurrent system-level defenses. Other baselines span 2023-2025."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Table 1 presents a thorough ablation: Native Agent → +Planner → +Planner+Validator → +Planner+Validator+Isolator, plus Isolator alone. Each component's contribution is measured."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": true,
     90         "justification": "Three metrics are used throughout: Benign Utility (task completion without attacks), Utility Under Attack (task completion with attacks), and Attack Success Rate (ASR)."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "All evaluation is fully automated using benchmark success criteria. No human evaluation of agent outputs, defense decisions, or utility quality."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Training data is collected from ToolBench (Section 2.5.1) while evaluation uses AgentDojo and ASB, which are completely separate benchmarks."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Appendix D (Tables 6-8) provides per-scenario breakdowns across Banking, Slack, Travel, and Workspace for all models. Figure 6a also shows per-session comparison."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Section C.1 analyzes failure on open-ended tasks (Table 4 shows DRIFT achieves 17.6% vs base agent's 25.7%). Section C.3 provides a detailed case study. Section 3.5 shows where static policies fail (trajectory length ≥3)."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Table 1 shows the strict static policy (Planner only) causes severe utility drops (63.55% → 37.71%). Table 4 shows DRIFT reduces open-ended task completion. Claude-3-haiku shows limited utility improvement (Table 6). These negative aspects are reported transparently."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "Abstract claims (ASR reduced from 30.7% to 1.3%, utility outperforms CaMeL) are supported by Figure 3 and detailed results in Section 3.2. Cross-model generalization claims are supported by Figure 5."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "Causal claims about each component's contribution are supported by the ablation study (Table 1), which uses controlled single-variable manipulation. Each component is added incrementally, isolating its effect."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The abstract specifies evaluation on 'AgentDojo and ASB benchmark' with specific models. The limitations section (Appendix A) explicitly states 'the benchmark domains are limited and do not fully cover the diverse tasks and attack scenarios encountered in real-world agentic systems.'"
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "No discussion of alternative explanations. For example, the utility improvements could partly stem from additional LLM reasoning calls rather than the security mechanism itself. The paper doesn't consider confounding factors."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Metrics (Benign Utility, Utility Under Attack, ASR) are clearly defined and match the granularity of claims. The paper doesn't inflate 'success on AgentDojo tasks' into broader claims about real-world security beyond what's measured."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "GPT-4o-mini is specified as 'GPT-4o-mini-2024-07-18' (Section 3.1), but GPT-4o has no snapshot date, Claude-3.5-sonnet and Claude-3-haiku lack version identifiers. Per schema, 'GPT-4o' without a snapshot date does not count."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompt texts are provided in Appendix E (Figures 8-13) for the Secure Planner, Privilege Assignment, Intent Alignment Validation, Injection Detection, Planning Sampling, and Injection Sampling."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "Training hyperparameters are reported (batch size 4, 3 epochs, Adam optimizer, lr 2e-5 in Section 3.1). However, LLM API inference settings (temperature, top-p, max tokens) for the online models are not reported."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "The agentic scaffolding is described in detail across Sections 2.1-2.4, with workflow diagrams (Figures 1, 2), including the Secure Planner, Dynamic Validator, Injection Isolator, and their interaction with the agent and tools."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 2.5.1 describes the full data construction pipeline: ToolBench conversations are modified using GPT-4o-mini for both Planner and Isolator training data. Tool environment reconstruction adds 0-25 extra tools. 1,000 samples collected for each component."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Appendix A contains a dedicated 'Limitations' section discussing benchmark domain limitations."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "The limitations section states only that 'the benchmark domains are limited and do not fully cover the diverse tasks and attack scenarios encountered in real-world agentic systems.' This is close to boilerplate — it doesn't identify specific threats like benchmark overfitting, GPT-4o-mini's weaknesses affecting generalization, or whether the training data generation pipeline introduces biases."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The limitations section does not enumerate specific untested scenarios, attack types, deployment contexts, or populations. 'Diverse tasks and attack scenarios' is vague. No equivalent of 'What the evidence does not show.'"
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw experimental data (per-task results, logs, model outputs) is released. Only aggregate metrics are presented in figures and tables."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 2.5.1 describes how training data was collected from ToolBench: assistant messages rewritten by GPT-4o-mini, injection content simulated and placed by GPT-4o-mini. Evaluation uses standard benchmarks with described setups."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. Data comes from standard benchmarks (AgentDojo, ASB) and ToolBench."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Section 2.5.1 documents the pipeline: ToolBench conversations → GPT-4o-mini rewrites assistant messages → 1,000 Planner samples and 1,000 Isolator samples collected → tool environment augmented with 0-25 extra tools from 10,000+ unique tools across 5,000 samples."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Acknowledgments section states: 'This project is partially supported by Schmidt Science AI2050 Early Career Fellow and Open philanthropy.'"
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations clearly listed: Washington University in St. Louis, Johns Hopkins University, and Independent Researchers. No conflicts with evaluated products."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "Schmidt Science and Open Philanthropy are independent foundations with no financial stake in the evaluated defense mechanism or the models used."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement is present in the paper. Absence of disclosure is not absence of conflict."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This paper tests defense mechanisms against prompt injection attacks, not model knowledge or capability on benchmarks. Contamination of benchmark tasks in training data is not the primary concern."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "Paper evaluates a defense system, not pre-trained model capabilities. Any contamination would affect all compared methods equally."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Paper evaluates defense effectiveness, not model knowledge. Benchmark contamination is not structurally relevant to the defense comparison."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in this study. Evaluation is entirely automated via benchmarks."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Table 3 reports total token usage for DRIFT (2.37M tokens) and all baselines on AgentDojo, plus an efficiency metric (Utility−ASR)/Total Tokens. DRIFT uses ~1.89× more tokens than the undefended agent."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Token usage for inference is reported in Table 3, but no GPU hours, training time, or total API spend for the LoRA fine-tuning of Qwen2.5-7B-Instruct is provided."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "All results appear to be single-run. No seed sensitivity analysis or multi-seed results are reported."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The number of experimental runs is never stated. Results are presented as point estimates without indication of how many runs produced them."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "No hyperparameter search budget is reported. Training hyperparameters (lr, batch size, epochs) are stated but not how they were selected."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "No explanation of how the final configuration was selected or whether alternative configurations were tried."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "Multiple comparisons across 5 models, 2 benchmarks, 4 scenarios, and multiple baselines are made without any correction for multiple testing."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "Authors implement and evaluate their own system against baselines without acknowledging potential self-comparison bias. No independent evaluation."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": true,
    333         "justification": "Table 3 compares DRIFT's performance against token cost for all methods, and computes an efficiency metric. DRIFT's cost relative to performance is explicitly analyzed."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "No discussion of whether AgentDojo or ASB actually measure real-world agent security. The limitations section notes benchmark domains are limited but does not question construct validity of the benchmarks themselves."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": true,
    343         "justification": "Model comparisons in Figure 5 use ReAct as the consistent baseline scaffold across all models. DRIFT is applied as a system-level addition on top of the same base scaffold, controlling for scaffold confounds."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether evaluated models may have seen AgentDojo or ASB benchmark tasks during training, or whether the benchmarks were created before model training cutoffs."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "No discussion of whether the evaluation setup leaks information about attack presence or correct task execution."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of independence between training data (ToolBench) and evaluation data (AgentDojo), or structural similarities that might bias results."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention method is applied."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "DRIFT reduces ASR from 30.7% to 1.3% on GPT-4o-mini while outperforming CaMeL in utility by 20.1% (no attack) and 12.5% (under attack) on AgentDojo.",
    372       "evidence": "Figure 3 and Section 3.2 show DRIFT achieves 1.3% ASR vs CaMeL's 0.0%, with utility of 58.5% vs 38.4% (no attack) and 47.9% vs 35.4% (under attack).",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "DRIFT generalizes across 5 LLMs, consistently reducing ASR to single-digit levels while maintaining utility.",
    377       "evidence": "Figure 5 shows ASR reductions across GPT-4o (51.7%→1.5%), GPT-4o-mini (30.7%→1.3%), Claude-3.5-sonnet (37.1%→4.4%), Claude-3-haiku (11.2%→3.0%), Qwen2.5-7B (15.1%→0.0%). Utility is preserved or improved in most cases.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "Dynamic policies are necessary for complex tasks — static policies cause sharp utility drops at trajectory length ≥3.",
    382       "evidence": "Figure 6b shows static policy success rate drops sharply from ~55% (length 2) to ~25% (length 3), while dynamic policy remains stable at ~50-55%.",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "The Injection Isolator is necessary for defending against attacks that don't alter the tool-call trajectory.",
    387       "evidence": "Section C.3 provides a case study where injected content affects only the final answer text, not tool calls. Table 1 ablation shows Isolator reduces ASR from 3.66% to 1.29%.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Policy training on Qwen2.5-7B achieves 0% ASR with improved utility compared to untuned DRIFT.",
    392       "evidence": "Figure 5 shows tuned Qwen2.5-7B achieves 0% ASR (from 15.1% undefended), while utility improves from 26.6% to 32.2% (no attack) and 19.1% to 22.2% (under attack).",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "DRIFT is robust to adaptive attacks, with ASR increasing by only 0.81% under combined isolator and validator adaptive attacks.",
    397       "evidence": "Table 2 shows IAA+VAA results in 2.10% ASR vs 1.29% baseline, and PAIR attack yields 1.60% ASR. Utility remains stable.",
    398       "supported": "moderate"
    399     },
    400     {
    401       "claim": "DRIFT is more cost-efficient than other policy-based defenses, using 2.37M tokens vs CaMeL's 6.09M.",
    402       "evidence": "Table 3 shows token usage comparison across all defenses. DRIFT's efficiency metric (19.7) exceeds CaMeL (5.8) and Progent (13.9).",
    403       "supported": "strong"
    404     }
    405   ],
    406   "red_flags": [
    407     {
    408       "flag": "No error bars or variance across runs",
    409       "detail": "All results throughout the paper are single-run point estimates. The NeurIPS checklist explicitly acknowledges this (question 7: 'No'). Without variance estimates, it is impossible to assess whether observed differences are meaningful or due to random fluctuation."
    410     },
    411     {
    412       "flag": "No statistical significance tests",
    413       "detail": "Comparisons across methods and models rely entirely on comparing point estimates. Claims like 'DRIFT outperforms Progent' are based on numeric differences that could be within noise, especially on smaller scenario subsets (e.g., 10 ASB scenarios)."
    414     },
    415     {
    416       "flag": "Arbitrary efficiency metric",
    417       "detail": "Table 3 defines efficiency as (Utility - ASR) / Total Tokens. This metric conflates utility and security into a single number with no theoretical justification for why subtraction and division are appropriate. Different weighting would change rankings."
    418     },
    419     {
    420       "flag": "Training data quality concerns",
    421       "detail": "Policy training data for both Planner and Isolator is generated by GPT-4o-mini rewriting ToolBench conversations. Potential quality issues and biases introduced by this synthetic data generation pipeline are not discussed."
    422     },
    423     {
    424       "flag": "Limited adaptive attack evaluation",
    425       "detail": "Adaptive attacks (Section 3.6) are manually crafted or use PAIR on sampled cases. A more systematic adaptive attack (e.g., targeting the specific prompt structures in Figures 8-13) could be more effective. The paper doesn't explore attacks that exploit knowledge of the full system architecture."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Agentdojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    431       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    432       "year": 2024,
    433       "relevance": "Primary evaluation benchmark for prompt injection attack/defense in LLM agents."
    434     },
    435     {
    436       "title": "Defeating prompt injections by design",
    437       "authors": ["Edoardo Debenedetti", "Ilia Shumailov", "Tianqi Fan", "Jamie Hayes", "Nicholas Carlini", "Daniel Fabian", "Christoph Kern", "Chongyang Shi", "Andreas Terzis", "Florian Tramèr"],
    438       "year": 2025,
    439       "arxiv_id": "2503.18813",
    440       "relevance": "CaMeL static policy defense — primary baseline and comparison point for system-level prompt injection defense."
    441     },
    442     {
    443       "title": "Progent: Programmable privilege control for LLM agents",
    444       "authors": ["Tianneng Shi", "Jingxuan He", "Zhun Wang", "Linyu Wu", "Hongwei Li", "Wenbo Guo", "Dawn Song"],
    445       "year": 2025,
    446       "arxiv_id": "2504.11703",
    447       "relevance": "Concurrent dynamic policy-based defense for LLM agents, directly compared with DRIFT."
    448     },
    449     {
    450       "title": "IsolateGPT: An execution isolation architecture for LLM-based agentic systems",
    451       "authors": ["Yuhao Wu", "Franziska Roesner", "Tadayoshi Kohno", "Ning Zhang", "Umar Iqbal"],
    452       "year": 2025,
    453       "relevance": "System-level isolation defense for LLM agents, addresses cross-application information leakage."
    454     },
    455     {
    456       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    457       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    458       "year": 2024,
    459       "relevance": "Benchmark for indirect prompt injection attacks in tool-integrated LLM agents."
    460     },
    461     {
    462       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    463       "authors": ["Sahar Abdelnabi", "Kai Greshake", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    464       "year": 2023,
    465       "relevance": "Foundational work on indirect prompt injection attacks against real-world LLM applications."
    466     },
    467     {
    468       "title": "Neural exec: Learning (and learning from) execution triggers for prompt injection attacks",
    469       "authors": ["Dario Pasquini", "Martin Strohmeier", "Carmela Troncoso"],
    470       "year": 2024,
    471       "relevance": "Advanced prompt injection attack using learned execution triggers."
    472     },
    473     {
    474       "title": "StruQ: Defending against prompt injection with structured queries",
    475       "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David A. Wagner"],
    476       "year": 2024,
    477       "arxiv_id": "2402.06363",
    478       "relevance": "Model-level defense transforming queries to structured form to resist prompt injection."
    479     },
    480     {
    481       "title": "SecAlign: Defending against prompt injection with preference optimization",
    482       "authors": ["Sizhe Chen", "Arman Zharmagambetov", "Saeed Mahloujifar", "Kamalika Chaudhuri", "David Wagner", "Chuan Guo"],
    483       "year": 2024,
    484       "arxiv_id": "2410.05451",
    485       "relevance": "Model-level defense using preference optimization to mitigate prompt injection."
    486     },
    487     {
    488       "title": "PiGuard: Prompt injection guardrail via mitigating overdefense for free",
    489       "authors": ["Hao Li", "Xiaogeng Liu", "Ning Zhang", "Chaowei Xiao"],
    490       "year": 2025,
    491       "relevance": "Prompt injection detection guardrail that addresses overdefense trade-offs."
    492     },
    493     {
    494       "title": "System-level defense against indirect prompt injection attacks: An information flow control perspective",
    495       "authors": ["Fangzhou Wu", "Ethan Cecchetti", "Chaowei Xiao"],
    496       "year": 2024,
    497       "arxiv_id": "2409.19091",
    498       "relevance": "Information flow control approach to defending LLM agents against indirect prompt injection."
    499     },
    500     {
    501       "title": "RTBAS: defending LLM agents against prompt injection and privacy leakage",
    502       "authors": ["Peter Yong Zhong", "Siyuan Chen", "Ruiqi Wang", "McKenna McCall", "Ben L. Titzer", "Heather Miller", "Phillip B. Gibbons"],
    503       "year": 2025,
    504       "arxiv_id": "2502.08966",
    505       "relevance": "Runtime defense system for LLM agents against prompt injection and privacy leakage."
    506     },
    507     {
    508       "title": "ReAct: Synergizing reasoning and acting in language models",
    509       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du", "Izhak Shafran", "Karthik R. Narasimhan", "Yuan Cao"],
    510       "year": 2023,
    511       "relevance": "Foundational agentic reasoning framework used as the baseline agent scaffold in DRIFT experiments."
    512     },
    513     {
    514       "title": "Agent Security Bench (ASB): formalizing and benchmarking attacks and defenses in LLM-based agents",
    515       "authors": ["Hanrong Zhang", "Jingyuan Huang", "Kai Mei", "Yifei Yao", "Zhenting Wang", "Chenlu Zhan", "Hongwei Wang", "Yongfeng Zhang"],
    516       "year": 2025,
    517       "relevance": "Secondary evaluation benchmark for LLM agent security with 10 attack scenarios."
    518     }
    519   ],
    520   "engagement_factors": {
    521     "practical_relevance": {
    522       "score": 2,
    523       "justification": "DRIFT is a deployable defense framework with released code applicable to existing LLM agents, but requires non-trivial integration into agentic systems."
    524     },
    525     "surprise_contrarian": {
    526       "score": 1,
    527       "justification": "Shows dynamic policies substantially outperform static ones and that memory isolation matters — useful but not deeply surprising given the motivation."
    528     },
    529     "fear_safety": {
    530       "score": 2,
    531       "justification": "Demonstrates that even GPT-4o has 51.7% ASR when undefended against prompt injection, highlighting real security risks in LLM agentic systems."
    532     },
    533     "drama_conflict": {
    534       "score": 0,
    535       "justification": "No controversy — straightforward defense paper comparing against other methods without contentious claims."
    536     },
    537     "demo_ability": {
    538       "score": 2,
    539       "justification": "Code released on GitHub (SaFoLab-WISC/DRIFT) and benchmarks are publicly available, but there is no pip-installable package or live demo."
    540     },
    541     "brand_recognition": {
    542       "score": 1,
    543       "justification": "Authors from Washington University in St. Louis and Johns Hopkins University — reputable universities but not major AI labs. Uses well-known models (GPT-4o, Claude) in experiments."
    544     }
    545   }
    546 }

Impressum · Datenschutz