ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (33066B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "DRIFT: Dynamic Rule-Based Defense with Injection Isolation for Securing LLM Agents",
      6     "authors": [
      7       "Hao Li",
      8       "Xiaogeng Liu",
      9       "Hung-Chun Chiu",
     10       "Dianqi Li",
     11       "Ning Zhang"
     12     ],
     13     "year": 2025,
     14     "venue": "NeurIPS 2025",
     15     "arxiv_id": "2506.12104",
     16     "doi": "10.48550/arXiv.2506.12104"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Abstract claims (ASR reduced from 30.7% to 1.3%, utility outperforms CaMeL) are supported by Figure 3 and detailed results in Section 3.2. Cross-model generalization claims are supported by Figure 5.",
     24         "source": "opus"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about each component's contribution are supported by the ablation study (Table 1), which uses controlled single-variable manipulation. Each component is added incrementally, isolating its effect.",
     30         "source": "opus"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "The abstract specifies evaluation on 'AgentDojo and ASB benchmark' with specific models. The limitations section (Appendix A) explicitly states 'the benchmark domains are limited and do not fully cover the diverse tasks and attack scenarios encountered in real-world agentic systems.'",
     36         "source": "opus"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No discussion of alternative explanations. For example, the utility improvements could partly stem from additional LLM reasoning calls rather than the security mechanism itself. The paper doesn't consider confounding factors.",
     42         "source": "opus"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Metrics (Benign Utility, Utility Under Attack, ASR) are clearly defined and match the granularity of claims. The paper doesn't inflate 'success on AgentDojo tasks' into broader claims about real-world security beyond what's measured.",
     48         "source": "opus"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Appendix A contains a dedicated 'Limitations' section discussing benchmark domain limitations.",
     56         "source": "opus"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The limitations section states only that 'the benchmark domains are limited and do not fully cover the diverse tasks and attack scenarios encountered in real-world agentic systems.' This is close to boilerplate — it doesn't identify specific threats like benchmark overfitting, GPT-4o-mini's weaknesses affecting generalization, or whether the training data generation pipeline introduces biases.",
     62         "source": "opus"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "The limitations section does not enumerate specific untested scenarios, attack types, deployment contexts, or populations. 'Diverse tasks and attack scenarios' is vague. No equivalent of 'What the evidence does not show.'",
     68         "source": "opus"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Acknowledgments section states: 'This project is partially supported by Schmidt Science AI2050 Early Career Fellow and Open philanthropy.'",
     76         "source": "opus"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Author affiliations clearly listed: Washington University in St. Louis, Johns Hopkins University, and Independent Researchers. No conflicts with evaluated products.",
     82         "source": "opus"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Schmidt Science and Open Philanthropy are independent foundations with no financial stake in the evaluated defense mechanism or the models used.",
     88         "source": "opus"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement is present in the paper. Absence of disclosure is not absence of conflict.",
     94         "source": "opus"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Key terms are defined: 'prompt injection attack' is explained with an example, 'Secure Planner/Dynamic Validator/Injection Isolator' are defined in Section 2, and privilege categories (Read/Write/Execute) are explicitly defined in Section 2.2.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Contributions are explicitly bulleted at the end of the introduction: a new framework (DRIFT) and extensive empirical validation of its effectiveness and adaptability.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Appendix B and Section 1 engage substantively with prior defenses (IsolateGPT, CaMeL, Progent, StruQ), explaining how DRIFT addresses their specific limitations rather than merely listing them.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": true,
    124           "justification": "Code is released at https://github.com/SaFoLab-WISC/DRIFT, stated in the abstract and confirmed in the NeurIPS checklist.",
    125           "source": "opus"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "They use publicly available benchmarks (AgentDojo, ASB) and state 'We will release our DRIFT training dataset' in the NeurIPS checklist. The benchmarks themselves are open-sourced.",
    131           "source": "opus"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. Training details (batch size, optimizer, learning rate) are mentioned in Section 3.1, but no software environment or dependency specifications are given.",
    137           "source": "opus"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "The paper describes implementation details in Section 3.1 but does not provide step-by-step reproduction instructions. Code is released in supplementary materials but the paper lacks a 'Reproducing Results' section.",
    143           "source": "opus"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "All results are point estimates without confidence intervals or error bars. The NeurIPS checklist item 7 explicitly answers 'No' for error bars.",
    151           "source": "opus"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "No statistical significance tests are used. Claims like 'DRIFT outperforms CaMeL by 20.1%' are based on comparing point estimates without any tests.",
    157           "source": "opus"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Effect sizes are given as absolute and relative differences with baseline context: e.g., 'ASR is successfully reduced from 30.7% to 1.3%', 'utility outperforms CaMeL by 20.1% under no attack' (Section 3.2). Both baseline and DRIFT numbers are provided throughout.",
    163           "source": "opus"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No justification for the sample sizes. AgentDojo has 97 user tasks and 629 security test cases; ASB has 10 scenarios. No power analysis or discussion of whether these are sufficient.",
    169           "source": "opus"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "All results appear to be single-run with no standard deviations, variance, or multi-run analysis. The NeurIPS checklist confirms no error bars are reported.",
    175           "source": "opus"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "Multiple baselines compared: repeat_user_prompt, spotlighting_with_delimiting, tool_filter, transformers_pi_detector, CaMeL, Progent on AgentDojo (Figure 3); delimiters_defense, ob_sandwich_defense, instructional_prevention, Progent on ASB (Figure 4).",
    183           "source": "opus"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Baselines include CaMeL (2025) and Progent (2025), both concurrent system-level defenses. Other baselines span 2023-2025.",
    189           "source": "opus"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Table 1 presents a thorough ablation: Native Agent → +Planner → +Planner+Validator → +Planner+Validator+Isolator, plus Isolator alone. Each component's contribution is measured.",
    195           "source": "opus"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": true,
    200           "justification": "Three metrics are used throughout: Benign Utility (task completion without attacks), Utility Under Attack (task completion with attacks), and Attack Success Rate (ASR).",
    201           "source": "opus"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "All evaluation is fully automated using benchmark success criteria. No human evaluation of agent outputs, defense decisions, or utility quality.",
    207           "source": "opus"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Training data is collected from ToolBench (Section 2.5.1) while evaluation uses AgentDojo and ASB, which are completely separate benchmarks.",
    213           "source": "opus"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Appendix D (Tables 6-8) provides per-scenario breakdowns across Banking, Slack, Travel, and Workspace for all models. Figure 6a also shows per-session comparison.",
    219           "source": "opus"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Section C.1 analyzes failure on open-ended tasks (Table 4 shows DRIFT achieves 17.6% vs base agent's 25.7%). Section C.3 provides a detailed case study. Section 3.5 shows where static policies fail (trajectory length ≥3).",
    225           "source": "opus"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Table 1 shows the strict static policy (Planner only) causes severe utility drops (63.55% → 37.71%). Table 4 shows DRIFT reduces open-ended task completion. Claude-3-haiku shows limited utility improvement (Table 6). These negative aspects are reported transparently.",
    231           "source": "opus"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "GPT-4o-mini is specified as 'GPT-4o-mini-2024-07-18' (Section 3.1), but GPT-4o has no snapshot date, Claude-3.5-sonnet and Claude-3-haiku lack version identifiers. Per schema, 'GPT-4o' without a snapshot date does not count.",
    239           "source": "opus"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Full prompt texts are provided in Appendix E (Figures 8-13) for the Secure Planner, Privilege Assignment, Intent Alignment Validation, Injection Detection, Planning Sampling, and Injection Sampling.",
    245           "source": "opus"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Training hyperparameters are reported (batch size 4, 3 epochs, Adam optimizer, lr 2e-5 in Section 3.1). However, LLM API inference settings (temperature, top-p, max tokens) for the online models are not reported.",
    251           "source": "opus"
    252         },
    253         "scaffolding_described": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The agentic scaffolding is described in detail across Sections 2.1-2.4, with workflow diagrams (Figures 1, 2), including the Secure Planner, Dynamic Validator, Injection Isolator, and their interaction with the agent and tools.",
    257           "source": "opus"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": true,
    262           "justification": "Section 2.5.1 describes the full data construction pipeline: ToolBench conversations are modified using GPT-4o-mini for both Planner and Isolator training data. Tool environment reconstruction adds 0-25 extra tools. 1,000 samples collected for each component.",
    263           "source": "opus"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "No raw experimental data (per-task results, logs, model outputs) is released. Only aggregate metrics are presented in figures and tables.",
    271           "source": "opus"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "Section 2.5.1 describes how training data was collected from ToolBench: assistant messages rewritten by GPT-4o-mini, injection content simulated and placed by GPT-4o-mini. Evaluation uses standard benchmarks with described setups.",
    277           "source": "opus"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Data comes from standard benchmarks (AgentDojo, ASB) and ToolBench.",
    283           "source": "opus"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": true,
    288           "justification": "Section 2.5.1 documents the pipeline: ToolBench conversations → GPT-4o-mini rewrites assistant messages → 1,000 Planner samples and 1,000 Isolator samples collected → tool environment augmented with 0-25 extra tools from 10,000+ unique tools across 5,000 samples.",
    289           "source": "opus"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "This paper tests defense mechanisms against prompt injection attacks, not model knowledge or capability on benchmarks. Contamination of benchmark tasks in training data is not the primary concern.",
    297           "source": "opus"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Paper evaluates a defense system, not pre-trained model capabilities. Any contamination would affect all compared methods equally.",
    303           "source": "opus"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": false,
    307           "answer": false,
    308           "justification": "Paper evaluates defense effectiveness, not model knowledge. Benchmark contamination is not structurally relevant to the defense comparison.",
    309           "source": "opus"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants in this study. Evaluation is entirely automated via benchmarks.",
    317           "source": "opus"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "opus"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "opus"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "opus"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "opus"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "opus"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants.",
    353           "source": "opus"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": true,
    360           "justification": "Table 3 reports total token usage for DRIFT (2.37M tokens) and all baselines on AgentDojo, plus an efficiency metric (Utility−ASR)/Total Tokens. DRIFT uses ~1.89× more tokens than the undefended agent.",
    361           "source": "opus"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Token usage for inference is reported in Table 3, but no GPU hours, training time, or total API spend for the LoRA fine-tuning of Qwen2.5-7B-Instruct is provided.",
    367           "source": "opus"
    368         }
    369       },
    370       "experimental_rigor": {
    371         "seed_sensitivity_reported": {
    372           "applies": true,
    373           "answer": false,
    374           "justification": "All results appear to be single-run. No seed sensitivity analysis or multi-seed results are reported.",
    375           "source": "opus"
    376         },
    377         "number_of_runs_stated": {
    378           "applies": true,
    379           "answer": false,
    380           "justification": "The number of experimental runs is never stated. Results are presented as point estimates without indication of how many runs produced them.",
    381           "source": "opus"
    382         },
    383         "hyperparameter_search_budget": {
    384           "applies": true,
    385           "answer": false,
    386           "justification": "No hyperparameter search budget is reported. Training hyperparameters (lr, batch size, epochs) are stated but not how they were selected.",
    387           "source": "opus"
    388         },
    389         "best_config_selection_justified": {
    390           "applies": true,
    391           "answer": false,
    392           "justification": "No explanation of how the final configuration was selected or whether alternative configurations were tried.",
    393           "source": "opus"
    394         },
    395         "multiple_comparison_correction": {
    396           "applies": true,
    397           "answer": false,
    398           "justification": "Multiple comparisons across 5 models, 2 benchmarks, 4 scenarios, and multiple baselines are made without any correction for multiple testing.",
    399           "source": "opus"
    400         },
    401         "self_comparison_bias_addressed": {
    402           "applies": true,
    403           "answer": false,
    404           "justification": "Authors implement and evaluate their own system against baselines without acknowledging potential self-comparison bias. No independent evaluation.",
    405           "source": "opus"
    406         },
    407         "compute_budget_vs_performance": {
    408           "applies": true,
    409           "answer": true,
    410           "justification": "Table 3 compares DRIFT's performance against token cost for all methods, and computes an efficiency metric. DRIFT's cost relative to performance is explicitly analyzed.",
    411           "source": "opus"
    412         },
    413         "benchmark_construct_validity": {
    414           "applies": true,
    415           "answer": false,
    416           "justification": "No discussion of whether AgentDojo or ASB actually measure real-world agent security. The limitations section notes benchmark domains are limited but does not question construct validity of the benchmarks themselves.",
    417           "source": "opus"
    418         },
    419         "scaffold_confound_addressed": {
    420           "applies": true,
    421           "answer": true,
    422           "justification": "Model comparisons in Figure 5 use ReAct as the consistent baseline scaffold across all models. DRIFT is applied as a system-level addition on top of the same base scaffold, controlling for scaffold confounds.",
    423           "source": "opus"
    424         }
    425       },
    426       "data_leakage": {
    427         "temporal_leakage_addressed": {
    428           "applies": true,
    429           "answer": false,
    430           "justification": "No discussion of whether evaluated models may have seen AgentDojo or ASB benchmark tasks during training, or whether the benchmarks were created before model training cutoffs.",
    431           "source": "opus"
    432         },
    433         "feature_leakage_addressed": {
    434           "applies": true,
    435           "answer": false,
    436           "justification": "No discussion of whether the evaluation setup leaks information about attack presence or correct task execution.",
    437           "source": "opus"
    438         },
    439         "non_independence_addressed": {
    440           "applies": true,
    441           "answer": false,
    442           "justification": "No discussion of independence between training data (ToolBench) and evaluation data (AgentDojo), or structural similarities that might bias results.",
    443           "source": "opus"
    444         },
    445         "leakage_detection_method": {
    446           "applies": true,
    447           "answer": false,
    448           "justification": "No leakage detection or prevention method is applied.",
    449           "source": "opus"
    450         }
    451       }
    452     }
    453   },
    454   "claims": [
    455     {
    456       "claim": "DRIFT reduces attack success rate from 30.7% to 1.3% on GPT-4o-mini in AgentDojo while outperforming CaMeL in utility by 21.8% (no attack) and 10.9% (under attack).",
    457       "evidence": "Figure 3 shows the direct comparison across all defenses. Table 1 ablation confirms each component's contribution.",
    458       "supported": "strong"
    459     },
    460     {
    461       "claim": "DRIFT generalizes across five LLMs (GPT-4o, GPT-4o-mini, Claude-3.5-sonnet, Claude-3-haiku, Qwen2.5-7B), reducing ASR to single digits for all models.",
    462       "evidence": "Figure 5 shows per-model results; ASR ranges from 0.0% to 4.4% with DRIFT across all five models.",
    463       "supported": "strong"
    464     },
    465     {
    466       "claim": "Policy-tuned Qwen2.5-7B-Instruct achieves 0.0% ASR while improving utility by 5.6% over the base DRIFT configuration.",
    467       "evidence": "Figure 5 shows Qwen +DRIFT at 0.0% ASR vs. 15.1% for +ReAct, with utility increasing from 26.6% to 32.2%.",
    468       "supported": "strong"
    469     },
    470     {
    471       "claim": "Dynamic policy is essential for complex tasks: static policy utility drops sharply for trajectory length ≥3, while dynamic policy remains stable.",
    472       "evidence": "Figure 6b shows the trajectory-length scaling law; static policy drops below 30% success rate for length ≥4, dynamic policy maintains ~45%.",
    473       "supported": "strong"
    474     },
    475     {
    476       "claim": "The Injection Isolator fills a defense gap not covered by control/data constraints, catching attacks that manipulate final output without deviating the tool trajectory.",
    477       "evidence": "Table 1 shows Isolator alone reduces ASR from 30.67% to 7.95%; Figure 7 demonstrates a case where trajectory is correct but injected output appears.",
    478       "supported": "strong"
    479     },
    480     {
    481       "claim": "DRIFT is robust against adaptive attacks, with only a 0.81% ASR increase under combined isolator+validator adaptive attacks and 0.31% under PAIR.",
    482       "evidence": "Table 2 shows full adaptive attack results; the increases are minor relative to the defended baseline.",
    483       "supported": "moderate"
    484     },
    485     {
    486       "claim": "DRIFT operates at lower token cost than CaMeL (2.37M vs. 6.09M tokens) while achieving better utility-security tradeoff.",
    487       "evidence": "Table 3 reports total tokens and efficiency metric; DRIFT efficiency score is 19.7 vs. CaMeL's 5.8.",
    488       "supported": "strong"
    489     }
    490   ],
    491   "methodology_tags": [
    492     "benchmark-eval"
    493   ],
    494   "key_findings": "DRIFT achieves a substantially better security-utility tradeoff than prior system-level defenses against prompt injection: it reduces attack success rate from 30.7% to 1.3% on AgentDojo while improving utility by 21.8% over the next-best secure method (CaMeL). The dynamic constraint mechanism proves critical for multi-step tasks — static policies degrade sharply at trajectory length ≥3, explaining why prior static defenses sacrifice task completion. A complementary Injection Isolator addresses a distinct attack vector (output manipulation without tool-call deviation) that control/data constraints alone cannot block. Policy fine-tuning on Qwen2.5-7B achieves 0.0% ASR, suggesting that simpler dynamic subtasks (privilege classification, intent alignment) are within reach of smaller models.",
    495   "red_flags": [
    496     {
    497       "flag": "No variance across runs",
    498       "detail": "All results appear to be single-run point estimates. The NeurIPS checklist explicitly answers 'No' to error bars. For stochastic LLM systems, single-run results can vary substantially; without variance estimates, no claim about margins is reliable."
    499     },
    500     {
    501       "flag": "Most model versions unpinned",
    502       "detail": "Only GPT-4o-mini has a specific version (2024-07-18). GPT-4o, Claude-3.5-sonnet, and Claude-3-haiku are listed without snapshot dates, making exact reproduction impossible as these models are updated."
    503     },
    504     {
    505       "flag": "Inference hyperparameters unreported",
    506       "detail": "Temperature, top-p, and max-token settings for all LLM calls (Secure Planner, Dynamic Validator, Injection Isolator, agent) are not reported, affecting reproducibility."
    507     },
    508     {
    509       "flag": "Benchmark contamination unaddressed",
    510       "detail": "AgentDojo (NeurIPS 2024) is publicly available and may be in the training corpora of GPT-4o and Claude models evaluated here; the paper does not discuss this potential confound."
    511     },
    512     {
    513       "flag": "Training data not released",
    514       "detail": "The 1,000-sample Planner training dataset and 1,000-sample Isolator dataset are only promised for future release ('will release') at submission time and are not yet available, so the fine-tuned Qwen model results cannot be reproduced."
    515     }
    516   ],
    517   "cited_papers": [
    518     {
    519       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    520       "relevance": "Primary evaluation benchmark; provides the AgentDojo scenarios (banking, Slack, travel, workspace) used throughout the paper."
    521     },
    522     {
    523       "title": "Defeating Prompt Injections by Design (CaMeL)",
    524       "relevance": "Key static policy-based defense baseline that DRIFT directly improves upon in both security and utility."
    525     },
    526     {
    527       "title": "Progent: Programmable Privilege Control for LLM Agents",
    528       "relevance": "Concurrent dynamic policy defense; extensively compared against in main results and Appendix C.2."
    529     },
    530     {
    531       "title": "IsolateGPT: An Execution Isolation Architecture for LLM-Based Agentic Systems",
    532       "relevance": "Prior isolation-based defense that motivates the Injection Isolator design."
    533     },
    534     {
    535       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated LLM Agents",
    536       "relevance": "Characterizes the prompt injection attack surface that DRIFT defends against."
    537     },
    538     {
    539       "title": "ToolBench: Facilitating Large Language Models to Master 16000+ Real-World APIs",
    540       "relevance": "Source dataset used to construct DRIFT's policy training data for the fine-tuned Qwen model."
    541     },
    542     {
    543       "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-Based Agents",
    544       "relevance": "Second evaluation benchmark used to validate DRIFT's generalization beyond AgentDojo."
    545     },
    546     {
    547       "title": "Jailbreaking Black Box Large Language Models in Twenty Queries (PAIR)",
    548       "relevance": "Advanced adaptive attack used in the stress test (Section 3.6) to evaluate DRIFT's robustness."
    549     },
    550     {
    551       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    552       "relevance": "Baseline agent framework compared against in all multi-model experiments."
    553     }
    554   ],
    555   "engagement_factors": {
    556     "practical_relevance": {
    557       "score": 2,
    558       "justification": "DRIFT is a deployable defense framework with released code applicable to existing LLM agents, but requires non-trivial integration into agentic systems."
    559     },
    560     "surprise_contrarian": {
    561       "score": 1,
    562       "justification": "Shows dynamic policies substantially outperform static ones and that memory isolation matters — useful but not deeply surprising given the motivation."
    563     },
    564     "fear_safety": {
    565       "score": 2,
    566       "justification": "Demonstrates that even GPT-4o has 51.7% ASR when undefended against prompt injection, highlighting real security risks in LLM agentic systems."
    567     },
    568     "drama_conflict": {
    569       "score": 0,
    570       "justification": "No controversy — straightforward defense paper comparing against other methods without contentious claims."
    571     },
    572     "demo_ability": {
    573       "score": 2,
    574       "justification": "Code released on GitHub (SaFoLab-WISC/DRIFT) and benchmarks are publicly available, but there is no pip-installable package or live demo."
    575     },
    576     "brand_recognition": {
    577       "score": 1,
    578       "justification": "Authors from Washington University in St. Louis and Johns Hopkins University — reputable universities but not major AI labs. Uses well-known models (GPT-4o, Claude) in experiments."
    579     }
    580   },
    581   "hn_data": {
    582     "threads": [
    583       {
    584         "hn_id": "44770561",
    585         "title": "B-Splines and Fourier-Best Friends for Spatial-Temporal Video Super-Resolution",
    586         "points": 4,
    587         "comments": 0,
    588         "url": "https://news.ycombinator.com/item?id=44770561"
    589       },
    590       {
    591         "hn_id": "47002668",
    592         "title": "LLMs exceed physicians on complex text-based differential diagnosis",
    593         "points": 3,
    594         "comments": 2,
    595         "url": "https://news.ycombinator.com/item?id=47002668"
    596       },
    597       {
    598         "hn_id": "45534337",
    599         "title": "Advancing medical artificial intelligence using a century of cases",
    600         "points": 3,
    601         "comments": 1,
    602         "url": "https://news.ycombinator.com/item?id=45534337"
    603       },
    604       {
    605         "hn_id": "43401539",
    606         "title": "CriteoPrivateAd: RealWorld Bidding Dataset to Design Private Advertising Systems",
    607         "points": 2,
    608         "comments": 1,
    609         "url": "https://news.ycombinator.com/item?id=43401539"
    610       },
    611       {
    612         "hn_id": "31894669",
    613         "title": "Protecting President Zelenskyy Against Deep Fakes",
    614         "points": 2,
    615         "comments": 0,
    616         "url": "https://news.ycombinator.com/item?id=31894669"
    617       },
    618       {
    619         "hn_id": "27612994",
    620         "title": "LegoFormer: Transformers for Block-by-Block Multi-View 3D Reconstruction",
    621         "points": 2,
    622         "comments": 0,
    623         "url": "https://news.ycombinator.com/item?id=27612994"
    624       },
    625       {
    626         "hn_id": "44971660",
    627         "title": "Scaling laws found in large generative medical event models",
    628         "points": 1,
    629         "comments": 0,
    630         "url": "https://news.ycombinator.com/item?id=44971660"
    631       },
    632       {
    633         "hn_id": "41227450",
    634         "title": "Τ-Bench: A Benchmark for Tool-Agent-User Interaction in Real-World Domains",
    635         "points": 1,
    636         "comments": 0,
    637         "url": "https://news.ycombinator.com/item?id=41227450"
    638       },
    639       {
    640         "hn_id": "40782080",
    641         "title": "Should AI optimize your code? A studio",
    642         "points": 1,
    643         "comments": 0,
    644         "url": "https://news.ycombinator.com/item?id=40782080"
    645       },
    646       {
    647         "hn_id": "28895006",
    648         "title": "IQ-Learn: Inverse Soft-Q Learning for Imitation",
    649         "points": 1,
    650         "comments": 0,
    651         "url": "https://news.ycombinator.com/item?id=28895006"
    652       }
    653     ],
    654     "top_points": 4,
    655     "total_points": 20,
    656     "total_comments": 4
    657   }
    658 }

Impressum · Datenschutz