scan.json (21001B)
1 { 2 "paper": { 3 "title": "AGENTVIGIL: Generic Black-Box Red-teaming for Indirect Prompt Injection against LLM Agents", 4 "authors": ["Zhun Wang", "Vincent Siu", "Zhe Ye", "Tianneng Shi", "Yuzhou Nie", "Xuandong Zhao", "Chenguang Wang", "Wenbo Guo", "Dawn Song"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2505.05849" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "No repository URL or code archive link is provided in the paper. No mention of code release." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses two public benchmarks: AgentDojo and VWA-adv, both publicly available. The initial corpus templates are described but not explicitly released." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment setup section is provided. Model checkpoints are listed in Appendix A but no software dependencies." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method is described algorithmically but without runnable reproduction guidance." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "All results are reported as point estimates (e.g., 71%, 70% success rates) with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims AGENTVIGIL outperforms baselines (e.g., 'nearly doubling the performance') based solely on comparing success rate numbers with no statistical tests." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "The paper reports absolute success rates with baseline context throughout (e.g., baseline 38% vs AGENTVIGIL 71%; baseline 36% vs 70%), providing enough context to assess magnitude." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The fuzzing and test sets are 142 and 173 tasks for AgentDojo, 99 and 100 for VWA-adv. No justification for these sizes or power analysis is provided." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or spread across runs is reported. Results appear to be single-run numbers." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares against handcrafted baseline attacks from AgentDojo and VWA-adv, plus OpenPromptInjection and InjecAgent baselines in Appendix B.2." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include AgentDojo (2024), VWA-adv (2024), OpenPromptInjection (2024), and InjecAgent (2024), all contemporary." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "Section 5.3 presents ablation removing (1) the initial corpus and (2) adaptive seed scoring + MCTS-based selection, shown in Figure 3." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper reports both attack success rate (ASR) and coverage as metrics, plus evaluates transferability across models and effectiveness against defenses." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of attack quality or outputs is included. All evaluation is automated via benchmark frameworks. 
The paper makes claims about real-world applicability that could benefit from human assessment." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Tasks are explicitly split into fuzzing sets and test sets (142/173 for AgentDojo, 99/100 for VWA-adv) with transferability evaluated on the held-out test set." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Appendix B.3 (Table 5) provides per-suite breakdowns for AgentDojo (Slack, Workspace, Travel, Banking) and per-goal breakdowns for VWA-adv (illusioning vs goal misdirection)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The paper discusses that attacks are ineffective against Claude-3.5-Sonnet, that defenses reduce success rates, and that VWA-adv defenses cause AGENTVIGIL to converge with baseline." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports that AGENTVIGIL performs worse than baseline against Claude-3.5-Sonnet on AgentDojo (0.04 vs 0.08), and that defenses on VWA-adv cause degradation to baseline levels." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims of 71% and 70% success rates on AgentDojo and VWA-adv are supported by Tables 1-3 and Figures 3-4. The 'nearly doubling' claim is supported (38%→71%, 36%→70%)." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The ablation study in Section 5.3 provides controlled single-variable manipulation to support claims about which components matter (initial corpus, seed scoring/selection)." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title claims 'Generic Black-Box Red-teaming' but results are on only two benchmarks. 
The real-world case study is a single scenario on a local WebArena deployment. The paper does not bound its generalization claims." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper does not discuss alternative explanations for why AGENTVIGIL outperforms baselines. For example, the improvement could stem primarily from the curated initial corpus rather than the fuzzing framework itself. No threats-to-validity or confound discussion is present." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "Appendix A lists exact checkpoints: o3-mini-2024-12-17, gpt-4o-mini-2024-07-18, gpt-4o-2024-08-06, claude-3-5-sonnet-20241022, gemini-2.0-flash-exp." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The mutation prompt templates (Shorten, Expand, Rephrase, Crossover, GenerateSimilar) and the initial corpus templates are described in natural language but the actual prompt text is not provided." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Key hyperparameters are reported: 10 fuzzing iterations, 3 mutated prompts per iteration (AgentDojo), 10 per iteration (VWA-adv), top-5 seeds for transfer, quarter sampling of tasks. Coverage factor C and exploration factor C in algorithms are named but values not specified." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The fuzzing framework architecture is described in detail in Section 4, including seed selection (MCTS), scoring, mutation, and the iterative loop. Algorithms 1-3 formalize the process." 
148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section 5.1 and 5.2 describe how tasks are randomly split into fuzzing/test sets with specific counts (142/173 and 99/100), and how quarter-sampling is used during fuzzing." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": false, 159 "justification": "There is no dedicated limitations or threats-to-validity section. The Impact Statement discusses ethical considerations but not methodological limitations." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "No specific threats to validity are discussed anywhere in the paper." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what it does NOT show. The threat model (Section 3) defines scope of the attacker but not limitations of the evaluation or claims." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "No raw experimental data (individual task results, generated adversarial prompts, logs) is made available." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section 4.2 describes corpus collection from human heuristics, online resources, and existing research. Sections 5.1-5.2 describe the benchmark setup and task splitting." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants; data comes from standard benchmarks (AgentDojo, VWA-adv)." 
187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline from corpus collection → seed generation → fuzzing → evaluation is documented in Section 4 and the experimental setup sections, with task counts at each stage." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section is present in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed with numbered superscripts, though the specific institutions are not spelled out in the extracted text. Authors are from UC Berkeley (Dawn Song's group) and other institutions." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is disclosed, so independence cannot be assessed." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This paper tests attack effectiveness against LLM agents, not model capability on benchmarks. The models are targets being attacked, not being evaluated for their knowledge." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not applicable — the paper evaluates attack success against agents, not model benchmark performance where training data contamination would be relevant." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not applicable — the benchmarks test agent security vulnerabilities, not model knowledge that could be contaminated." 
231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "No API costs, token counts, or wall-clock time for the fuzzing process are reported, despite the framework making many API calls across iterations." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget (API spend, GPU hours, total time) is stated despite the iterative fuzzing requiring substantial API usage." 
280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "AGENTVIGIL achieves 71% attack success rate on AgentDojo against o3-mini agents, nearly doubling the baseline's 38%.", 286 "evidence": "Table 1 and Section 5.1 fuzzing results show 71% ASR on fuzzing set vs 38% baseline.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "AGENTVIGIL achieves 70% success rate on VWA-adv against GPT-4o agents, compared to 36% baseline.", 291 "evidence": "Figure 4 and Section 5.2 show 70% on fuzzing set vs 36% baseline. Table 1, however, reports 60% ASR on the fuzzing set (discrepancy: the 70% figure appears to be coverage, whereas 60% is the ASR in Table 1).", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Generated adversarial prompts exhibit strong transferability across unseen tasks and LLMs.", 296 "evidence": "Table 1 test set results show 65% on unseen tasks for o3-mini, 67% on Gemini-2-flash-exp (unseen LLM). However, transfer to Claude-3.5-Sonnet is poor (4% on AgentDojo).", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "All three core components (initial corpus, seed scoring, MCTS selection) are critical to performance.", 301 "evidence": "Ablation study in Section 5.3 and Figure 3 shows degraded coverage when each is removed.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "AGENTVIGIL can successfully attack agents in real-world environments, misleading them to navigate to arbitrary URLs.", 306 "evidence": "Section 5.4 describes a single case study on a local WebArena deployment with one attack scenario.", 307 "supported": "weak" 308 } 309 ], 310 "methodology_tags": ["benchmark-eval", "case-study"], 311 "key_findings": "AGENTVIGIL is a black-box fuzzing framework for indirect prompt injection attacks against LLM agents, combining curated initial seed corpora, adaptive scoring, and MCTS-based seed selection. It achieves 71% and 60-70% attack success rates on AgentDojo (o3-mini) and VWA-adv (GPT-4o) respectively, roughly doubling baseline handcrafted attacks. 
Generated adversarial prompts transfer across unseen tasks and models (except Claude-3.5-Sonnet), but existing defenses can reduce the advantage to near-baseline levels on VWA-adv.", 312 "red_flags": [ 313 { 314 "flag": "No uncertainty quantification", 315 "detail": "All results are single-run point estimates with no error bars, confidence intervals, or variance across runs. Given the stochastic nature of both LLM outputs and the fuzzing process, this is a significant omission." 316 }, 317 { 318 "flag": "No limitations section", 319 "detail": "The paper lacks any limitations or threats-to-validity discussion despite making broad claims about generic applicability." 320 }, 321 { 322 "flag": "Overly broad title claim", 323 "detail": "The title claims 'Generic Black-Box Red-teaming' but the framework is evaluated on only two benchmarks and one real-world scenario (local WebArena). Transfer to Claude fails, undermining the genericity claim." 324 }, 325 { 326 "flag": "No cost reporting", 327 "detail": "The fuzzing process involves many LLM API calls across iterations but no cost or time information is reported, making practical applicability hard to assess." 328 }, 329 { 330 "flag": "Inconsistent VWA-adv numbers", 331 "detail": "The abstract and Section 5.2 claim 70% success on VWA-adv but Table 1 shows 60% ASR on the fuzzing set for GPT-4o. The 70% may refer to coverage rather than ASR, which is not clearly distinguished." 332 } 333 ], 334 "cited_papers": [ 335 { 336 "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents", 337 "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramer"], 338 "year": 2024, 339 "arxiv_id": "2406.13352", 340 "relevance": "Primary benchmark used for evaluating indirect prompt injection attacks against personal assistant agents." 
341 }, 342 { 343 "title": "Dissecting Adversarial Robustness of Multimodal LM Agents", 344 "authors": ["Chih-Hui Wu", "Rushil R. Shah", "Jing Yu Koh", "Ruslan Salakhutdinov", "Daniel Fried", "Aditi Raghunathan"], 345 "year": 2024, 346 "relevance": "Second benchmark (VWA-adv) used for evaluating attacks against web agents with multimodal input." 347 }, 348 { 349 "title": "GPTFuzzer: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts", 350 "authors": ["Jiahao Yu", "Xingwei Lin", "Xinyu Xing"], 351 "year": 2023, 352 "arxiv_id": "2309.10253", 353 "relevance": "Key prior work applying fuzzing to direct prompt injection/jailbreaking; AGENTVIGIL extends this to indirect injection in agents." 354 }, 355 { 356 "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions", 357 "authors": ["Eric Wallace", "Kai Xiao", "Reimar Leike", "Lilian Weng", "Johannes Heidecke", "Alex Beutel"], 358 "year": 2024, 359 "arxiv_id": "2404.13208", 360 "relevance": "Defense mechanism evaluated in the paper; results suggest instruction hierarchy is insufficient against AGENTVIGIL attacks." 361 }, 362 { 363 "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses", 364 "authors": ["Yi Liu", "Yujia Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 365 "year": 2024, 366 "relevance": "Provides formalization of prompt injection attacks and baseline used for comparison (OpenPromptInjection)." 367 }, 368 { 369 "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents", 370 "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"], 371 "year": 2024, 372 "arxiv_id": "2403.02691", 373 "relevance": "Benchmark for indirect prompt injection in tool-integrated agents; used as additional baseline comparison." 
374 }, 375 { 376 "title": "AgentPoison: Red-teaming LLM Agents via Poisoning Memory or Knowledge Bases", 377 "authors": ["Zhaorun Chen", "Zhen Xiang", "Chaowei Xiao", "Dawn Song", "Bo Li"], 378 "year": 2024, 379 "arxiv_id": "2407.12784", 380 "relevance": "Related attack method that poisons agent memory/knowledge bases; requires white-box access unlike AGENTVIGIL." 381 }, 382 { 383 "title": "Defending Against Indirect Prompt Injection Attacks with Spotlighting", 384 "authors": ["Keegan Hines", "Gary Lopez", "Matthew Hall", "Federico Zarfati", "Yonatan Zunger", "Emre Kiciman"], 385 "year": 2024, 386 "arxiv_id": "2403.14720", 387 "relevance": "Defense strategy (delimiter-based spotlighting) evaluated against AGENTVIGIL's attacks." 388 }, 389 { 390 "title": "PromptFuzz: Harnessing Fuzzing Techniques for Robust Testing of Prompt Injection in LLMs", 391 "authors": ["Jiahao Yu", "Yuhang Shao", "Hanwen Miao", "Jianwei Shi", "Xinyu Xing"], 392 "year": 2024, 393 "arxiv_id": "2409.14729", 394 "relevance": "Related fuzzing approach for testing prompt injection robustness in LLMs." 395 }, 396 { 397 "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents", 398 "authors": ["Shuyan Zhou", "Frank F. Xu", "Hao Zhu", "Xuhui Zhou", "Robert Lo", "Abishek Sridhar", "Xianyi Cheng", "Tianyue Ou", "Yonatan Bisk", "Daniel Fried"], 399 "year": 2023, 400 "arxiv_id": "2307.13854", 401 "relevance": "Provides the real-world web environment used for the case study evaluation in Section 5.4." 402 } 403 ] 404 }