ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29920B)


      1 {
      2   "paper": {
      3     "title": "AGENTVIGIL: Generic Black-Box Red-teaming for Indirect Prompt Injection against LLM Agents",
      4     "authors": [
      5       "Zhun Wang",
      6       "Vincent Siu",
      7       "Zhe Ye",
      8       "Tianneng Shi",
      9       "Yuzhou Nie",
     10       "Xuandong Zhao",
     11       "Chenguang Wang",
     12       "Wenbo Guo",
     13       "Dawn Song"
     14     ],
     15     "year": 2025,
     16     "venue": "Preprint",
     17     "arxiv_id": "2505.05849"
     18   },
     19   "checklist": {
     20     "artifacts": {
     21       "code_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No repository URL or code release is mentioned in the paper. The paper describes algorithms in pseudocode (Appendix C) but there is no GitHub link, Zenodo archive, or any other artifact URL provided."
     25       },
     26       "data_released": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "The paper evaluates on two public benchmarks: AgentDojo (Debenedetti et al., 2024) and VWA-adv (Wu et al., 2024b). Both are publicly available. The paper also uses WebArena's shopping website for the real-world case study, which is open-source (magento2). No novel dataset was created that is unreleased."
     30       },
     31       "environment_specified": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No requirements.txt, Dockerfile, or dependency specification is provided. The paper mentions using GPT-4o-mini as the helper model for mutation and lists specific model checkpoints (Appendix A), but does not describe the software environment needed to reproduce the experiments."
     35       },
     36       "reproduction_instructions": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No step-by-step reproduction instructions, README, or reproduction scripts are provided. The algorithms in Appendix C describe the logic but not how to run the actual experiments."
     40       }
     41     },
     42     "statistical_methodology": {
     43       "confidence_intervals_or_error_bars": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "All results are reported as point estimates (e.g., '71% success rate', '70% success rate'). No confidence intervals, standard deviations, or error bars are provided for any result in Tables 1-5 or in Figure 3."
     47       },
     48       "significance_tests": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No statistical significance tests are used. The paper compares AGENTVIGIL against baselines by directly comparing point-estimate success rates without any formal statistical testing (e.g., p-values, bootstrap tests)."
     52       },
     53       "effect_sizes_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "The paper reports percentage improvement with baseline context: '71% success rate' vs '38% baseline' with 'nearly a 100% improvement over the baseline attacks' (Section 5.1). Similar reporting for VWA-adv (70% vs 36%). Per the schema description, 'A paper that says 12% improvement over baseline (from 45% to 57%) provides enough context for YES.' The paper matches this pattern."
     57       },
     58       "sample_size_justified": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper uses a 142/173 task split for fuzzing/test in AgentDojo and a 99/100 split for VWA-adv. The 'randomly dividing' approach is described, but neither a justification for these split sizes nor a power analysis is provided."
     62       },
     63       "variance_reported": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "No variance, standard deviation, or spread measures are reported. All results in Tables 1-5 are single point estimates. There is no mention of multiple runs with aggregated statistics."
     67       }
     68     },
     69     "evaluation_design": {
     70       "baselines_included": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper uses handcrafted adversarial prompts from AgentDojo and VWA-adv as baselines (Section 5.1, 5.2). In Appendix B.2, additional baselines from OpenPromptInjection (Liu et al., 2024) and InjecAgent (Zhan et al., 2024) are included."
     74       },
     75       "baselines_contemporary": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The baselines used (AgentDojo from 2024, VWA-adv from 2024, OpenPromptInjection from 2024, InjecAgent from 2024) are contemporary and represent relevant prior work. GPTFuzzer (Yu et al., 2023) is discussed but correctly positioned as a related but different approach (direct vs. indirect injection)."
     79       },
     80       "ablation_study": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Section 5.3 presents an ablation study isolating the contribution of three core components: (1) the initial corpus, (2) adaptive seed scoring strategy, and (3) MCTS-based seed selection. Results are shown in Figure 3."
     84       },
     85       "multiple_metrics": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "The paper reports attack success rate (ASR) as the primary metric and uses attack coverage as an additional metric (shown in Figures 3 and 4). Utility scores for open-source models are also reported in Appendix B.1."
     89       },
     90       "human_evaluation": {
     91         "applies": false,
     92         "answer": false,
     93         "justification": "This is an adversarial ML security paper evaluating automated attack success. Human evaluation of model outputs is not relevant to the core claims about attack success rates, which are measured by automated benchmark evaluation criteria."
     94       },
     95       "held_out_test_set": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper explicitly separates tasks into fuzzing sets (used for optimization) and test sets (held-out for final evaluation): 142/173 for AgentDojo and 99/100 for VWA-adv. Transferability is evaluated on the test set to avoid overfitting to the fuzzing set."
     99       },
    100       "per_category_breakdown": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Appendix B.3 (Table 5) provides per-scenario breakdowns for AgentDojo (Slack, Workspace, Travel, Banking) and VWA-adv (Illusioning, Goal misdirection). Per-model results are shown throughout Table 1."
    104       },
    105       "failure_cases_discussed": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper discusses cases where attacks fail, notably that AGENTVIGIL performs poorly against Claude-3.5-Sonnet ('both the baseline and AGENTVIGIL's prompts are ineffective against Claude-3.5-Sonnet, as it demonstrates strong robustness'). Performance degradation against defenses is also discussed in Sections 5.1 and 5.2."
    109       },
    110       "negative_results_reported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "The paper reports that AGENTVIGIL's optimized prompts do not transfer well to Claude-3.5-Sonnet (Section 5.1: 0.03-0.04 success rate), and that when defenses are applied in VWA-adv, 'AGENTVIGIL's performance declines and converges with the baseline' (Section 5.2). These are genuine negative results."
    114       }
    115     },
    116     "claims_and_evidence": {
    117       "abstract_claims_supported": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The abstract claims 71% and 70% success rates on AgentDojo and VWA-adv respectively, 'nearly doubling the performance of baseline attacks' — all supported by Tables 1-3 and Figures 3-4. The claims about transferability to unseen tasks and 'promising results against defenses' are supported, though somewhat hedged in the body text."
    121       },
    122       "causal_claims_justified": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The ablation study (Section 5.3) uses controlled single-variable manipulation to support causal claims about which components contribute to performance. The ablation substitutes individual components while keeping others constant, which is adequate for causal inference about the system's components."
    126       },
    127       "generalization_bounded": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The abstract states AGENTVIGIL is 'generic' and applicable 'across diverse LLM agents', but testing is limited to two benchmarks (AgentDojo for personal assistants, VWA-adv for web agents) and a single real-world case study. The title's claim of 'generic black-box red-teaming' overstates scope — only English-language text-based attacks are tested, and only specific types of injection goals are covered."
    131       },
    132       "alternative_explanations_discussed": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper does not discuss alternative explanations for its results. For example, when discussing why AGENTVIGIL's prompts do not transfer to Claude ('Claude is more vulnerable to simpler adversarial prompts'), this is presented as speculation without evidence or alternative considerations. No systematic threats-to-validity section exists."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix A explicitly lists all model checkpoints: o3-mini (o3-mini-2024-12-17), GPT-4o-mini (gpt-4o-mini-2024-07-18), GPT-4o (gpt-4o-2024-08-06), Claude-3.5-Sonnet (claude-3-5-sonnet-20241022), Gemini-2-flash-exp (gemini-2.0-flash-exp). Specific version IDs are provided."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The paper describes the corpus of adversarial prompt templates as using 'placeholders to accommodate different variables' (Section 4.2) but does not provide the actual template text in the paper or appendix. The mutation prompts sent to the helper LLM are described in natural language only ('Shorten compresses the seed for conciseness', etc.) without providing the actual prompt text."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "LLM inference hyperparameters (temperature, top-p, etc.) are not reported for any of the models used. The fuzzing hyperparameters such as number of iterations (10) and seeds selected (5 top-scoring seeds) are mentioned, but LLM API parameters are absent."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "AGENTVIGIL attacks agents that are third-party black-box systems (AgentDojo's agent, VWA-adv's agent, WebArena's default agent). The paper cannot describe the internal scaffolding of these external agent systems. AGENTVIGIL itself is the attack framework, not the agent scaffold."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The initial corpus collection is described qualitatively ('collected from human heuristics, online resources, existing research'), but the number of initial templates, specific filtering criteria, and exact sources are not documented in sufficient detail to reproduce the corpus."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "There is an 'Impact Statement' section (before References) but no dedicated Limitations or Threats to Validity section. The Impact Statement discusses dual-use concerns but not methodological limitations of the evaluation."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats-to-validity section exists. The impact statement contains generic statements about ongoing research needs but does not address specific threats to the validity of the experimental results, such as benchmark selection bias, limited number of injection tasks, or model version sensitivity."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The threat model (Section 3) excludes 'misuse of agents to perform harmful actions, and direct attacks on the underlying infrastructure' but this defines attacker capabilities rather than result scope. The paper does not explicitly state what the results do NOT show — e.g., it does not say the attack would not generalize to non-English agents, non-text modalities, or other agent architectures beyond the two benchmarks."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "The generated adversarial prompts and raw attack success logs are not released. The paper reports aggregate success rates in tables but provides no way to verify the underlying per-task results."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 4.2 describes how the initial corpus was collected ('from human heuristics, online resources, existing prompt injection research'). The benchmark task splits are described with specific counts (142/173 for AgentDojo, 99/100 for VWA-adv)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "This paper has no human participants. The evaluation uses automated benchmarks with pre-defined tasks; no recruitment of human participants was involved."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The fuzzing pipeline is documented in detail through Algorithms 1-3 (Appendix C) and Figure 2. The flow from corpus collection → seed selection → mutation → scoring → iteration is clearly described with pseudocode."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source, acknowledgments section, or grant information is present in the paper. As a preprint from UC Berkeley, Washington University, and UC Santa Barbara, funding would be expected but is not disclosed."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly listed on the first page: affiliations 1 (UC Berkeley), 2 (Washington University, Saint Louis), and 3 (UC Santa Barbara) are explicitly stated for each author."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "Because no funding is disclosed, it is impossible to assess funder independence. This is treated as NO under the schema convention that absence of disclosure is not equivalent to absence of conflict."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "There is no competing interests statement, patent disclosure, or financial interests declaration in the paper. The paper attacks commercial products (GPT-4o, Claude, Gemini) from a university setting, but no formal COI declaration is made."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper uses several LLMs (o3-mini, GPT-4o, Claude-3.5-Sonnet, Gemini-2-flash-exp) and specifies model checkpoints but does not state the training data cutoff dates for these models. The AgentDojo and VWA-adv benchmarks could potentially appear in training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The paper does not discuss whether the AgentDojo or VWA-adv benchmark tasks appeared in the training data of the evaluated LLMs. This is relevant since both benchmarks are from 2024 and some models' training cutoffs may include these datasets."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "AgentDojo was published in 2024 (arXiv:2406.13352) and VWA-adv in 2024. Models like GPT-4o (training cutoff approximately April 2024) and Claude-3.5-Sonnet may have been trained on data that includes these benchmarks. This contamination risk is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "This paper has no human participants. All evaluation is automated using benchmark frameworks."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The paper does discuss ethical considerations for the real-world case study ('Due to ethical considerations, we use a local copy'), but this is not an IRB-reviewed human subjects study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "The paper uses GPT-4o, o3-mini, and GPT-4o-mini extensively across hundreds of tasks and multiple fuzzing iterations, but no API costs, token counts, or latency figures are reported. The paper mentions that some defenses were excluded due to 'high computational costs' but does not quantify AGENTVIGIL's own costs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Total computational budget is not stated. The paper notes that '10 fuzzing iterations' are conducted on 142+ tasks with multiple mutations per iteration (3 or 10), implying hundreds of LLM API calls per experiment, but no total cost or compute budget is reported."
    290       }
    291     }
    292   },
    293   "claims": [
    294     {
    295       "claim": "AGENTVIGIL achieves 71% and 70% attack success rates on AgentDojo and VWA-adv respectively, nearly doubling baseline attack performance.",
    296       "evidence": "Table 1 and Figure 3 show 71% ASR on AgentDojo fuzzing set (o3-mini) vs. 38% baseline, and Figure 4/Table 1 show 70% on VWA-adv fuzzing set (GPT-4o) vs. 36% baseline. Section 5.1 and 5.2.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Adversarial prompts generated by AGENTVIGIL exhibit strong transferability to unseen tasks, achieving 65% and 59% success rates on o3-mini and GPT-4o respectively.",
    301       "evidence": "Table 1, test-set rows: AGENTVIGIL achieves 0.65 on o3-mini in AgentDojo and 0.59 on GPT-4o in VWA-adv. Sections 5.1 and 5.2.",
    302       "supported": "strong"
    303     },
    304     {
    305       "claim": "AGENTVIGIL demonstrates strong transferability across unseen LLMs, achieving 67% success against Gemini-2-flash-exp (not used during fuzzing).",
    306       "evidence": "Table 1, VWA-adv test set row shows AGENTVIGIL achieves 0.67 against Gemini-2-flash-exp, compared to 0.50 for baseline. Section 5.2.",
    307       "supported": "strong"
    308     },
    309     {
    310       "claim": "Existing defenses (pi_detector, repeat, delimit, safety, paraphrase) are insufficient to stop AGENTVIGIL attacks.",
    311       "evidence": "Tables 2 and 3 show AGENTVIGIL maintains 12-49% ASR across defenses vs. 0-36% for baseline. Section 5.1 and 5.2.",
    312       "supported": "moderate"
    313     },
    314     {
    315       "claim": "All three components of AGENTVIGIL (initial corpus, adaptive seed scoring, MCTS-based seed selection) are necessary for performance.",
    316       "evidence": "Section 5.3 ablation study removes each component and shows reduced coverage in Figure 3. Without initial corpus, performance plateaus after ~4 iterations; without MCTS/scoring, improvement is markedly slower.",
    317       "supported": "moderate"
    318     },
    319     {
    320       "claim": "AGENTVIGIL can successfully mislead real-world web agents to navigate to arbitrary URLs including malicious phishing sites.",
    321       "evidence": "Section 5.4 real-world case study deploys attack on WebArena/magento2 shopping website; Figure 1 shows successful URL redirection. However, tested only on a locally hosted copy, not a live production system.",
    322       "supported": "moderate"
    323     }
    324   ],
    325   "methodology_tags": [
    326     "benchmark-eval",
    327     "case-study"
    328   ],
    329   "key_findings": "AGENTVIGIL is a black-box fuzzing framework for indirect prompt injection attacks on LLM agents that uses Monte Carlo Tree Search (MCTS) for seed selection, adaptive coverage-guided scoring, and semantically diverse mutations. On two established benchmarks (AgentDojo and VWA-adv), AGENTVIGIL achieves 71% and 70% attack success rates respectively, nearly doubling the performance of handcrafted baseline attacks. The generated adversarial prompts exhibit strong transferability to unseen tasks and LLMs (including Gemini-2-flash-exp not used in fuzzing), but do not transfer well to Claude-3.5-Sonnet which appears more robust to complex adversarial prompts. Existing defenses (delimiter-based, classifier-based, repetition-based) reduce but do not eliminate attack success, highlighting persistent vulnerabilities in current agent security measures.",
    330   "red_flags": [
    331     {
    332       "flag": "No uncertainty quantification",
    333       "detail": "All results in Tables 1-5 are single point estimates with no standard deviations, confidence intervals, or error bars. Given that fuzzing involves random seed sampling and mutation, variance across runs could be substantial. The paper claims 'nearly doubling' performance without any statistical testing to establish this is not within noise."
    334     },
    335     {
    336       "flag": "Benchmark contamination unaddressed",
    337       "detail": "AgentDojo (arXiv:2406.13352, June 2024) and VWA-adv (2024) are relatively recent benchmarks. Models like GPT-4o (gpt-4o-2024-08-06) and Claude-3.5-Sonnet (claude-3-5-sonnet-20241022) have training data that may include these benchmark descriptions, potentially affecting the validity of results. This risk is not discussed."
    338     },
    339     {
    340       "flag": "No code or prompt artifacts released",
    341       "detail": "The paper does not release the initial adversarial corpus, the mutation prompt templates, or experimental code. The MCTS algorithms are described in pseudocode but the actual implementation with specific prompts cannot be reproduced, making independent replication impossible."
    342     },
    343     {
    344       "flag": "Genericity claim overstated",
    345       "detail": "The paper claims AGENTVIGIL is a 'generic' framework applicable to 'diverse LLM agents', but validation is limited to two benchmarks with specific agent types (personal assistants and web agents) in English. The real-world case study uses a single locally hosted instance. The title's genericity claim is not fully supported."
    346     },
    347     {
    348       "flag": "No limitations section",
    349       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. The Impact Statement focuses on dual-use concerns rather than methodological limitations. Important scope constraints (e.g., attack effectiveness may be lower on production systems with rate limiting, the 10-minute attack window assumption, etc.) are not discussed."
    350     },
    351     {
    352       "flag": "Random task sampling without stability check",
    353       "detail": "During fuzzing, the paper 'randomly sample[s] a quarter of user and injection tasks from each suite to evaluate each newly mutated seed' (Section 5.1). This introduces stochasticity into the evaluation of each seed, but no variance analysis across different random samples is performed to verify result stability."
    354     }
    355   ],
    356   "cited_papers": [
    357     {
    358       "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents",
    359       "authors": [
    360         "Debenedetti, E.",
    361         "Zhang, J.",
    362         "Balunovic, M.",
    363         "Beurer-Kellner, L.",
    364         "Fischer, M.",
    365         "Tramer, F."
    366       ],
    367       "year": 2024,
    368       "arxiv_id": "2406.13352",
    369       "relevance": "Primary benchmark used to evaluate AGENTVIGIL; directly relevant as a framework for assessing LLM agent security."
    370     },
    371     {
    372       "title": "Dissecting Adversarial Robustness of Multimodal LM Agents",
    373       "authors": [
    374         "Wu, C. H.",
    375         "Shah, R. R.",
    376         "Koh, J. Y.",
    377         "Salakhutdinov, R.",
    378         "Fried, D.",
    379         "Raghunathan, A."
    380       ],
    381       "year": 2024,
    382       "relevance": "VWA-adv benchmark paper and baseline; central to the evaluation of AGENTVIGIL on web agents with multimodal inputs."
    383     },
    384     {
    385       "title": "GPTFuzzer: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts",
    386       "authors": [
    387         "Yu, J.",
    388         "Lin, X.",
    389         "Xing, X."
    390       ],
    391       "year": 2023,
    392       "arxiv_id": "2309.10253",
    393       "relevance": "Prior fuzzing work for direct prompt injection that AGENTVIGIL adapts and extends to indirect injection in multi-step agents."
    394     },
    395     {
    396       "title": "More than you've asked for: A Comprehensive Analysis of Novel Prompt Injection Threats to Application-Integrated LLMs",
    397       "authors": [
    398         "Greshake, K.",
    399         "Abdelnabi, S.",
    400         "Mishra, S.",
    401         "Endres, C.",
    402         "Holz, T.",
    403         "Fritz, M."
    404       ],
    405       "year": 2023,
    406       "arxiv_id": "2302.12173",
    407       "relevance": "Foundational work characterizing indirect prompt injection threats to LLM-integrated applications."
    408     },
    409     {
    410       "title": "AgentPoison: Red-teaming LLM Agents via Poisoning Memory or Knowledge Bases",
    411       "authors": [
    412         "Chen, Z.",
    413         "Xiang, Z.",
    414         "Xiao, C.",
    415         "Song, D.",
    416         "Li, B."
    417       ],
    418       "year": 2024,
    419       "arxiv_id": "2407.12784",
    420       "relevance": "Related agent red-teaming attack that uses white-box access; contrasted with AGENTVIGIL's black-box approach."
    421     },
    422     {
    423       "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
    424       "authors": [
    425         "Liu, Y.",
    426         "Jia, Y.",
    427         "Geng, R.",
    428         "Jia, J.",
    429         "Gong, N. Z."
    430       ],
    431       "year": 2024,
    432       "relevance": "OpenPromptInjection benchmark used as additional baseline in Appendix B.2; directly relevant to prompt injection evaluation."
    433     },
    434     {
    435       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated LLM Agents",
    436       "authors": [
    437         "Zhan, Q.",
    438         "Liang, Z.",
    439         "Ying, Z.",
    440         "Kang, D."
    441       ],
    442       "year": 2024,
    443       "arxiv_id": "2403.02691",
    444       "relevance": "Another indirect prompt injection benchmark used as baseline comparison in Appendix B.2."
    445     },
    446     {
    447       "title": "Defending against Indirect Prompt Injection Attacks with Spotlighting",
    448       "authors": [
    449         "Hines, K.",
    450         "Lopez, G.",
    451         "Hall, M.",
    452         "Zarfati, F.",
    453         "Zunger, Y.",
    454         "Kiciman, E."
    455       ],
    456       "year": 2024,
    457       "arxiv_id": "2403.14720",
    458       "relevance": "Defense mechanism (delimiter-based spotlighting) tested against AGENTVIGIL; relevant to agent security survey."
    459     },
    460     {
    461       "title": "The Instruction Hierarchy: Training LLMs to Prioritize Privileged Instructions",
    462       "authors": [
    463         "Wallace, E.",
    464         "Xiao, K.",
    465         "Leike, R.",
    466         "Weng, L.",
    467         "Heidecke, J.",
    468         "Beutel, A."
    469       ],
    470       "year": 2024,
    471       "arxiv_id": "2404.13208",
    472       "relevance": "Training-based defense against prompt injection that AGENTVIGIL evaluates indirectly through model robustness testing."
    473     },
    474     {
    475       "title": "WebArena: A Realistic Web Environment for Building Autonomous Agents",
    476       "authors": [
    477         "Zhou, S.",
    478         "Xu, F. F.",
    479         "Zhu, H.",
    480         "Zhou, X.",
    481         "Lo, R.",
    482         "Sridhar, A.",
    483         "Cheng, X.",
    484         "Ou, T.",
    485         "Bisk, Y.",
    486         "Fried, D."
    487       ],
    488       "year": 2023,
    489       "arxiv_id": "2307.13854",
    490       "relevance": "Environment used for the real-world case study in Section 5.4; foundational web agent benchmark."
    491     },
    492     {
    493       "title": "VisualWebArena: Evaluating Multimodal Agents on Realistic Visual Web Tasks",
    494       "authors": [
    495         "Koh, J. Y.",
    496         "Lo, R.",
    497         "Jang, L.",
    498         "Duvvur, V.",
    499         "Lim, M. C.",
    500         "Huang, P.-Y.",
    501         "Neubig, G.",
    502         "Zhou, S.",
    503         "Salakhutdinov, R.",
    504         "Fried, D."
    505       ],
    506       "year": 2024,
    507       "arxiv_id": "2401.13649",
    508       "relevance": "Foundation for VWA-adv benchmark; relevant as a multimodal web agent evaluation framework."
    509     },
    510     {
    511       "title": "PromptFuzz: Harnessing Fuzzing Techniques for Robust Testing of Prompt Injection in LLMs",
    512       "authors": [
    513         "Yu, J.",
    514         "Shao, Y.",
    515         "Miao, H.",
    516         "Shi, J.",
    517         "Xing, X."
    518       ],
    519       "year": 2024,
    520       "arxiv_id": "2409.14729",
    521       "relevance": "Related fuzzing approach for prompt injection testing; directly comparable to AGENTVIGIL's methodology."
    522     },
    523     {
    524       "title": "AdvWeb: Controllable Black-box Attacks on VLM-powered Web Agents",
    525       "authors": [
    526         "Xu, C.",
    527         "Kang, M.",
    528         "Zhang, J.",
    529         "Liao, Z.",
    530         "Mo, L.",
    531         "Yuan, M.",
    532         "Sun, H.",
    533         "Li, B."
    534       ],
    535       "year": 2024,
    536       "arxiv_id": "2410.17401",
    537       "relevance": "Related black-box attack on web agents; contrasts with AGENTVIGIL's generic approach for indirect injection."
    538     }
    539   ]
    540 }

Impressum · Datenschutz