ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32746B)


      1 {
      2   "paper": {
      3     "title": "WebInject: Prompt Injection Attack to Web Agents",
      4     "authors": [
      5       "Xilong Wang",
      6       "John Bloch",
      7       "Zedian Shao",
      8       "Yuepeng Hu",
      9       "Shuyan Zhou",
     10       "Neil Zhenqiang Gong"
     11     ],
     12     "year": 2025,
     13     "venue": "Conference on Empirical Methods in Natural Language Processing",
     14     "arxiv_id": "2505.11717",
     15     "doi": "10.18653/v1/2025.emnlp-main.104"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "WebInject proposes an optimization-based prompt injection attack that perturbs raw webpage pixel values to mislead MLLM-based web agents into performing attacker-specified actions. The attack achieves 96-97% attack success rate across five open-source MLLMs (UI-TARS, Phi-4, Llama-3.2, Qwen-2.5, Gemma-3), vastly outperforming heuristic webpage-based baselines (max 34.5% ASR) and screenshot-based attacks (0% ASR when naively applied to raw pixels). A key technical contribution is training a neural network to approximate the non-differentiable webpage-to-screenshot mapping, enabling gradient-based optimization of perturbations constrained to be imperceptible (ℓ∞ ≤ 16/255).",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "No code repository URL is provided in the paper. No mention of code release on GitHub, Zenodo, or any archive."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "The paper constructs 10 webpage datasets (5 real, 5 synthetic) but does not provide download links or mention releasing them. The synthetic webpages were generated via GPT-4-Turbo prompts shown in Fig. 9, but the actual generated webpages are not released."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "The paper mentions using selenium, PIL, ImageCms, Canvas API, PyTorch, and an NVIDIA RTX A6000 GPU, but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Algorithm 1 provides pseudo-code for the attack and Fig. 3 shows monitor simulation code, but there are no step-by-step reproduction instructions, README, or runnable scripts."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Table 1 and all other result tables report single ASR values averaged across datasets with no confidence intervals, error bars, or uncertainty measures."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims WebInject 'significantly outperforms' baselines (e.g., 0.972 vs 0.062 for Gemma-3) but provides no statistical significance tests — the claim rests solely on a side-by-side comparison of raw ASR numbers."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "The paper reports ASR values for both WebInject and all baselines (Table 1), providing sufficient context to assess effect size. E.g., 'the success rate of our attack is 0.910 higher than the best-performing baseline' for Gemma-3, with baseline ASR of 0.062 vs WebInject ASR of 0.972."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "No justification is provided for the choice of 10 webpage datasets, 10 target prompts per webpage, 10 shadow histories, or the number of real webpages per category (ranging from 26 to 51, Table 3). No power analysis."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "Results are averaged across datasets and presented as single point estimates. No standard deviation, interquartile range, or spread measure is reported across webpages, prompts, or experimental runs."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Table 1 compares WebInject against five baselines: Naive Attack, Context Ignoring, Fake Completion, Combined Attack (all webpage-based), and Screenshot-based attacks. These cover both major categories of prior work."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Baselines include EIA (Liao et al., 2025), Pop-up Attack (Zhang et al., 2024), screenshot-based attacks (Aichberger et al., 2025; Zhao et al., 2025), and various textual prompt injection methods. All are from 2024-2025, representing the state of the art."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The paper includes ablation studies on: number of target monitors (Fig. 2a), perturbation bound ε (Fig. 2b), semantically equivalent user prompts (Table 11), and different target actions (Table 10)."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "The paper uses only Attack Success Rate (ASR) as the evaluation metric. No perceptual quality metrics (SSIM, LPIPS), human detection rates, or other complementary metrics are reported."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "No human evaluation is conducted to verify the stealthiness claim. The paper argues perturbations with ℓ∞ ≤ 16/255 are imperceptible based on prior work convention, but no human study confirms this for the specific webpage context."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "The paper uses separate shadow history sets (for optimization) and user history sets (for evaluation), as described in Section 5.1: 'the shadow history set is used by an attacker to optimize the perturbation, while the user history set is used to evaluate the perturbation.' Semantically equivalent user prompts (Table 11) also test generalization beyond optimization inputs."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Tables 4-9 provide per-dataset breakdowns across all 10 webpage categories (Blog, Commerce, Education, Healthcare, Portfolio × Synthetic/Real) for all attacks and agents."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": false,
    110         "justification": "The paper does not analyze the roughly 2.5-3.7% of cases where the attack fails (overall ASR of 0.963-0.975 in Table 1). No error analysis, qualitative examples of failures, or discussion of when/why the approach breaks down."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "Every experiment shows WebInject succeeding at high rates. The paper does not report approaches that were tried and failed, alternative formulations that didn't work, or configurations that broke the attack."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims WebInject is 'highly effective and significantly outperforms baselines.' Table 1 supports this with ASR of 0.963-0.975 across all five MLLMs, compared to baselines maxing at 0.345. The abstract's claim about Gemma-3 ('success rate of our attack is 0.910 higher') is supported by Table 1 (0.972 vs 0.062)."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper claims the optimized perturbation δ causes web agents to perform target actions. The experimental design — controlled pixel perturbation via optimization, evaluated on separate user histories and prompts — constitutes controlled single-variable manipulation adequate for this causal claim."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": true,
    132         "justification": "The Limitations section (Section 8) explicitly states: (1) the threat model may not apply to highly trustworthy sites like Amazon, and (2) transferability to closed-source MLLMs was not evaluated. The title 'Prompt Injection Attack to Web Agents' is appropriately scoped."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not consider alternative explanations for why WebInject achieves such high ASR. No discussion of whether model-specific artifacts, dataset properties, or the simplicity of the target actions could explain the uniformly high success rates."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "The paper measures ASR (exact match of generated action to target action) and claims attack effectiveness. The metric directly measures the intended outcome — whether the agent performs the attacker-specified action — with no proxy gap."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": true,
    149         "justification": "Section 5.1 lists specific model identifiers: 'UI-TARS-7B-SFT', 'Phi-4-multimodal-instruct', 'Llama-3.2-11B-Vision-Instruct', 'Qwen2.5-VL-7B-Instruct', 'Gemma-3-4b-it', with full citations for each."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": false,
    154         "justification": "The meta-prompts used to generate target prompts (Fig. 10) and synthetic webpages (Fig. 9) are provided, but the actual 10 target prompts generated per webpage are not included. The reader cannot reconstruct the exact prompts used in experiments."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5.1 reports the attack settings (ℓ∞-norm constraint ε = 16/255, learning rate α = 0.3, iterations T = 2,500) and the mapping neural network training settings (16,240 input-output pairs, 200 epochs, learning rate 0.005, batch size 16)."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section 2 describes the web agent pipeline: the MLLM receives user prompt, resized screenshot, and interaction history, and outputs an action. Table 2 describes the full action space. The pipeline is simple (no complex agentic scaffolding), and the components are fully described."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.1 documents: real webpages collected via SingleFile extension across 5 categories, synthetic webpages generated by GPT-4-Turbo (100 per category), prompts generated per webpage, monitor simulation via selenium + ICC profiles. Table 3 provides dataset statistics."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Section 8 is a dedicated 'Limitations' section with substantive discussion of two specific limitations and potential defenses."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 8 identifies specific threats: (1) the assumption of source code modification may not hold for trustworthy sites like Amazon, (2) transferability to closed-source MLLMs was not evaluated due to computational constraints. These are specific to this study."
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "Section 8 states specific scope boundaries: only open-source MLLMs tested, attacker must control webpage source code, and transferability to closed-source models is an open question. The threat model (Section 3) also bounds scope."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "Neither the webpage datasets, generated prompts, optimized perturbations, nor raw experimental results are made available for independent verification."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 5.1 describes data collection: real webpages via SingleFile extension across 5 categories, synthetic webpages via GPT-4-Turbo with the prompt in Fig. 9, monitor simulation via selenium + ICC profiles (Fig. 3)."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": false,
    202         "answer": false,
    203         "justification": "No human participants. The study evaluates automated attacks against ML models using constructed webpage datasets."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "The pipeline is documented: collect webpages → generate target prompts (Fig. 10) → generate shadow/user histories → train mapping neural networks → optimize perturbation via PGD (Algorithm 1) → evaluate ASR. Fig. 3 shows monitor simulation code."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": true,
    215         "justification": "Section 9 (Acknowledgments) lists NSF grants: 'No. 2414406, 2131859, 2125977, 2112562, 1937787, and 2450935.'"
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "All authors are listed as Duke University affiliates with email addresses. They are not affiliated with any of the companies whose models they evaluate."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": true,
    225         "justification": "NSF is the funder and has no financial interest in whether the attack succeeds or fails. The researchers are independent academic researchers evaluating open-source models."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests or financial interests statement is included in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "The paper evaluates an adversarial attack's ability to manipulate web agents, not a model's learned capability on a benchmark. Whether the model has seen the test webpages is irrelevant to the pixel-perturbation attack mechanism."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "Train/test overlap is not a concern for this paper because the attack works by optimizing pixel perturbations against the model's parameters, not by exploiting model knowledge of test data."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Benchmark contamination is not a concern for this paper: the attack is optimization-based and does not depend on whether the model has seen the evaluation webpages during training."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants in the study. All experiments are automated evaluations of adversarial attacks against ML models."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": true,
    291         "justification": "Table 12 reports per-target-webpage per-target-monitor training time and GPU memory usage for each MLLM agent, benchmarked on a single NVIDIA RTX A6000 GPU. However, costs are reported as relative additions (Δ + X min, Ω + X GB) over the screenshot-based baseline without stating the baseline's absolute cost."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Table 12 provides relative per-unit costs (e.g., Δ+1.92 min, Ω+1.93 GB compared to screenshot-based attacks) but the total compute budget across all experiments is not stated, and the baseline absolute costs (Δ, Ω) are not given."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of random seeds, seed sensitivity analysis, or results across multiple seeds. The optimization involves random initialization of shadow histories and prompt mini-batches, but sensitivity to this randomness is not analyzed."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": false,
    308         "justification": "The paper does not state how many times the perturbation optimization was run per target webpage, or whether the reported ASR values reflect single-run or multi-run averages."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "Fixed hyperparameters are used (ε=16/255, α=0.3, T=2500) with sensitivity analysis on ε (Fig. 2b) but no description of how these defaults were selected or how many configurations were tried."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "The default hyperparameters (ε=16/255, α=0.3, T=2500) are not backed by any systematic selection procedure. The ε ablation study (Fig. 2b) shows higher ε gives better ASR, but 16/255 is selected based on a convention from prior adversarial example work, not systematic optimization, and no rationale is given for the chosen α and T."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": false,
    322         "answer": false,
    323         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors implement both their attack and the baselines. They do not acknowledge potential bias from implementing baselines themselves, which Lucic et al. (2018) showed can lead to systematic underperformance of baselines."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "Table 12 reports relative compute costs but does not plot or analyze performance as a function of compute budget. The paper does not compare whether baselines would perform better with equivalent compute."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper uses ASR as the sole measure of attack effectiveness but does not discuss whether exact action matching is the right metric, or whether partial success (e.g., clicking near the target) should be considered. No discussion of construct validity of the evaluation setup."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": false,
    342         "answer": false,
    343         "justification": "The evaluation tests bare MLLM inference (prompt + screenshot → action) without complex scaffolding. No scaffolding confound is present."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "No discussion of whether the open-source MLLMs may have been trained on the real webpages used in evaluation, or whether temporal ordering of model training vs. webpage collection matters."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": true,
    355         "justification": "The paper explicitly separates optimization inputs from evaluation inputs: shadow histories (optimization) vs. user histories (evaluation) are drawn from different random samples. Target prompts vs. semantically equivalent user prompts provide another separation layer (Section 5.1)."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "No discussion of whether webpages within the same category share structural properties that could inflate ASR consistency, or whether the synthetic webpages (all generated by GPT-4-Turbo) share artifacts."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No concrete leakage detection or prevention method is used beyond the train/eval split of histories and prompts."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "WebInject achieves >96% attack success rate across five open-source MLLMs, vastly outperforming all baselines.",
    372       "evidence": "Table 1 shows ASR of 0.963-0.975 for WebInject across UI-TARS, Phi-4, Llama-3.2, Qwen-2.5, and Gemma-3, compared to a maximum baseline ASR of 0.345 (Fake Completion on Llama-3.2).",
    373       "supported": "strong"
    374     },
    375     {
    376       "claim": "Screenshot-based attacks completely fail (0% ASR) when perturbations are applied to raw pixel values instead of screenshots.",
    377       "evidence": "Table 1 shows ASR of 0.000 for screenshot-based attacks across all five MLLMs and all 10 datasets (detailed in Table 9), demonstrating that the webpage-to-screenshot mapping invalidates direct pixel-to-screenshot transfer.",
    378       "supported": "strong"
    379     },
    380     {
    381       "claim": "WebInject remains effective when user prompts are semantically equivalent but not identical to target prompts.",
    382       "evidence": "Table 11 shows ASR of 0.871-0.959 across all agents and datasets when using semantically equivalent user prompts, compared to 0.929-0.999 with exact target prompts (Table 4).",
    383       "supported": "moderate"
    384     },
    385     {
    386       "claim": "The attack generalizes to multiple target action types beyond click, including drag, hotkey, type, scroll, wait, finished, and call_user.",
    387       "evidence": "Table 10 shows ASR of 0.976-0.993 for various target actions on the synthetic Blog dataset with Phi-4. However, this is only tested on one model and one dataset.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "A trained U-Net can approximate the non-differentiable webpage-to-screenshot mapping, enabling gradient-based optimization.",
    392       "evidence": "The paper trains mapping neural networks per target monitor (Section 4.2) and the resulting high ASR values (Table 1) indirectly demonstrate the approximation is effective, but no direct evaluation of approximation quality (e.g., MSE between predicted and actual screenshots) is reported.",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "WebInject perturbations are stealthy (imperceptible to users) at ε ≤ 16/255.",
    397       "evidence": "Fig. 5 shows examples of perturbed webpages at different ε values, and the paper cites prior work convention that ε ≤ 16/255 is generally considered stealthy. However, no human study validates imperceptibility.",
    398       "supported": "weak"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No stealthiness validation with humans",
    404       "detail": "The paper claims perturbations are stealthy/imperceptible but relies solely on the ℓ∞ ≤ 16/255 convention from prior adversarial example work. No human study is conducted to verify that perturbed webpages are actually indistinguishable from originals in a web browsing context, where users may notice subtle color shifts more easily than in natural images."
    405     },
    406     {
    407       "flag": "No error bars or variance across runs",
    408       "detail": "All ASR values are single point estimates with no uncertainty quantification. The optimization involves stochastic elements (random mini-batches of prompts and histories), yet the paper does not report variation across runs or seeds."
    409     },
    410     {
    411       "flag": "Uniformly high ASR across all models is suspiciously clean",
    412       "detail": "ASR ranges from 0.963 to 0.975 across five architecturally diverse MLLMs (Table 1). Such uniformity despite different model architectures, sizes, and training procedures is noteworthy and not discussed."
    413     },
    414     {
    415       "flag": "White-box assumption limits practical applicability",
    416       "detail": "The attack requires full model parameters (Section 3), which is acknowledged in Section 8 but significantly limits real-world applicability since widely deployed web agents often use closed-source MLLMs (GPT-4V, Claude, Gemini)."
    417     },
    418     {
    419       "flag": "Self-implemented baselines may be disadvantaged",
    420       "detail": "The authors implement all baseline attacks themselves (Section 5.1). Webpage-based baselines are adapted into a uniform pop-up framework (Fig. 4) which may not represent each baseline's optimal configuration."
    421     },
    422     {
    423       "flag": "Single evaluation metric",
    424       "detail": "Only ASR (exact action match) is reported. No perceptual quality metrics for stealthiness (SSIM, LPIPS) or partial success measures (e.g., click distance from target) are included."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Eia: Environmental injection attack on generalist web agents for privacy leakage",
    430       "authors": ["Zeyi Liao", "Lingbo Mo", "Chejian Xu", "Mintong Kang", "Jiawei Zhang", "Chaowei Xiao", "Yuan Tian", "Bo Li", "Huan Sun"],
    431       "year": 2025,
    432       "relevance": "Directly relevant prompt injection attack on web agents that injects HTML elements to trick agents into interacting with malicious elements."
    433     },
    434     {
    435       "title": "Attacking vision-language computer agents via pop-ups",
    436       "authors": ["Yanzhe Zhang", "Tao Yu", "Diyi Yang"],
    437       "year": 2024,
    438       "arxiv_id": "2411.02391",
    439       "relevance": "Pop-up-based prompt injection attack on vision-language web agents, a key baseline for evaluating WebInject."
    440     },
    441     {
    442       "title": "Attacking multimodal os agents with malicious image patches",
    443       "authors": ["Lukas Aichberger", "Alasdair Paren", "Yarin Gal", "Philip Torr", "Adel Bibi"],
    444       "year": 2025,
    445       "arxiv_id": "2503.10809",
    446       "relevance": "Screenshot-based adversarial attack on multimodal OS agents, demonstrates adversarial perturbation approach that WebInject improves upon."
    447     },
    448     {
    449       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    450       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    451       "year": 2024,
    452       "relevance": "Foundational work on formalizing prompt injection attacks including the Combined Attack baseline used in WebInject's evaluation."
    453     },
    454     {
    455       "title": "Dissecting adversarial robustness of multimodal lm agents",
    456       "authors": ["Chen Henry Wu", "Rishi Rajesh Shah", "Jing Yu Koh", "Russ Salakhutdinov", "Daniel Fried", "Aditi Raghunathan"],
    457       "year": 2025,
    458       "relevance": "Studies adversarial robustness of multimodal LM agents, directly relevant to understanding vulnerability of MLLM-based web agents."
    459     },
    460     {
    461       "title": "Agentdojo: A dynamic environment to evaluate attacks and defenses for llm agents",
    462       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunović", "Luca Beurer-Kellner", "Marc Fischer", "Florian Tramèr"],
    463       "year": 2024,
    464       "relevance": "Dynamic evaluation framework for attacks and defenses on LLM agents, provides benchmarking infrastructure for agent security research."
    465     },
    466     {
    467       "title": "VisualWebArena: Evaluating multimodal agents on realistic visual web tasks",
    468       "authors": ["Jing Yu Koh", "Robert Lo", "Lawrence Jang", "Vikram Duvvur", "Ming Chong Lim", "Po-Yu Huang", "Graham Neubig", "Shuyan Zhou", "Ruslan Salakhutdinov", "Daniel Fried"],
    469       "year": 2024,
    470       "arxiv_id": "2401.13649",
    471       "relevance": "Major benchmark for evaluating multimodal web agents, provides evaluation infrastructure relevant to assessing web agent capabilities and vulnerabilities."
    472     },
    473     {
    474       "title": "GPT-4V(ision) is a generalist web agent, if grounded",
    475       "authors": ["Boyuan Zheng", "Boyu Gou", "Jihyung Kil", "Huan Sun", "Yu Su"],
    476       "year": 2024,
    477       "arxiv_id": "2401.01614",
    478       "relevance": "Foundational work on using vision-language models as web agents with grounding, establishing the paradigm WebInject attacks."
    479     },
    480     {
    481       "title": "A critical evaluation of defenses against prompt injection attacks",
    482       "authors": ["Yuqi Jia", "Zedian Shao", "Yupei Liu", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"],
    483       "year": 2025,
    484       "arxiv_id": "2505.18333",
    485       "relevance": "Evaluates defenses against prompt injection attacks, directly relevant to understanding the defense landscape for attacks like WebInject."
    486     },
    487     {
    488       "title": "Optimization-based prompt injection attack to LLM-as-a-judge",
    489       "authors": ["Jiawen Shi", "Zenghui Yuan", "Yinuo Liu", "Yue Huang", "Pan Zhou", "Lichao Sun", "Neil Zhenqiang Gong"],
    490       "year": 2024,
    491       "relevance": "Optimization-based prompt injection approach applied to LLM judges, shares the optimization methodology with WebInject but in a different domain."
    492     },
    493     {
    494       "title": "DataSentinel: A game-theoretic detection of prompt injection attacks",
    495       "authors": ["Yupei Liu", "Yuqi Jia", "Jinyuan Jia", "Dawn Song", "Neil Zhenqiang Gong"],
    496       "year": 2025,
    497       "relevance": "Prompt injection detection method that the authors note is not applicable to WebInject since it does not inject explicit textual prompts."
    498     },
    499     {
    500       "title": "On the robustness of GUI grounding models against image attacks",
    501       "authors": ["Haoren Zhao", "Tianyi Chen", "Zhen Wang"],
    502       "year": 2025,
    503       "arxiv_id": "2504.04716",
    504       "relevance": "Studies the robustness of GUI grounding models against image-based attacks; a screenshot-based baseline approach that WebInject is compared against."
    505     },
    506     {
    507       "title": "AdvAgent: Controllable blackbox red-teaming on web agents",
    508       "authors": ["Chejian Xu", "Mintong Kang", "Jiawei Zhang", "Zeyi Liao", "Lingbo Mo", "Mengqi Yuan", "Huan Sun", "Bo Li"],
    509       "year": 2024,
    510       "arxiv_id": "2410.17401",
    511       "relevance": "Black-box red-teaming approach for web agents, complementary to WebInject's white-box adversarial attack methodology."
    512     }
    513   ],
    514   "engagement_factors": {
    515     "practical_relevance": {
    516       "score": 2,
    517       "justification": "Demonstrates a concrete attack vector against deployed web agents, but requires white-box model access and ML expertise to implement."
    518     },
    519     "surprise_contrarian": {
    520       "score": 1,
    521       "justification": "Misleading vision models with adversarial perturbations is well established; the novel contribution is bridging the webpage-to-screenshot mapping gap, which is technically interesting but not paradigm-shifting."
    522     },
    523     "fear_safety": {
    524       "score": 3,
    525       "justification": "Demonstrates a novel attack enabling click fraud, malware downloads, and data theft through imperceptible webpage modifications that hijack autonomous web agents."
    526     },
    527     "drama_conflict": {
    528       "score": 1,
    529       "justification": "Highlights web agent vulnerability but does not target specific products or make controversial claims about the industry."
    530     },
    531     "demo_ability": {
    532       "score": 0,
    533       "justification": "No code, demo, or tool is released; the attack cannot be tried without reimplementation."
    534     },
    535     "brand_recognition": {
    536       "score": 1,
    537       "justification": "Duke University is reputable but not a marquee AI lab; the tested models (Gemma, Qwen, Llama) are recognizable but not headline products like ChatGPT."
    538     }
    539   }
    540 }

Impressum · Datenschutz