ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (28737B)


      1 {
      2   "paper": {
      3     "title": "Manipulating LLM Web Agents with Indirect Prompt Injection Attack via HTML Accessibility Tree",
      4     "authors": ["Sam Johnson", "Viet Pham", "Thai Le"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2507.14799",
      8     "doi": "10.48550/arXiv.2507.14799"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "case-study"],
     13   "key_findings": "Universal adversarial triggers embedded in webpage HTML via the accessibility tree can hijack LLM-based web navigation agents with high success rates (0.83–1.0 ASR across five real websites). The GCG-optimized triggers are effective for diverse attack scenarios including forced ad clicks, credential exfiltration, and traffic redirection, but do not transfer across models (failed on Llama-2 and Mistral-7B). Optimization can be completed in under an hour when the target string is included in the initial trigger sequence.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub repository provided at https://github.com/sej2020/manipulating-web-agents, released under MIT License. Abstract states: 'The system software is released under the MIT License.'"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The specific experimental datasets (25 training navigation goals, 200 test goals per website, HTML contexts) are not stated as released. A demo website with cached results exists but the underlying experimental data is not separately released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper specifies Llama-3.1-8B-Instruct and Browser Gym but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are provided in the paper. The system design is described at a high level but specific commands or procedures to replicate experiments are absent."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results in Figures 3–5 are reported as averages over 10 runs or as single ASR values on 200 test prompts, but no confidence intervals or error bars are shown."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper claims smaller search width and target-string initialization significantly shorten optimization time, but provides no statistical tests to support these comparative claims."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "ASR values are reported for each website (0.83–1.0 range in Figure 5), and time-to-completion is reported in hours across configurations (Figures 3–4), providing sufficient magnitude context for the claims."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification given for the choice of 200 test prompts per website, 25 training prompts, 10 optimization runs, or 5 sample websites. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Results are 'an average over ten navigation tasks' (Figure 3) but no standard deviation, IQR, or other spread measure is reported for any experiment."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No comparison to prior IPI attack methods (e.g., Imprompter by Fu et al., 2024) or other attack baselines. Results are reported only for the authors' GCG-based approach."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No baselines are included at all, so there are no contemporary baselines either. Imprompter (2024) and adaptive attacks by Zhan et al. (2025) are discussed in related work but not compared against."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section 3.1 systematically varies search width, trigger length, loss function (CW vs CE), top-k candidates, and initial trigger composition, measuring time-to-completion for each (Figures 3, 4, 8–10)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two metrics are used: ASR (proportion of responses that invoke the targeted function) and ASRV (proportion containing the exact target sequence verbatim). Section 3.2."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation of attack stealth, detectability, or real-world impact. All evaluation is automated (measuring whether the agent executes the target action)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "TWUI optimizes on 25 navigation goals and tests on 200 different goals (Section 3.2). UWTI trains on 8 login pages and evaluates on 11 held-out login pages (Section 3.3)."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down per website in Figure 5 (TWUI) and per login page in Figure 6 (UWTI), showing variation across individual settings."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 4 'Failure Analysis' identifies two clusters: (1) concrete instructions with only one obvious action, and (2) responses beginning with chain-of-thought reasoning phrases. Attributes failures to high prior probability on certain tokens."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Several negative results reported: transferability to Llama-2 and Mistral-7B failed (Section 4), CW loss did not improve convergence (Appendix B), trigger length had no effect (Figure 8), UWTI generalized to only 3/11 test websites (Section 3.3)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims 'high success rates across real websites' (supported by 0.83–1.0 ASR in Figure 5), 'login credential exfiltration' (demonstrated in Section 3.3), and 'forced ad clicks' (city scenario in Section 3.1). All claims are backed by experimental results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper claims triggers cause specific agent actions. The experimental design (inject trigger into HTML → observe agent behavior change) is a controlled single-variable manipulation. The causal mechanism is demonstrated directly."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "Title 'Manipulating LLM Web Agents' implies generality across LLM agents, but experiments use only Llama-3.1-8B-Instruct with Browser Gym. Transferability to other LLMs failed (Section 4). The limitations section discusses this but the title and abstract remain broad."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No discussion of alternative explanations for attack success, such as whether Browser Gym's prompt template is particularly susceptible, whether Llama-3.1-8B is especially vulnerable, or whether larger models would be more resistant. The failure analysis discusses why some attacks fail, but not why they succeed."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "ASR directly measures attack success (whether the agent executes the target action). ASRV measures exact target string match. The metrics match the granularity of the claims without proxy inflation."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model versions given: 'Llama-3.1-8B-Instruct' (Section 2.1), 'Llama-2-7b-chat-hf' and 'Mistral-7B-Instruct-v0.3' for transferability experiments (Section 4)."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "The Browser Gym prompt structure is described in natural language ('The prompt comprises context for the web navigation setting, the goal or chat messages from the user, the accessibility tree, and a description of the actions available'—Section 2.1) but the actual prompt text is not provided."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Some GCG hyperparameters are tested (search width 128/256/512, various top-k, trigger lengths) but the paper refers to 'standard GCG hyperparameters' for main experiments without specifying all values. No temperature or sampling parameters for LLM inference reported."
    153       },
    154       "scaffolding_described": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Browser Gym scaffolding is described in Section 2.1: HTML accessibility tree extraction, prompt compilation, LLM querying, response parsing for actions (click, scroll, fill), and browser action execution cycle."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "The process of converting HTML to accessibility tree via Browser Gym is mentioned but not documented in detail. How the 25 training prompts and 200 test prompts were constructed is not described. Website selection criteria are unstated."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section is present, identifying three specific limitations of the attack technique."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Three specific threats identified: (1) attacker must control part of the HTML, (2) triggers are model-specific and don't transfer, (3) target sequences must have syntactic validity in the framework. These are concrete and specific to this study."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "Explicitly states: 'triggers are trained for a particular LLM or set of LLMs, so web-navigation agents underpinned by other LLMs are much less susceptible' and 'A closed-source web navigation framework that rotates its action-space scheme and does not disclose the LLM it uses will be much less susceptible.'"
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "Demo website shows cached results for 5 websites but full raw data (all optimization logs, all 200 test responses per website, model outputs) is not stated as available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "Five real websites are named and attack scenarios described, but the process for selecting these websites, constructing the 200 test navigation goals, and the 25 training goals is not described."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No description of how the 5 sample websites were selected, how the 8 training and 11 test login pages were chosen for UWTI, or how navigation goals were generated."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "The conceptual pipeline (HTML → accessibility tree → prompt → trigger optimization) is described but specific filtering steps, data transformations, and intermediate counts are not documented."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding source or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations clearly listed: Sam Johnson and Thai Le at Indiana University, Viet Pham at University of Science, Ho Chi Minh City. No product-related conflicts."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No funding disclosed; cannot assess funder independence."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is included in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "The paper tests an adversarial attack on a web agent, not a pre-trained model's capability on a benchmark. The triggers are newly generated via GCG optimization, so training data contamination of benchmark problems is not applicable."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "This paper evaluates attack effectiveness, not model knowledge on a pre-existing benchmark. Contamination in the traditional sense does not apply."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "No pre-existing benchmark is used to evaluate model capability. The evaluation measures attack success rate on newly constructed scenarios."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants. The paper includes an Ethical Consideration section discussing responsible disclosure but no IRB was needed."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Wall-clock time for trigger optimization is reported: 'below three hours' with search width 128, 'less than an hour' and 'less than ten minutes' with target-string initialization (Figures 3–4). Time-to-completion averaged over 10 runs."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No GPU hours, hardware specifications, or total compute budget stated. Wall-clock time is reported but the hardware used for GCG optimization is not specified."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "Ten optimization runs are averaged but no variance or sensitivity across runs is reported. GCG optimization is stochastic but seed sensitivity is not analyzed."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 3.1: 'average over 10 optimization runs, with each run featuring a different user-specified task.' Section 3.2: 'test set of 200 prompts.' Numbers are explicit."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "Different hyperparameter values are tested (search width, top-k, trigger length, loss function) but the total search budget and search method are not reported. Main experiments use 'standard GCG hyperparameters' without stating what was tried."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The main TWUI and UWTI experiments use unspecified 'standard GCG hyperparameters.' How this configuration was selected is not explained."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own attack system without comparison to other attack implementations. No acknowledgment of self-comparison bias or independent evaluation."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Time-to-completion is shown for different configurations but performance (ASR) is not reported as a function of compute budget. No compute-matched comparisons with alternative approaches."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "ASR is used as the primary metric without discussing whether syntactic action match fully captures real-world attack success (e.g., does the action actually cause harm, does the user notice, can it be detected?)."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "All experiments use Browser Gym only. The paper claims results about 'LLM Web Agents' generally but does not discuss whether Browser Gym's specific prompt template or action-space design makes the attack easier or harder than other frameworks."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of whether Llama-3.1 may have seen the test websites' HTML during training, which could affect its prior behavior on those sites."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "GCG requires white-box gradient access to the model, which is acknowledged as part of the threat model, but there is no discussion of whether this constitutes an unfair advantage in the evaluation setup."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether the 25 training navigation goals and 200 test goals are structurally independent, or whether overlap in website content between training and test settings could inflate ASR."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No concrete leakage detection or prevention method employed."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Universal adversarial triggers embedded in webpage HTML can hijack LLM web agent behavior with high success rates (0.83–1.0 ASR) across real websites.",
    365       "evidence": "TWUI experiments across 5 real websites with 200 test prompts each show ASR ranging from 0.83 (city) to 1.0 (binary) in Figure 5. Section 3.2.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Smaller search width and target-string initialization significantly reduce GCG optimization time.",
    370       "evidence": "Figure 3 shows search width 128 keeps average runtime below 3 hours. Figure 4 shows target-string initialization reduces time to under 1 hour, sometimes under 10 minutes. Section 3.1. No statistical tests provided.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "Adversarial triggers can exfiltrate login credentials across unseen websites.",
    375       "evidence": "UWTI experiment in Section 3.3: trigger trained on 8 login pages exfiltrated credentials from 3/11 unseen test pages (ASRV), with partial leakage (at least one credential) in 6/11 (ASR=0.55). Figure 6.",
    376       "supported": "weak"
    377     },
    378     {
    379       "claim": "Triggers do not transfer across different LLMs.",
    380       "evidence": "Section 4 'Transferability': 'We attempt to transfer triggers learned in the TWUI setting to other LLMs, namely Llama-2-7b-chat-hf and Mistral-7B-Instruct-v0.3, but were unsuccessful.'",
    381       "supported": "moderate"
    382     },
    383     {
    384       "claim": "CW loss and trigger length do not improve GCG convergence speed for this application.",
    385       "evidence": "Figures 8, 9, 10 in Appendix B show no significant effect of trigger length, top-k, or CW loss on time-to-completion. Section 3.1: 'We do not find any evidence that increasing trigger length or using CW loss increases convergence speed.'",
    386       "supported": "moderate"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Single model, single framework",
    392       "detail": "All main experiments use only Llama-3.1-8B-Instruct with Browser Gym. Transferability to Llama-2 and Mistral-7B failed, yet the paper's title and abstract generalize to 'LLM Web Agents' broadly."
    393     },
    394     {
    395       "flag": "No error bars or variance measures",
    396       "detail": "Results averaged over 10 runs (timing) and 200 test prompts (ASR) but no standard deviation, confidence intervals, or other spread measures reported for any experiment."
    397     },
    398     {
    399       "flag": "No comparison to prior attack methods",
    400       "detail": "Imprompter (Fu et al., 2024) and other IPI attack methods are discussed in related work but not compared against quantitatively, making it impossible to assess the relative effectiveness of this approach."
    401     },
    402     {
    403       "flag": "Small and unexplained website sample",
    404       "detail": "Only 5 websites for TWUI and 8+11 for UWTI, with no justification for selection. Results may not generalize to other website structures, complexities, or domains."
    405     },
    406     {
    407       "flag": "White-box threat model limits practical applicability",
    408       "detail": "GCG requires gradient access to the model. The paper discusses this in limitations but the practical threat is narrower than the framing suggests, since most production web agents use closed-source models."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Universal and transferable adversarial attacks on aligned language models",
    414       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    415       "year": 2023,
    416       "arxiv_id": "2307.15043",
    417       "relevance": "Introduces the GCG algorithm used as the core attack method in this paper; central to LLM safety research on jailbreaking."
    418     },
    419     {
    420       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    421       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    422       "year": 2023,
    423       "relevance": "Introduced the concept of indirect prompt injection (IPI) and classified security risks for LLM-integrated applications."
    424     },
    425     {
    426       "title": "Prompt injection attack against LLM-integrated applications",
    427       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang"],
    428       "year": 2023,
    429       "arxiv_id": "2306.05499",
    430       "relevance": "Studies malicious prompt injection attacks on LLM applications including system prompt overriding."
    431     },
    432     {
    433       "title": "WorkArena: How capable are web agents at solving common knowledge work tasks?",
    434       "authors": ["Alexandre Drouin", "Maxime Gasse", "Massimo Caccia", "Issam H. Laradji"],
    435       "year": 2024,
    436       "relevance": "Introduces the Browser Gym framework used in this paper to create the web navigation agent."
    437     },
    438     {
    439       "title": "Imprompter: Tricking LLM agents into improper tool use",
    440       "authors": ["Xiaohan Fu", "Shuheng Li", "Zihan Wang", "Yihao Liu", "Rajesh K. Gupta", "Taylor Berg-Kirkpatrick", "Earlence Fernandes"],
    441       "year": 2024,
    442       "arxiv_id": "2410.14923",
    443       "relevance": "Extends GCG to generate obfuscated prompts causing tool misuse and data exfiltration; demonstrates transferability to black-box systems."
    444     },
    445     {
    446       "title": "The Llama 3 herd of models",
    447       "authors": ["Aaron Grattafiori", "Abhimanyu Dubey"],
    448       "year": 2024,
    449       "arxiv_id": "2407.21783",
    450       "relevance": "Describes the Llama 3 model family used as the target LLM in this paper's attack experiments."
    451     },
    452     {
    453       "title": "GPT-4 technical report",
    454       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    455       "year": 2023,
    456       "arxiv_id": "2303.08774",
    457       "relevance": "Describes GPT-4, one of the major LLMs used in web navigation agents that this work's threat model applies to."
    458     },
    459     {
    460       "title": "Adaptive attacks break defenses against indirect prompt injection attacks on LLM agents",
    461       "authors": ["Qiusi Zhan", "Richard Fang", "Henil Shalin Panchal", "Daniel Kang"],
    462       "year": 2025,
    463       "arxiv_id": "2503.00061",
    464       "relevance": "Demonstrates that prompt injection defenses are ineffective against adaptive GCG-based attacks on LLM agents."
    465     },
    466     {
    467       "title": "Universal adversarial triggers for attacking and analyzing NLP",
    468       "authors": ["Eric Wallace", "Shi Feng", "Nikhil Kandpal", "Matt Gardner", "Sameer Singh"],
    469       "year": 2019,
    470       "arxiv_id": "1908.07125",
    471       "relevance": "Foundational work on universal adversarial triggers for NLP models, building on gradient-based search methods."
    472     },
    473     {
    474       "title": "PAL: Proxy-guided black-box attack on large language models",
    475       "authors": ["Chawin Sitawarin", "Norman Mu", "David Wagner", "Alexandre Araujo"],
    476       "year": 2024,
    477       "arxiv_id": "2402.09674",
    478       "relevance": "Proposes proxy-guided attacks and studies CW loss convergence for GCG, cited for convergence analysis comparison."
    479     }
    480   ],
    481   "engagement_factors": {
    482     "practical_relevance": {
    483       "score": 2,
    484       "justification": "Demonstrates real vulnerabilities in web navigation agents with released code, useful for security researchers and framework developers but not a directly deployable defense tool."
    485     },
    486     "surprise_contrarian": {
    487       "score": 1,
    488       "justification": "LLM vulnerability to prompt injection is expected; the specific demonstration via accessibility tree and GCG on a real agent framework adds incremental novelty but doesn't overturn conventional wisdom."
    489     },
    490     "fear_safety": {
    491       "score": 3,
    492       "justification": "Demonstrates concrete credential exfiltration, forced ad clicks, and agent hijacking attacks on real websites, raising urgent security concerns about emerging autonomous web agents."
    493     },
    494     "drama_conflict": {
    495       "score": 1,
    496       "justification": "Highlights security flaws in the broader web agent ecosystem but doesn't single out a specific company's product or make a controversial accusation."
    497     },
    498     "demo_ability": {
    499       "score": 2,
    500       "justification": "GitHub repo released under MIT License and a publicly accessible demo website at lethaiq.github.io/attack-web-llm-agent, though not pip-installable."
    501     },
    502     "brand_recognition": {
    503       "score": 1,
    504       "justification": "Uses Meta's Llama and mentions OpenAI Operator and Gemini, but authors are from Indiana University and University of Science HCMC, not a major AI lab."
    505     }
    506   }
    507 }

Impressum · Datenschutz