scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (35171B)
      1 {
      2   "paper": {
      3     "title": "EVA: Red-Teaming GUI Agents via Evolving Indirect Prompt Injection",
      4     "authors": [
      5       "Yijie Lu",
      6       "Tianjie Ju",
      7       "Manman Zhao",
      8       "Xinbei Ma",
      9       "Yuan Guo",
     10       "Zhuosheng Zhang"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2505.14289",
     15     "doi": "10.48550/arXiv.2505.14289"
     16   },
     17   "scan_version": 3,
     18   "active_modules": ["experimental_rigor", "data_leakage"],
     19   "methodology_tags": ["benchmark-eval"],
     20   "key_findings": "EVA, a feedback-driven red-teaming framework for GUI agents, outperforms static injection baselines by up to +32 percentage points in attack success rate (ASR) in pop-up scenarios and +26 percentage points in chat scenarios across six multimodal agents. The framework discovers transferable attack patterns, with prompts evolved on one model achieving up to a +46 percentage-point ASR improvement on unseen target models. High-risk scenarios (payment, email) prove largely resistant to injection, with 0% success for payment across all models. Persuasive (49.8%) and urgency (40.0%) strategies dominate successful attacks, with model-specific susceptibility patterns.",
     21   "checklist": {
     22     "artifacts": {
     23       "code_released": {
     24         "applies": true,
     25         "answer": false,
     26         "justification": "The paper's contributions section claims 'We build and release a reproducible evaluation pipeline', but the paper provides no repository URL, GitHub link, or archive. No working code link appears anywhere in the paper."
     27       },
     28       "data_released": {
     29         "applies": true,
     30         "answer": false,
     31         "justification": "No dataset of attack samples, agent responses, or scenario environments is released. The injection scenarios are custom-created and described in text/figures but not provided as downloadable artifacts."
     32       },
     33       "environment_specified": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Table 6 lists generation hyperparameters (temperature, top_p, top_k, max_tokens) but no environment specifications such as requirements.txt, Docker configuration, library versions, or hardware setup are provided."
     37       },
     38       "reproduction_instructions": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "No step-by-step reproduction instructions, README, or scripts are provided. While the methodology is described in Section 3 and prompt templates are given in Appendix B, there are no concrete instructions for replicating the full experimental pipeline."
     42       }
     43     },
     44     "statistical_methodology": {
     45       "confidence_intervals_or_error_bars": {
     46         "applies": true,
     47         "answer": false,
     48         "justification": "Tables 2-4 and Table 7 report only point estimates (percentages) with no confidence intervals, error bars, or uncertainty measures. All results are single-value success/failure/invalid rates from 50 samples."
     49       },
     50       "significance_tests": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper claims EVA 'outperforms' and 'consistently improves' over baselines throughout, but no statistical significance tests (t-tests, chi-squared, bootstrap, etc.) are applied to any comparison. All claims rest on raw percentage differences."
     54       },
     55       "effect_sizes_reported": {
     56         "applies": true,
     57         "answer": true,
     58         "justification": "Tables 2 and 4 report both baseline and EVA success rates with explicit delta improvements (e.g., '48 → 80 (+32)'), providing sufficient context to assess the magnitude of improvement relative to baseline performance."
     59       },
     60       "sample_size_justified": {
     61         "applies": true,
     62         "answer": false,
     63         "justification": "The paper uses 50 samples per agent per scenario (Section 4.1: 'we generate 50 samples per agent per scenario') but provides no justification for why 50 was chosen. No power analysis or sample size rationale is discussed."
     64       },
     65       "variance_reported": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No variance, standard deviation, or spread measures are reported across any results. Each condition shows a single success/failure/invalid split from 50 trials with no indication of variability across runs or seeds."
     69       }
     70     },
     71     "evaluation_design": {
     72       "baselines_included": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "A static baseline is defined in Section 4.1: one-shot injections generated by GLM-4v-Plus using fixed prompt templates, evaluated as-is without iterative refinement. EVA is compared against this baseline across all agents and scenarios."
     76       },
     77       "baselines_contemporary": {
     78         "applies": true,
     79         "answer": false,
     80         "justification": "The baseline is the authors' own creation (static GLM-4v-Plus generations) rather than a published prior method. Despite citing existing attack methods like AdvWeb [31], Zhan et al. [10], and WASP [11], none are included as experimental comparisons."
     81       },
     82       "ablation_study": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "Table 7 (Appendix D) presents a goal-prompt ablation comparing 'w/ Goal' vs 'w/o Goal' variants across all six models, showing that goal information boosts ASR by 2-24 percentage points."
     86       },
     87       "multiple_metrics": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "The primary evaluation metric is ASR (attack success rate). While outcomes are categorized as success/failure/invalid, these are components of a single outcome classification, not independent metrics. No alternative metrics (e.g., attack stealth, detection difficulty, time to succeed) are reported."
     91       },
     92       "human_evaluation": {
     93         "applies": true,
     94         "answer": false,
     95         "justification": "Attack success is judged entirely by automated classification using the LLM-based Action Evaluation Prompt (Appendix B.5). No human evaluation of attack quality, realism, or stealth is conducted."
     96       },
     97       "held_out_test_set": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "EVA's optimization and evaluation occur on the same models and scenarios. The main results in Table 2 reflect the final performance of the evolved attacks on the same target model used for optimization. Only the transferability experiments (Table 4) test on unseen targets."
    101       },
    102       "per_category_breakdown": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "Results are broken down by model (6 agents), scenario (pop-up, chat link, payment, email) in Tables 2-3, by source-target model pair for transferability (Table 4), and by persuasion strategy across scenarios and models (Table 5, Figures 7-9)."
    106       },
    107       "failure_cases_discussed": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Table 3 shows 0% success for payment scenarios across all models. Section 4.2 discusses why: 'agents either ignore or explicitly reject the injected content, suggesting that these high-risk contexts are inherently more resistant.' The paper also discusses models with high invalid rates (GPT-4V, UI-TARS-7B-DPO at 50%+ invalid in payment)."
    111       },
    112       "negative_results_reported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Table 3 reports complete failure of attacks in payment scenarios (0% success for all models). Table 4 shows some negative transfer cases (e.g., GPT-4o source → GLM-4v-Plus target: EVA 38 vs baseline 40, Δ=-2). The goal ablation (Table 7) shows substantially reduced ASR without goal information."
    116       }
    117     },
    118     "claims_and_evidence": {
    119       "abstract_claims_supported": {
    120         "applies": true,
    121         "answer": true,
    122         "justification": "The abstract claims EVA 'substantially improves success rates over static baselines' (supported by Tables 2-4), 'far greater transferability' (Table 4), effective under 'goal-agnostic constraints' (Table 7, though weaker), and 'injection styles transfer well across models' (Table 4). All claims have corresponding experimental evidence."
    123       },
    124       "causal_claims_justified": {
    125         "applies": true,
    126         "answer": true,
    127         "justification": "The paper claims EVA's feedback-driven evolution causes higher attack success than static methods. The experimental design compares EVA against the static baseline while holding scenarios, target models, and sample size fixed, which supports causal attribution; note, however, that the compute/interaction budget is not matched between the two methods. The ablation study (Table 7) further isolates the goal component."
    128       },
    129       "generalization_bounded": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The title 'Red-Teaming GUI Agents' and claims about 'common vulnerabilities in multimodal decision-making' extend well beyond the six tested agents in four synthetic scenarios. The paper does not bound generalization to the specific agents, scenario types, or synthetic environment used. Real-world GUI environments with dynamic content and user interaction are not tested."
    133       },
    134       "alternative_explanations_discussed": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "The paper does not consider alternative explanations for EVA's success. For instance, the improvement could be partly due to the static baseline being deliberately weak (their own creation), or the automated evaluation prompt (Appendix B.5) may systematically favor EVA's evolved outputs. No robustness checks against confounds are discussed."
    138       },
    139       "proxy_outcome_distinction": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper measures ASR (clicking injected elements in synthetic scenarios) and frames this as revealing 'vulnerabilities' and 'failure modes' in multimodal decision-making. The gap between clicking a button in a controlled DOM injection and real-world agent vulnerability is not acknowledged. No discussion of whether synthetic ASR translates to real-world security risk."
    143       }
    144     },
    145     "setup_transparency": {
    146       "model_versions_specified": {
    147         "applies": true,
    148         "answer": false,
    149         "justification": "Models are listed as 'GLM-4v-Plus', 'GPT-4V', 'GPT-4o', 'Qwen2.5-VL', 'UI-TARS-7B-DPO', and 'OS-Atlas-base'. No snapshot dates, API versions, or specific model IDs are provided. Per the schema, marketing names like 'GPT-4o' without a snapshot date do not count as specified versions."
    150       },
    151       "prompts_provided": {
    152         "applies": true,
    153         "answer": true,
    154         "justification": "Full prompt templates with actual text are provided in Appendix B: agree button generation (B.2), reject button rewriting (B.2), common steps extraction (B.2), popup confusion generation (B.3), action summarization (B.4), action evaluation (B.5), and attack-type classification (B.6)."
    155       },
    156       "hyperparameters_reported": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Table 6 reports all experimental hyperparameters: temperature (0.7), top_p (1.0), top_k (32), max_tokens (512), max_iter_steps (10), num_evals (10), success_threshold (7). Section B.7 provides explanations for each."
    160       },
    161       "scaffolding_described": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "EVA's pipeline is described in detail in Section 3.3 with Figure 3: keyword lexicon initialization, injection construction via weighted sampling, feedback-driven lexicon update (Equation 5), pruning and regeneration at fixed intervals, and termination criteria. The feedback loop mechanism is fully specified."
    165       },
    166       "data_preprocessing_documented": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The initial keyword lexicon is described only as 'seeded from a combination of curated trigger words (e.g., \"urgent,\" \"confirm,\" \"security\") and LLM-generated distractors' without providing the full initial set. The extraction function S(T(Kt)) is described as isolating tokens 'based on syntactic roles or heuristic salience' without specifying the heuristics. Key preprocessing parameters (pruning δ%, base increment b, bonus B) are not given exact values."
    170       }
    171     },
    172     "limitations_and_scope": {
    173       "limitations_section_present": {
    174         "applies": true,
    175         "answer": true,
    176         "justification": "Appendix A contains a dedicated 'Limitations' section discussing EVA's nature as a behaviour-driven probe, lack of access to deeper perceptual mechanisms, inability to explain why injections succeed, and operation in synthetic environments."
    177       },
    178       "threats_to_validity_specific": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "The limitations section raises study-specific concerns: EVA 'has no access to the deeper perceptual or grounding mechanisms,' 'cannot explain *why* certain injections succeed beyond statistical associations,' and 'ignores the messy co-evolution found in real interfaces, where user intent, agent alignment and environmental distractions interact.'"
    182       },
    183       "scope_boundaries_stated": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "While limitations are discussed, the paper does not explicitly state what the results do NOT show. It does not specify which populations, agent types, or real-world scenarios are excluded from claims. The limitations section describes what EVA cannot do but doesn't frame explicit boundaries on the generalizability of the findings."
    187       }
    188     },
    189     "data_integrity": {
    190       "raw_data_available": {
    191         "applies": true,
    192         "answer": false,
    193         "justification": "No raw data is available: no attack samples, agent responses, evolved keyword lexicons, or per-trial outcome logs are released. Only aggregated percentages appear in the tables."
    194       },
    195       "data_collection_described": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Section 4.1 describes the experimental procedure: 50 samples per agent per scenario, four injection scenarios, six target agents. The static baseline generation process (GLM-4v-Plus, temperature 0.7, 50 independent calls) and EVA's iterative optimization loop are described in Sections 3.3 and 4.1."
    199       },
    200       "recruitment_methods_described": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "The six GUI agents are described as 'widely used generalist and specialist GUI agents' but no selection criteria or justification for choosing these specific six models is provided. The four scenarios are stated to 'reflect typical use cases' without systematic derivation."
    204       },
    205       "data_pipeline_documented": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "While the high-level pipeline (inject → render → feed to agent → classify → compute ASR) is clear, key specifics are missing: the exact pruning percentage δ for lexicon evolution, the values of base increment b and bonus B in Equation 5, convergence criteria, and how the automated action evaluation prompt's output maps to final ASR counts."
    209       }
    210     },
    211     "conflicts_of_interest": {
    212       "funding_disclosed": {
    213         "applies": true,
    214         "answer": false,
    215         "justification": "No funding acknowledgment, grant numbers, or sponsor information appears anywhere in the paper."
    216       },
    217       "affiliations_disclosed": {
    218         "applies": true,
    219         "answer": true,
    220         "justification": "Author affiliations are clearly listed: Wuhan University and Shanghai Jiao Tong University. These are academic institutions not affiliated with the evaluated commercial products (GLM, GPT, Qwen)."
    221       },
    222       "funder_independent_of_outcome": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "Since funding is not disclosed, independence of the funder cannot be assessed. The absence of a funding statement does not confirm unfunded status."
    226       },
    227       "financial_interests_declared": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No competing interests statement, patent disclosures, or financial interest declarations appear in the paper."
    231       }
    232     },
    233     "contamination": {
    234       "training_cutoff_stated": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "This is a red-teaming study that tests attack effectiveness against GUI agents, not a study evaluating model knowledge or capability on a benchmark. Contamination criteria do not apply."
    238       },
    239       "train_test_overlap_discussed": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "Red-teaming study testing attack methods against models, not evaluating pre-trained model capability on a benchmark."
    243       },
    244       "benchmark_contamination_addressed": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "Red-teaming study testing attack methods against models, not evaluating pre-trained model capability on a benchmark."
    248       }
    249     },
    250     "human_studies": {
    251       "pre_registered": {
    252         "applies": false,
    253         "answer": false,
    254         "justification": "No human participants are involved. All experiments are conducted with AI agents responding to injected GUI content."
    255       },
    256       "irb_or_ethics_approval": {
    257         "applies": false,
    258         "answer": false,
    259         "justification": "No human participants are involved in the study."
    260       },
    261       "demographics_reported": {
    262         "applies": false,
    263         "answer": false,
    264         "justification": "No human participants are involved in the study."
    265       },
    266       "inclusion_exclusion_criteria": {
    267         "applies": false,
    268         "answer": false,
    269         "justification": "No human participants are involved in the study."
    270       },
    271       "randomization_described": {
    272         "applies": false,
    273         "answer": false,
    274         "justification": "No human participants are involved in the study."
    275       },
    276       "blinding_described": {
    277         "applies": false,
    278         "answer": false,
    279         "justification": "No human participants are involved in the study."
    280       },
    281       "attrition_reported": {
    282         "applies": false,
    283         "answer": false,
    284         "justification": "No human participants are involved in the study."
    285       }
    286     },
    287     "cost_and_practicality": {
    288       "inference_cost_reported": {
    289         "applies": true,
    290         "answer": false,
    291         "justification": "No API costs, token consumption, or per-trial costs are reported despite using multiple commercial APIs (GPT-4V, GPT-4o, GLM-4v-Plus) across 50 samples × 6 agents × 4 scenarios plus iterative EVA optimization loops."
    292       },
    293       "compute_budget_stated": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No total computational budget, API spend, wall-clock time, or hardware specifications are reported. The iterative optimization in EVA (up to 10 iterations per sample with 10 evaluations each) implies significant compute but this is not quantified."
    297       }
    298     },
    299     "experimental_rigor": {
    300       "seed_sensitivity_reported": {
    301         "applies": true,
    302         "answer": false,
    303         "justification": "No mention of random seeds or seed sensitivity analysis. The stochastic elements (LLM generation with temperature 0.7, keyword sampling) could produce different results across seeds, but this is not investigated."
    304       },
    305       "number_of_runs_stated": {
    306         "applies": true,
    307         "answer": true,
    308         "justification": "Section 4.1 explicitly states '50 samples per agent per scenario' for the baseline, and the EVA optimization parameters include 'num_evals: 10' per popup and 'max_iter_steps: 10' (Table 6)."
    309       },
    310       "hyperparameter_search_budget": {
    311         "applies": true,
    312         "answer": false,
    313         "justification": "The hyperparameters in Table 6 appear fixed without any search. No mention of how these values (e.g., temperature 0.7, success_threshold 7, max_iter_steps 10) were selected or how many configurations were explored."
    314       },
    315       "best_config_selection_justified": {
    316         "applies": true,
    317         "answer": false,
    318         "justification": "Only one configuration is reported with no explanation of how the hyperparameters in Table 6 were chosen. No validation set or selection criterion is described for choosing the final parameter values."
    319       },
    320       "multiple_comparison_correction": {
    321         "applies": true,
    322         "answer": false,
    323         "justification": "The paper makes comparisons across 6 models × 4 scenarios × 2 methods but applies no statistical tests at all, let alone corrections for multiple comparisons."
    324       },
    325       "self_comparison_bias_addressed": {
    326         "applies": true,
    327         "answer": false,
    328         "justification": "The authors compare EVA (their system) against a static baseline they created themselves. No acknowledgment of author-evaluation bias. No independent evaluation or third-party reproduction is mentioned."
    329       },
    330       "compute_budget_vs_performance": {
    331         "applies": true,
    332         "answer": false,
    333         "justification": "EVA uses iterative optimization (up to 10 iterations × 10 evaluations) while the static baseline generates samples in a single pass. This significant compute difference is never discussed or controlled for. It is unclear whether the improvement comes from the adaptive algorithm or simply from having more compute budget."
    334       },
    335       "benchmark_construct_validity": {
    336         "applies": true,
    337         "answer": false,
    338         "justification": "The paper does not discuss whether ASR on synthetic DOM-injected scenarios actually measures real-world GUI agent vulnerability. No analysis of whether these controlled scenarios represent the full attack surface or whether ASR correlates with real-world security risk."
    339       },
    340       "scaffold_confound_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The six agents have different architectures and internal scaffolding (generalist vs specialist, different visual grounding methods). When comparing vulnerability across agents, these differences are confounded with underlying model differences. The paper does not control for or discuss this confound."
    344       }
    345     },
    346     "data_leakage": {
    347       "temporal_leakage_addressed": {
    348         "applies": true,
    349         "answer": false,
    350         "justification": "The models may have been trained on similar attack patterns or defensive strategies from prior work. No discussion of whether the tested models' training data includes indirect prompt injection examples or defenses that could affect results."
    351       },
    352       "feature_leakage_addressed": {
    353         "applies": true,
    354         "answer": false,
    355         "justification": "The evaluation setup feeds the full rendered screenshot to agents. No discussion of whether the visual rendering process leaks information about what constitutes an injection (e.g., positioning, styling cues) that would not be present in real-world attacks."
    356       },
    357       "non_independence_addressed": {
    358         "applies": true,
    359         "answer": false,
    360         "justification": "The 50 static baseline samples are generated independently, but EVA's evolved samples within a single optimization run are correlated (each iteration builds on prior successes). This non-independence is not discussed and could inflate apparent ASR improvements."
    361       },
    362       "leakage_detection_method": {
    363         "applies": true,
    364         "answer": false,
    365         "justification": "No leakage detection or prevention methods are applied. No analysis of whether models have been specifically trained to resist these injection patterns."
    366       }
    367     }
    368   },
    369   "claims": [
    370     {
    371       "claim": "EVA consistently outperforms static baselines in attack success rate across all agents and scenarios, with up to +32% improvement in pop-up scenarios.",
    372       "evidence": "Tables 2-3 show EVA vs baseline results across 6 models and 4 scenarios. In pop-up scenarios, EVA achieves +32% on GLM-4v-Plus (48→80%), +26% on OS-Atlas-Base (20→46%). In chat scenarios, up to +26% on GPT-4V (16→42%).",
    373       "supported": "moderate"
    374     },
    375     {
    376       "claim": "EVA-evolved prompts show strong transferability across GUI agents, with improvements up to +46% when replaying prompts evolved on one model to another.",
    377       "evidence": "Table 4 shows cross-agent ASR for pop-up scenarios. Qwen2.5-VL→GPT-4V achieves +46% improvement. Most source-target pairs show positive transfer for EVA over baseline.",
    378       "supported": "moderate"
    379     },
    380     {
    381       "claim": "GUI agents are inherently resistant to high-risk injection scenarios such as payment fraud.",
    382       "evidence": "Table 3 shows 0% attack success for payment scenarios across all six models. The paper notes agents 'either ignore or explicitly reject the injected content.'",
    383       "supported": "strong"
    384     },
    385     {
    386       "claim": "Persuasive (49.8%) and urgency (40.0%) strategies dominate successful indirect prompt injections, with model-specific susceptibility patterns.",
    387       "evidence": "Table 5 and Figures 7-9 provide strategy distribution analysis. UI-TARS-7B-DPO shows 50.8% sensitivity to urgency while GPT-4V shows 51.6% to persuasive content.",
    388       "supported": "moderate"
    389     },
    390     {
    391       "claim": "Goal information significantly boosts attack success; removing user goals from adversarial prompts reduces ASR by 2-24 percentage points.",
    392       "evidence": "Table 7 (Appendix D) shows w/ Goal vs w/o Goal comparison across 6 models in pop-up scenario. Largest drop: GLM-4v-Plus (-24%), smallest: UI-TARS-7B-DPO (-2%).",
    393       "supported": "moderate"
    394     },
    395     {
    396       "claim": "Visual attention concentration is the key factor determining injection success — concentrated attention on injected elements leads to higher ASR than dispersed attention.",
    397       "evidence": "Figure 4 shows attention heatmaps for pop-up (concentrated, successful) vs chat link (dispersed, unsuccessful). Section 5.2 discusses the mechanism qualitatively.",
    398       "supported": "weak"
    399     }
    400   ],
    401   "red_flags": [
    402     {
    403       "flag": "No statistical significance testing",
    404       "detail": "All claims that EVA 'outperforms' or 'improves' are based on raw percentage comparisons from 50 samples per condition. With n=50, many of the reported differences (e.g., +2% on Qwen2.5-VL for chat) may not be statistically significant. No confidence intervals, p-values, or other statistical tests are reported for any comparison."
    405     },
    406     {
    407       "flag": "Self-created baseline favors the proposed method",
    408       "detail": "The static baseline is the authors' own creation (single-pass GLM-4v-Plus generations) rather than a published prior attack method. Despite citing AdvWeb, WASP, and other existing attack frameworks, none are included as experimental comparisons. The baseline may be deliberately simple to maximize the appearance of EVA's improvement."
    409     },
    410     {
    411       "flag": "Compute budget confound",
    412       "detail": "EVA uses up to 10 optimization iterations with 10 evaluations each per sample, while the static baseline generates samples in a single pass. The improvement could partially reflect having more compute/interaction budget rather than the quality of the adaptive algorithm. No compute-matched comparison is provided."
    413     },
    414     {
    415       "flag": "Automated evaluation without human validation",
    416       "detail": "Attack success is judged entirely by an LLM-based prompt (Appendix B.5) that classifies agent responses as success/failure/invalid. The reliability and accuracy of this automated judge are not validated against human judgments. Errors in the judge could systematically bias results."
    417     },
    418     {
    419       "flag": "Claimed but unreleased artifacts",
    420       "detail": "Contribution (ii) explicitly states 'We build and release a reproducible evaluation pipeline' but no code repository URL, data archive, or download link appears in the paper. This is a claim of artifact release without evidence of actual release."
    421     },
    422     {
    423       "flag": "Small sample sizes for some claims",
    424       "detail": "With 50 samples per condition, many reported differences are small in absolute terms (e.g., +2 percentage points = 1 additional success out of 50). The persuasion strategy analysis (Table 5) subdivides successful attacks further, resulting in very small cell counts for some strategy-scenario-model combinations."
    425     }
    426   ],
    427   "cited_papers": [
    428     {
    429       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    430       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    431       "year": 2023,
    432       "relevance": "Foundational work formalizing indirect prompt injection risks in real-world LLM toolchains, directly relevant to the attack paradigm EVA builds upon."
    433     },
    434     {
    435       "title": "Caution for the environment: Multimodal agents are susceptible to environmental distractions",
    436       "authors": ["Xinbei Ma", "Yiting Wang", "Yao Yao", "Tongxin Yuan", "Aston Zhang", "Zhuosheng Zhang", "Hai Zhao"],
    437       "year": 2024,
    438       "arxiv_id": "2408.02544",
    439       "relevance": "Empirically demonstrates multimodal GUI agents' susceptibility to visual distractions, establishing the vulnerability EVA exploits."
    440     },
    441     {
    442       "title": "Attacking vision-language computer agents via pop-ups",
    443       "authors": ["Yanzhe Zhang", "Tao Yu", "Diyi Yang"],
    444       "year": 2024,
    445       "arxiv_id": "2411.02391",
    446       "relevance": "Demonstrates pop-up-based attacks on vision-language agents, a key injection scenario evaluated in EVA."
    447     },
    448     {
    449       "title": "EIA: Environmental injection attack on generalist web agents for privacy leakage",
    450       "authors": ["Zeyi Liao", "Lingbo Mo", "Chejian Xu", "Mintong Kang", "Jiawei Zhang", "Chaowei Xiao", "Yuan Tian", "Bo Li", "Huan Sun"],
    451       "year": 2025,
    452       "relevance": "Formalizes environmental injection attacks on web agents for privacy leakage, directly related to the threat model EVA addresses."
    453     },
    454     {
    455       "title": "Adaptive attacks break defenses against indirect prompt injection attacks on LLM agents",
    456       "authors": ["Qiusi Zhan", "Richard Fang", "Henil Shalin Panchal", "Daniel Kang"],
    457       "year": 2025,
    458       "arxiv_id": "2503.00061",
    459       "relevance": "Proposes joint layout-text optimization for adaptive attacks on LLM agents, a closely related concurrent approach to EVA's evolving injection strategy."
    460     },
    461     {
    462       "title": "WASP: Benchmarking web agent security against prompt injection attacks",
    463       "authors": ["Ivan Evtimov", "Arman Zharmagambetov", "Aaron Grattafiori", "Chuan Guo", "Kamalika Chaudhuri"],
    464       "year": 2025,
    465       "arxiv_id": "2504.18575",
    466       "relevance": "Provides standardized benchmarks for evaluating web agent security against prompt injection, directly relevant to systematic evaluation of GUI agent vulnerabilities."
    467     },
    468     {
    469       "title": "AdvWeb: Controllable black-box attacks on VLM-powered web agents",
    470       "authors": ["Chejian Xu", "Mintong Kang", "Jiawei Zhang", "Zeyi Liao", "Lingbo Mo", "Mengqi Yuan", "Huan Sun", "Bo Li"],
    471       "year": 2024,
    472       "arxiv_id": "2410.17401",
    473       "relevance": "DOM-level perturbation tool for black-box attacks on vision-language web agents, a direct comparison point for EVA's attack methodology."
    474     },
    475     {
    476       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    477       "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"],
    478       "year": 2024,
    479       "arxiv_id": "2401.05566",
    480       "relevance": "Introduces hidden-trigger deceptive behaviors in LLMs that persist through safety training, relevant to understanding latent vulnerabilities EVA seeks to exploit."
    481     },
    482     {
    483       "title": "Universal and transferable adversarial attacks on aligned language models",
    484       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    485       "year": 2023,
    486       "arxiv_id": "2307.15043",
    487       "relevance": "Foundational work on transferable adversarial attacks against aligned LLMs, establishing key concepts EVA extends to the multimodal GUI setting."
    488     },
    489     {
    490       "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models",
    491       "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"],
    492       "year": 2024,
    493       "relevance": "Automated generation of stealthy jailbreak prompts, related to EVA's evolutionary approach to crafting effective adversarial content."
    494     },
    495     {
    496       "title": "UI-TARS: Pioneering automated GUI interaction with native agents",
    497       "authors": ["Yujia Qin", "Yining Ye", "Junjie Fang"],
    498       "year": 2025,
    499       "arxiv_id": "2501.12326",
    500       "relevance": "One of the six GUI agents evaluated as a target in EVA's red-teaming experiments, representing specialist GUI agent architecture."
    501     },
    502     {
    503       "title": "OS-ATLAS: Foundation action model for generalist GUI agents",
    504       "authors": ["Zhiyong Wu", "Zhenyu Wu", "Fangzhi Xu"],
    505       "year": 2025,
    506       "relevance": "One of the six GUI agents evaluated as a target in EVA's experiments, representing generalist GUI agent design."
    507     },
    508     {
    509       "title": "Jailbreaking black box large language models in twenty queries",
    510       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"],
    511       "year": 2024,
    512       "relevance": "Black-box jailbreaking methodology using iterative refinement, conceptually related to EVA's feedback-driven optimization of adversarial injections."
    513     }
    514   ],
    515   "engagement_factors": {
    516     "practical_relevance": {
    517       "score": 2,
    518       "justification": "Security researchers and GUI agent developers can apply EVA's methodology to test their systems, though no code is released for direct use."
    519     },
    520     "surprise_contrarian": {
    521       "score": 1,
    522       "justification": "The finding that adaptive attacks outperform static ones is expected; the transferability results and payment resistance are mildly surprising but not contrarian."
    523     },
    524     "fear_safety": {
    525       "score": 2,
    526       "justification": "Demonstrates effective attacks against GUI agents including phishing and manipulation scenarios, with transferable patterns raising concern about deployed agents."
    527     },
    528     "drama_conflict": {
    529       "score": 1,
    530       "justification": "Shows that commercial agents (GPT-4V, GPT-4o) are vulnerable to evolved attacks, but there is no major controversy or accusatory framing."
    531     },
    532     "demo_ability": {
    533       "score": 0,
    534       "justification": "No code, demo, or tool is released, despite the paper's claim to 'build and release a reproducible evaluation pipeline.'"
    535     },
    536     "brand_recognition": {
    537       "score": 2,
    538       "justification": "Tests against well-known commercial models (GPT-4V, GPT-4o from OpenAI; Qwen2.5-VL from Alibaba) which attract attention."
    539     }
    540   }
    541 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs