scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (26234B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "EVA: Red-Teaming GUI Agents via Evolving Indirect Prompt Injection",
      6     "authors": ["Yijie Lu", "Tianjie Ju", "Manman Zhao", "Xinbei Ma", "Yuan Guo", "Zhuosheng Zhang"],
      7     "year": 2025,
      8     "venue": "arXiv.org",
      9     "arxiv_id": "2505.14289",
     10     "doi": "10.48550/arXiv.2505.14289"
     11   },
     12   "checklist": {
     13     "claims_and_evidence": {
     14       "abstract_claims_supported": {
     15         "applies": true,
     16         "answer": true,
     17         "justification": "Core claims about EVA outperforming static baselines are supported by Tables 2 and 4; transferability claims are backed by cross-agent ASR data showing gains for most source–target pairs, though a few pairs show small losses (-2% to -4%).",
     18         "source": "haiku"
     19       },
     20       "causal_claims_justified": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper attributes higher ASR to the adaptive loop mechanism, but no ablation isolates evolution from simply generating more diverse prompt variants; the comparison does not control for number of LLM calls between EVA and the one-shot baseline.",
     24         "source": "haiku"
     25       },
     26       "generalization_bounded": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The conclusion broadly claims EVA 'reframes GUI agent robustness' and reveals 'common vulnerabilities,' but experiments cover only 4 synthetic scenarios and 6 agents without explicitly bounding what the findings do and do not generalize to.",
     30         "source": "haiku"
     31       },
     32       "alternative_explanations_discussed": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper does not consider that EVA's gains could be explained by prompt diversity rather than evolutionary adaptation; the attention-based mechanistic account is presented as the sole interpretation without considering alternatives.",
     36         "source": "haiku"
     37       },
     38       "proxy_outcome_distinction": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "ASR directly measures whether the agent clicks the injected element, which is precisely the attack goal claimed; no conflation between proxy measure and stated outcome.",
     42         "source": "haiku"
     43       }
     44     },
     45     "limitations_and_scope": {
     46       "limitations_section_present": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "Appendix A contains a dedicated 'Limitations' section, though it is a single brief paragraph.",
     50         "source": "haiku"
     51       },
     52       "threats_to_validity_specific": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The limitations mention synthetic environments and inability to explain why attacks succeed, but do not address key validity threats such as small sample size (n=50 per cell), confounds in ASR measurement, or the absence of comparison to contemporary adaptive baselines.",
     56         "source": "haiku"
     57       },
     58       "scope_boundaries_stated": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No explicit statement about what the results do NOT show; the limitations section describes framework constraints but does not demarcate empirical scope boundaries.",
     62         "source": "haiku"
     63       }
     64     },
     65     "conflicts_of_interest": {
     66       "funding_disclosed": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No funding acknowledgment or disclosure appears anywhere in the paper.",
     70         "source": "haiku"
     71       },
     72       "affiliations_disclosed": {
     73         "applies": true,
     74         "answer": true,
     75         "justification": "Author affiliations with Wuhan University and Shanghai Jiao Tong University are clearly stated on the first page.",
     76         "source": "haiku"
     77       },
     78       "funder_independent_of_outcome": {
     79         "applies": false,
     80         "answer": false,
     81         "justification": "No funding is disclosed, so funder independence cannot be assessed.",
     82         "source": "haiku"
     83       },
     84       "financial_interests_declared": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No competing interests or financial disclosure statement appears in the paper.",
     88         "source": "haiku"
     89       }
     90     },
     91     "scope_and_framing": {
     92       "key_terms_defined": {
     93         "applies": true,
     94         "answer": true,
     95         "justification": "Key terms are defined: 'indirect prompt injection' is explained in the introduction, 'GUI agent' is characterized with the formal policy notation, and 'attack success rate' (ASR) is formally defined in Section 4.1.",
     96         "source": "haiku"
     97       },
     98       "intended_contribution_clear": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "Three explicit contributions are enumerated in Section 1: the EVA framework, the reproducible evaluation pipeline, and the large-scale cross-agent transferability study.",
    102         "source": "haiku"
    103       },
    104       "engagement_with_prior_work": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Section 2 engages substantively with prior work on GUI agents and indirect prompt injection, explicitly positioning EVA against static methods like EIA, AdvWeb, and WASP and explaining the methodological gap EVA addresses.",
    108         "source": "haiku"
    109       }
    110     }
    111   },
    112   "type_checklist": {
    113     "empirical": {
    114       "artifacts": {
    115         "code_released": {
    116           "applies": true,
    117           "answer": false,
    118           "justification": "Contribution (ii) claims to 'release a reproducible evaluation pipeline' but no URL, repository link, or access method is provided anywhere in the paper.",
    119           "source": "haiku"
    120         },
    121         "data_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "The HTML injection scenarios and trial-level outcomes used in experiments are not released; no dataset URL is provided.",
    125           "source": "haiku"
    126         },
    127         "environment_specified": {
    128           "applies": true,
    129           "answer": false,
    130           "justification": "Table 6 provides generation hyperparameters but no Python version, package dependencies, or environment specification (requirements.txt, Dockerfile, etc.) is included.",
    131           "source": "haiku"
    132         },
    133         "reproduction_instructions": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "Prompt templates are provided in Appendix B but no step-by-step instructions for running the full EVA pipeline exist; the evaluation framework itself is not accessible.",
    137           "source": "haiku"
    138         }
    139       },
    140       "statistical_methodology": {
    141         "confidence_intervals_or_error_bars": {
    142           "applies": true,
    143           "answer": false,
    144           "justification": "All results in Tables 2–4 and 7 are reported as single percentage values with no confidence intervals or error bars.",
    145           "source": "haiku"
    146         },
    147         "significance_tests": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "No statistical significance tests are applied to comparative claims between EVA and baseline despite multiple percentage comparisons across agents and scenarios.",
    151           "source": "haiku"
    152         },
    153         "effect_sizes_reported": {
    154           "applies": true,
    155           "answer": true,
    156           "justification": "Tables 2 and 4 report delta values (e.g., '+32%', '+26%') showing absolute improvement in ASR over baseline, providing effect size context.",
    157           "source": "haiku"
    158         },
    159         "sample_size_justified": {
    160           "applies": true,
    161           "answer": false,
    162           "justification": "50 samples per agent per scenario are used with no justification for adequacy and no power analysis.",
    163           "source": "haiku"
    164         },
    165         "variance_reported": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "No variance, standard deviation, or run-to-run variability is reported; all results are single point estimates.",
    169           "source": "haiku"
    170         }
    171       },
    172       "evaluation_design": {
    173         "baselines_included": {
    174           "applies": true,
    175           "answer": true,
    176           "justification": "A static one-shot baseline using GLM-4v-Plus is included for all scenarios and compared directly to EVA in Tables 2 and 4.",
    177           "source": "haiku"
    178         },
    179         "baselines_contemporary": {
    180           "applies": true,
    181           "answer": false,
    182           "justification": "The only baseline is a trivially simple one-shot generator; contemporary adaptive attack methods cited in related work (AdvWeb, EIA, WASP, Zhan et al. 2025) are not compared against despite being directly relevant.",
    183           "source": "haiku"
    184         },
    185         "ablation_study": {
    186           "applies": true,
    187           "answer": true,
    188           "justification": "Appendix D provides a goal-prompt ablation comparing 'w/ Goal' vs 'w/o Goal' prompt variants across all six models in the pop-up scenario.",
    189           "source": "haiku"
    190         },
    191         "multiple_metrics": {
    192           "applies": true,
    193           "answer": false,
    194           "justification": "ASR (decomposed into success/failure/invalid percentages) is the only evaluation metric; no additional metrics such as task completion rate, detection evasion, or attack efficiency are reported.",
    195           "source": "haiku"
    196         },
    197         "human_evaluation": {
    198           "applies": false,
    199           "answer": false,
    200           "justification": "All evaluation is automated using LLM classifiers; human evaluation is not applicable to this automated attack benchmark study.",
    201           "source": "haiku"
    202         },
    203         "held_out_test_set": {
    204           "applies": false,
    205           "answer": false,
    206           "justification": "This is an adversarial attack study, not a prediction task; held-out test sets are not applicable.",
    207           "source": "haiku"
    208         },
    209         "per_category_breakdown": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "Results are broken down by scenario (Tables 2–3) and by model (Tables 2–4), providing granular per-category and per-agent breakdowns.",
    213           "source": "haiku"
    214         },
    215         "failure_cases_discussed": {
    216           "applies": true,
    217           "answer": true,
    218           "justification": "Table 3 reports 0% success on payment and email scenarios, and Section 4.2 discusses why these high-risk contexts are more resistant, including the over-50% invalid action rates.",
    219           "source": "haiku"
    220         },
    221         "negative_results_reported": {
    222           "applies": true,
    223           "answer": true,
    224           "justification": "Table 3 shows complete failure (0% ASR) on payment and email scenarios; Table 4 shows some negative EVA transfer gains (-2% to -4%) for certain source-target model pairs.",
    225           "source": "haiku"
    226         }
    227       },
    228       "setup_transparency": {
    229         "model_versions_specified": {
    230           "applies": true,
    231           "answer": false,
    232           "justification": "Models like 'GPT-4V' and 'GPT-4o' are referenced by technical report citations but no API snapshot dates or specific version identifiers are provided.",
    233           "source": "haiku"
    234         },
    235         "prompts_provided": {
    236           "applies": true,
    237           "answer": true,
    238           "justification": "Appendix B provides the actual prompt templates for button generation, reject button rewriting, popup generation, action summarization, action evaluation, and attack-type classification.",
    239           "source": "haiku"
    240         },
    241         "hyperparameters_reported": {
    242           "applies": true,
    243           "answer": true,
    244           "justification": "Table 6 reports temperature (0.7), top_p (1.0), top_k (32), max_tokens (512), max_iter_steps (10), num_evals (10), and success_threshold (7).",
    245           "source": "haiku"
    246         },
    247         "scaffolding_described": {
    248           "applies": true,
    249           "answer": true,
    250           "justification": "Section 3 describes the EVA framework in detail including keyword lexicon initialization, injection construction (Eq. 4), feedback update equations (Eq. 5), lexicon evolution, and termination criteria.",
    251           "source": "haiku"
    252         },
    253         "data_preprocessing_documented": {
    254           "applies": true,
    255           "answer": true,
    256           "justification": "The HTML injection generation process, template filling, and action classification pipeline are documented through prompt templates in Appendix B and the methodology in Section 3.",
    257           "source": "haiku"
    258         }
    259       },
    260       "data_integrity": {
    261         "raw_data_available": {
    262           "applies": true,
    263           "answer": false,
    264           "justification": "No trial-level raw data (individual attack outcomes per iteration) is made available; only aggregate percentages are reported.",
    265           "source": "haiku"
    266         },
    267         "data_collection_described": {
    268           "applies": true,
    269           "answer": true,
    270           "justification": "Section 4.1 describes the evaluation procedure: agents receive rendered screenshots, output actions, and are classified into success/failure/invalid categories using the LLM classifier prompt in Appendix B.5.",
    271           "source": "haiku"
    272         },
    273         "recruitment_methods_described": {
    274           "applies": false,
    275           "answer": false,
    276           "justification": "No human participants; recruitment methods are not applicable.",
    277           "source": "haiku"
    278         },
    279         "data_pipeline_documented": {
    280           "applies": true,
    281           "answer": true,
    282           "justification": "The full pipeline from HTML injection generation through agent evaluation to ASR calculation is documented across Section 3, Section 4.1, and Appendix B.",
    283           "source": "haiku"
    284         }
    285       },
    286       "contamination": {
    287         "training_cutoff_stated": {
    288           "applies": true,
    289           "answer": false,
    290           "justification": "No training data cutoffs are stated for any of the six evaluated models including GPT-4V, GPT-4o, and Qwen2.5-VL.",
    291           "source": "haiku"
    292         },
    293         "train_test_overlap_discussed": {
    294           "applies": true,
    295           "answer": false,
    296           "justification": "No discussion of whether the injection styles, prompt patterns, or GUI scenarios could overlap with model training data.",
    297           "source": "haiku"
    298         },
    299         "benchmark_contamination_addressed": {
    300           "applies": true,
    301           "answer": false,
    302           "justification": "The scenarios are novel but no discussion of whether similar adversarial injection patterns appeared in training corpora is provided.",
    303           "source": "haiku"
    304         }
    305       },
    306       "human_studies": {
    307         "pre_registered": {
    308           "applies": false,
    309           "answer": false,
    310           "justification": "No human participants.",
    311           "source": "haiku"
    312         },
    313         "irb_or_ethics_approval": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants.",
    317           "source": "haiku"
    318         },
    319         "demographics_reported": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human participants.",
    323           "source": "haiku"
    324         },
    325         "inclusion_exclusion_criteria": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants.",
    329           "source": "haiku"
    330         },
    331         "randomization_described": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human participants.",
    335           "source": "haiku"
    336         },
    337         "blinding_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants.",
    341           "source": "haiku"
    342         },
    343         "attrition_reported": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human participants.",
    347           "source": "haiku"
    348         }
    349       },
    350       "cost_and_practicality": {
    351         "inference_cost_reported": {
    352           "applies": true,
    353           "answer": false,
    354           "justification": "No API costs or inference latency are reported despite using multiple commercial API calls per optimization iteration across six models.",
    355           "source": "haiku"
    356         },
    357         "compute_budget_stated": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "No total computational budget (API calls, GPU hours, total cost) is stated anywhere in the paper.",
    361           "source": "haiku"
    362         }
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "EVA achieves substantially higher attack success rates than static baselines, e.g., +32% ASR improvement on GLM-4v-Plus in pop-up scenario",
    369       "evidence": "Table 2 shows EVA vs. baseline ASR across six models in pop-up and chat scenarios; pop-up gains range from +10% to +32%",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "EVA-evolved prompts transfer across GUI agent architectures with gains up to +46% over static baselines in cross-agent settings",
    374       "evidence": "Table 4 reports cross-agent pop-up transferability; Qwen2.5-VL→GPT-4V shows +46%, though some pairs show -2% to -4%",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "High-risk scenarios (payment, email) are inherently more resistant to indirect prompt injection, achieving 0% attack success rate across all six models",
    379       "evidence": "Table 3 shows 0% ASR for all six models in payment and email scenarios with high invalid/failure rates",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Injection styles reveal shared behavioral biases across GUI agents, with persuasive (49.8%) and urgency (40.0%) strategies dominating successful attacks",
    384       "evidence": "Table 5 shows consistent strategy distribution across scenarios and models; Figure 8 shows model-level heterogeneity within this pattern",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "Goal-conditioned adversarial prompts significantly outperform goal-agnostic ones, with ASR gains of +6% to +24% across models",
    389       "evidence": "Table 7 (Appendix D) shows w/ Goal vs w/o Goal comparison across six models in pop-up scenario",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "Spatial attention concentration — not semantic content alone — is the key mechanism distinguishing successful from unsuccessful injections",
    394       "evidence": "Figure 4 shows attention heatmap contrast between pop-up (concentrated) and chat (dispersed) injection; Section 5.2 discussion",
    395       "supported": "weak"
    396     }
    397   ],
    398   "methodology_tags": ["benchmark-eval", "case-study"],
    399   "key_findings": "EVA, a feedback-driven adaptive red-teaming framework, consistently outperforms static one-shot injection baselines in attacking GUI agents, with pop-up attack success rates improving 10–32 percentage points across six commercial and open-source models. Injection patterns evolved on one model generally transfer to others (gains of up to +46 points, though a few source–target pairs show small losses of 2–4 points), revealing shared perceptual biases in GUI agents' visual decision-making. Counterintuitively, high-risk scenarios (payment, email) showed complete resistance (0% ASR), likely because overtly malicious injections in these contexts are more detectable. Persuasive and urgency-based language dominate successful attacks across scenarios, but susceptibility varies heterogeneously across agent architectures, confirming that no single static injection style is universally effective.",
    400   "red_flags": [
    401     {
    402       "flag": "No contemporary adaptive baselines",
    403       "detail": "Despite citing AdvWeb, EIA, WASP, and Zhan et al. 2025 in related work as closely related adaptive methods, none are used as baselines; the sole comparison is against a trivially weak one-shot generator."
    404     },
    405     {
    406       "flag": "Code not actually released",
    407       "detail": "Contribution (ii) claims to 'build and release a reproducible evaluation pipeline' but no URL, repository link, or access method is provided anywhere in the paper."
    408     },
    409     {
    410       "flag": "No statistical rigor on comparative claims",
    411       "detail": "All comparative claims are based on point estimates from n=50 samples per cell with no confidence intervals, significance tests, or variance reporting across runs."
    412     },
    413     {
    414       "flag": "Attention mechanism claimed without mechanistic evidence",
    415       "detail": "Section 5.2 attributes success to 'concentrated attention' but EVA operates in black-box mode with no access to attention maps; the explanation relies on post-hoc visualization with no causal validation."
    416     },
    417     {
    418       "flag": "ASR classifier not validated",
    419       "detail": "Success/failure/invalid classification is performed by an LLM using a custom prompt (Appendix B.5); no inter-rater reliability, human spot-check, or classifier accuracy is reported."
    420     }
    421   ],
    422   "cited_papers": [
    423     {
    424       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    425       "relevance": "Foundational paper formalizing indirect prompt injection in LLM toolchains; direct conceptual precursor to EVA's threat model"
    426     },
    427     {
    428       "title": "Caution for the environment: Multimodal agents are susceptible to environmental distractions",
    429       "relevance": "Empirically demonstrates GUI agent vulnerability to environmental injection; key prior work EVA directly builds upon"
    430     },
    431     {
    432       "title": "Attacking vision-language computer agents via pop-ups",
    433       "relevance": "Directly related static pop-up attack work against GUI agents; static predecessor to EVA's adaptive approach"
    434     },
    435     {
    436       "title": "AdvWeb: Controllable black-box attacks on VLM-powered web agents",
    437       "relevance": "Related DOM-level injection attack framework; contemporary adaptive method cited but not used as baseline"
    438     },
    439     {
    440       "title": "Adaptive attacks break defenses against indirect prompt injection attacks on LLM agents",
    441       "relevance": "Contemporary adaptive injection attack work directly relevant to EVA's contribution; not compared against despite 2025 publication"
    442     },
    443     {
    444       "title": "WASP: Benchmarking web agent security against prompt injection attacks",
    445       "relevance": "Provides standardized testbeds for web agent security evaluation; cited as static benchmark EVA moves beyond"
    446     },
    447     {
    448       "title": "EIA: Environmental Injection Attack on Generalist Web Agents for Privacy Leakage",
    449       "relevance": "Defines the environmental injection attack class; key reference for the threat model and attack surface EVA extends"
    450     }
    451   ],
    452   "engagement_factors": {
    453     "practical_relevance": {
    454       "score": 3,
    455       "justification": "Demonstrates exploitable vulnerabilities in widely deployed commercial GUI agents (GPT-4o, GPT-4V) with concrete attack scenarios including phishing links and payment fraud."
    456     },
    457     "surprise_contrarian": {
    458       "score": 2,
    459       "justification": "The finding that high-stakes financial and email scenarios are MORE resistant to attack (0% ASR) is counterintuitive and challenges the assumption that higher-value targets are more vulnerable."
    460     },
    461     "fear_safety": {
    462       "score": 3,
    463       "justification": "Directly demonstrates that GUI agents executing real-world tasks (shopping, email, payments) can be hijacked via visual injection to perform phishing clicks and unintended financial actions."
    464     },
    465     "drama_conflict": {
    466       "score": 2,
    467       "justification": "Security arms race framing with major commercial models (OpenAI GPT-4V, GPT-4o) shown to be vulnerable creates an adversarial conflict narrative."
    468     },
    469     "demo_ability": {
    470       "score": 1,
    471       "justification": "The attack framework is described with prompt templates and figures, but code is not released despite the paper's release claim, limiting immediate reproducibility."
    472     },
    473     "brand_recognition": {
    474       "score": 2,
    475       "justification": "Tests GPT-4V and GPT-4o from OpenAI; the author team includes researchers from Shanghai Jiao Tong University, which has active visibility in multimodal agent security research."
    476     }
    477   },
    478   "hn_data": {
    479     "threads": [
    480       {"hn_id": "44475634", "title": "Techno-feudalism and the rise of AGI: A future without economic rights?", "points": 239, "comments": 244, "url": "https://news.ycombinator.com/item?id=44475634"},
    481       {"hn_id": "45338086", "title": "Tech Report: Winning CRS from Team Atlanta (DARPA AIxCC)", "points": 23, "comments": 0, "url": "https://news.ycombinator.com/item?id=45338086"},
    482       {"hn_id": "44694507", "title": "Market-Derived Financial Sentiment Analysis: Context-Aware Language Models", "points": 5, "comments": 0, "url": "https://news.ycombinator.com/item?id=44694507"},
    483       {"hn_id": "32381602", "title": "Cryptocurrency Giveaway Scam with YouTube Live Stream", "points": 3, "comments": 1, "url": "https://news.ycombinator.com/item?id=32381602"},
    484       {"hn_id": "45955957", "title": "Official LIGO-Virgo-Kagra Benchmark Shows KFR Outperforming FFTW in CERN Root", "points": 3, "comments": 0, "url": "https://news.ycombinator.com/item?id=45955957"},
    485       {"hn_id": "44318076", "title": "The Impact of Generative AI on Social Media: An Experimental Study", "points": 3, "comments": 0, "url": "https://news.ycombinator.com/item?id=44318076"},
    486       {"hn_id": "42839070", "title": "Analyzing and Exploiting Branch Mispredictions in Microcode [pdf]", "points": 3, "comments": 0, "url": "https://news.ycombinator.com/item?id=42839070"},
    487       {"hn_id": "44696287", "title": "Cryptocurrency Giveaway Scam with YouTube Live Stream (2022)", "points": 2, "comments": 0, "url": "https://news.ycombinator.com/item?id=44696287"},
    488       {"hn_id": "44689864", "title": "A Fact-Grounded Multimodal Writing Assistant Based on Offline Knowledge Base", "points": 2, "comments": 0, "url": "https://news.ycombinator.com/item?id=44689864"},
    489       {"hn_id": "43424107", "title": "Tapered Off-Policy Reinforce: Stable and Efficient RL for LLMs", "points": 2, "comments": 0, "url": "https://news.ycombinator.com/item?id=43424107"}
    490     ],
    491     "top_points": 239,
    492     "total_points": 285,
    493     "total_comments": 245
    494   }
    495 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs