ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (29369B)


      1 {
      2   "paper": {
      3     "title": "BrowseSafe: Understanding and Preventing Prompt Injection Within AI Browser Agents",
      4     "authors": ["Kaiyuan Zhang", "Mark Tenenholtz", "Kyle Polley", "Jerry Ma", "Denis Yarats", "Ninghui Li"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2511.20597",
      8     "doi": "10.48550/arXiv.2511.20597"
      9   },
     10   "checklist": {
     11     "artifacts": {
     12       "code_released": {
     13         "applies": true,
     14         "answer": false,
     15         "justification": "The paper provides a HuggingFace model URL (https://huggingface.co/perplexity-ai/browsesafe) and dataset URL (https://huggingface.co/datasets/perplexity-ai/browsesafe-bench), but no source code repository is provided for the benchmark construction pipeline, evaluation scripts, or training code."
     16       },
     17       "data_released": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "The paper provides a HuggingFace dataset link for BrowseSafe-Bench (https://huggingface.co/datasets/perplexity-ai/browsesafe-bench) and describes the dataset as comprising 14,719 samples partitioned into training (11,039) and test (3,680) sets."
     21       },
     22       "environment_specified": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper mentions using Qwen3-30B-A3B-Instruct-2507 as the base model and states training hyperparameters (learning rate 1e-5, weight decay 0.1, one epoch) but does not specify software dependencies or environment details."
     26       },
     27       "reproduction_instructions": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments are provided. The evaluation protocol is described at a high level (full HTML content, 80k token max) but lacks specific commands or procedures for reproducing results."
     31       }
     32     },
     33     "statistical_methodology": {
     34       "confidence_intervals_or_error_bars": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "All results in Table 2 and throughout the paper are reported as point estimates (e.g., F1=0.904, Precision=0.978) without confidence intervals, error bars, or uncertainty quantification."
     38       },
     39       "significance_tests": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper makes numerous comparative claims (e.g., BrowseSafe achieves state-of-the-art, fine-tuning yields performance advantage) but no statistical significance tests are reported. Comparisons are based solely on numerical differences in metrics."
     43       },
     44       "effect_sizes_reported": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "The paper reports absolute metric values with baselines, enabling effect size assessment. For example, BrowseSafe F1=0.904 vs. Sonnet 4.5 (32K) F1=0.863, with precision and recall breakdowns (Section 5.3). The generalization ablation (Table 3) shows F1 drops from 0.905 baseline to 0.788 for held-out injection strategies."
     48       },
     49       "sample_size_justified": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The benchmark comprises 14,719 samples (11,039 train, 3,680 test) but no justification is given for these sizes. There is no power analysis or discussion of whether the test set is large enough for the granular per-category breakdowns (some injection strategy categories have as few as n=2, e.g., table cell rewrite)."
     53       },
     54       "variance_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run evaluations. The fine-tuned BrowseSafe model results are reported without any indication of training variance across seeds."
     58       }
     59     },
     60     "evaluation_design": {
     61       "baselines_included": {
     62         "applies": true,
     63         "answer": true,
     64         "justification": "The paper compares BrowseSafe against 23 frontier models including PromptGuard-2 (22M, 86M), gpt-oss-safeguard (20B, 120B), GPT-5, GPT-5 Mini, Haiku 4.5, and Sonnet 4.5 with various configurations (Table 2)."
     65       },
     66       "baselines_contemporary": {
     67         "applies": true,
     68         "answer": true,
     69         "justification": "The baselines include the most recent frontier models available at the time of writing: GPT-5, Sonnet 4.5, Haiku 4.5, PromptGuard-2, and gpt-oss-safeguard. These are contemporary state-of-the-art models."
     70       },
     71       "ablation_study": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Table 3 presents a generalization ablation study examining BrowseSafe performance under held-out URLs, held-out attack types, and held-out injection strategies. The paper also examines the impact of reasoning settings and context window sizes on model performance (Section 5.4)."
     75       },
     76       "multiple_metrics": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "The paper uses five evaluation metrics: F1 Score, Precision, Recall, Balanced Accuracy, and Refusals (Section 5.1). All are reported in Table 2."
     80       },
     81       "human_evaluation": {
     82         "applies": true,
     83         "answer": false,
     84         "justification": "No human evaluation is included. The paper evaluates prompt injection detection using automated binary classification metrics only. Human evaluation of detection quality, false positive cases, or attack realism could strengthen the claims."
     85       },
     86       "held_out_test_set": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "The benchmark is explicitly partitioned into a training set of 11,039 samples and a test set of 3,680 samples (Section 3). All evaluation results in Table 2 are reported on the test set."
     90       },
     91       "per_category_breakdown": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper provides extensive per-category breakdowns: detection accuracy by attack type (Figure 7), by injection strategy (Figure 8), by linguistic style (Figure 9), by distractor count (Figure 10), and full heatmaps of balanced accuracy across all models and categories (Figures 12, 13)."
     95       },
     96       "failure_cases_discussed": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "The paper discusses failure modes including: multilanguage attacks being hardest to detect (76.0% balanced accuracy, Figure 7), visible content rewriting strategies being challenging (footer rewrite 69.1%, Figure 8), stealth linguistic style evading detection (75.3%, Figure 9), and distractor elements causing accuracy drops (Section 3.6)."
    100       },
    101       "negative_results_reported": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "The paper reports several negative findings: PromptGuard-2 performs poorly (F1=0.35-0.36), held-out injection strategies cause significant degradation (F1 drops to 0.788 from 0.905 in Table 3), and Sonnet 4.5 has high refusal rates (419-669 refusals). The paper also notes that LLMs 'quickly overfit and generalize poorly to realistic attacks' without hard negatives (Section 4.5.1)."
    105       }
    106     },
    107     "claims_and_evidence": {
    108       "abstract_claims_supported": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The abstract claims the benchmark is comprehensive (supported by 14,719 samples across multiple dimensions in Section 3), that frontier models remain vulnerable (supported by Table 2 showing no off-the-shelf model exceeding 0.863 F1), and that BrowseSafe achieves state-of-the-art performance (supported by Table 2 showing BrowseSafe at 0.904 F1 vs. best baseline 0.863)."
    112       },
    113       "causal_claims_justified": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "The paper's causal claims are modest and generally supported. The ablation study (Table 3) uses controlled single-variable manipulation (holding out URLs, attack types, or injection strategies) to demonstrate what factors affect generalization. Claims about fine-tuning benefits are supported by comparing the same base model with and without fine-tuning."
    117       },
    118       "generalization_bounded": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The paper explicitly bounds its scope in the threat model (Section 4.2): 'we do not handle or benchmark attacks based on non-textual inputs, such as malicious images or other inputs specialized for vision-based models.' The generalization analysis (Section 5.6) explicitly tests and reports limitations for unseen injection strategies. The paper also notes small sample sizes for some categories (footer rewrite n=15, table cell rewrite n=2)."
    122       },
    123       "alternative_explanations_discussed": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper discusses alternative explanations: that detection models may rely on 'shallow heuristics' rather than understanding (Section 3.4), that high accuracy on hidden attacks could reflect 'spurious correlations' (Section 3.6), and that the standard test set 'may represent a challenging sample of websites' explaining why held-out URL performance was higher (Section 5.6)."
    127       }
    128     },
    129     "setup_transparency": {
    130       "model_versions_specified": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "The paper refers to models by marketing names without specific version IDs: 'GPT-5', 'GPT-5 Mini', 'Haiku 4.5', 'Sonnet 4.5'. No API version, snapshot date, or model checkpoint identifier is provided for any closed-weight model. The fine-tuned model uses 'Qwen3-30B-A3B-Instruct-2507' which does include a version identifier."
    134       },
    135       "prompts_provided": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "The paper provides the safeguard policy prompt used for gpt-oss-safeguard models (Appendix A.4) and the instruction prompt used for Anthropic/OpenAI models (Appendix A.5), both with sufficient detail to reconstruct the evaluation setup."
    139       },
    140       "hyperparameters_reported": {
    141         "applies": true,
    142         "answer": false,
    143         "justification": "For fine-tuning, the paper reports learning rate (1e-5), weight decay (0.1), and one epoch (Section 5.1). However, critical inference-time hyperparameters like temperature and top-p are not reported for any of the 23 models evaluated. Different reasoning settings (Low, Medium, High) are tested but the specific parameter values behind these labels are not disclosed."
    144       },
    145       "scaffolding_described": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "The BrowseSafe defense architecture is described in detail across Sections 4.3-4.7, including trust boundary enforcement, content preprocessing, chunking strategy, conservative OR aggregation, and context-engineered intervention. The multi-layered design is clearly documented."
    149       },
    150       "data_preprocessing_documented": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Section 3.2 documents the benchmark construction pipeline in four stages: (1) content extraction and anonymization from real websites, (2) HTML template wrapping with eight styles, (3) distractor element insertion, and (4) attack injection. Section 4.4 describes the raw content extraction preprocessing for the defense system."
    154       }
    155     },
    156     "limitations_and_scope": {
    157       "limitations_section_present": {
    158         "applies": true,
    159         "answer": true,
    160         "justification": "Section 4.2 (Threat Model / BrowseSafe's Defense Scope) contains a dedicated discussion of limitations: 'BrowseSafe has limitations and in this work we do not handle or benchmark attacks based on non-textual inputs, such as malicious images or other inputs specialized for vision-based models.' Additional limitations are noted in Section 4.5.4 about boundary case handling."
    161       },
    162       "threats_to_validity_specific": {
    163         "applies": true,
    164         "answer": false,
    165         "justification": "There is no dedicated threats to validity section. While the paper notes specific scope limitations (no image attacks, no vision-based models), it does not discuss threats specific to the study design, such as: whether the LLM-generated attacks in the benchmark represent real-world attack distributions, whether the production data used for HTML scaffolds introduces bias, or whether the single-run evaluation design could produce unreliable results."
    166       },
    167       "scope_boundaries_stated": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "The paper explicitly states what it does NOT cover: non-textual/image-based attacks (Section 4.2), tool call scanning (Section 4.5.5, noted as 'future work'), global content analysis across chunks (Section 4.6), and the scope is bounded to text-based prompt injection in browser agents specifically."
    171       }
    172     },
    173     "data_integrity": {
    174       "raw_data_available": {
    175         "applies": true,
    176         "answer": true,
    177         "justification": "The BrowseSafe-Bench dataset is released on HuggingFace (https://huggingface.co/datasets/perplexity-ai/browsesafe-bench), enabling independent verification of the benchmark samples and evaluation results."
    178       },
    179       "data_collection_described": {
    180         "applies": true,
    181         "answer": true,
    182         "justification": "Section 3.2 describes the data collection process: starting from 'usage data from a production browser agent with millions of active users,' filtering to 100,000 anonymized tool call outputs satisfying quality and length criteria, then using these to derive HTML scaffolds and identify variation axes."
    183       },
    184       "recruitment_methods_described": {
    185         "applies": false,
    186         "answer": false,
    187         "justification": "No human participants were recruited for this study. The benchmark is constructed from production data and LLM-generated attacks, not from human subjects."
    188       },
    189       "data_pipeline_documented": {
    190         "applies": true,
    191         "answer": true,
    192         "justification": "The full data pipeline is documented in Section 3.2 with four explicit stages (content extraction/anonymization, HTML wrapping, distractor insertion, attack injection). The paper states the final dataset size (14,719 samples) and the train/test split (11,039/3,680). Sections 3.3-3.7 detail each component of the pipeline."
    193       }
    194     },
    195     "conflicts_of_interest": {
    196       "funding_disclosed": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "The Acknowledgements section discloses NSF funding: 'Work by Kaiyuan Zhang and Ninghui Li was supported by the U.S. National Science Foundation AI Institute for Agent-based Cyber Threat Intelligence and Operation (ACTION), with NSF grant number 2229876.'"
    200       },
    201       "affiliations_disclosed": {
    202         "applies": true,
    203         "answer": true,
    204         "justification": "Author affiliations are clearly listed: Purdue University (Kaiyuan Zhang, Ninghui Li) and Perplexity AI (Mark Tenenholtz, Kyle Polley, Jerry Ma, Denis Yarats). This is relevant since the paper evaluates browser agent security and Perplexity operates a production browser agent (Comet, referenced in Section 1)."
    205       },
    206       "funder_independent_of_outcome": {
    207         "applies": true,
    208         "answer": false,
    209         "justification": "Four of six authors are affiliated with Perplexity AI, which operates Perplexity Comet, a production browser agent referenced in the paper. Perplexity has a direct financial interest in demonstrating that their BrowseSafe defense works well for browser agent security. The NSF funding for the Purdue authors appears independent, but the primary research effort comes from Perplexity employees."
    210       },
    211       "financial_interests_declared": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No competing interests or financial interests statement is provided. The Perplexity AI affiliation is disclosed, but the paper does not include an explicit declaration of whether authors hold equity or other financial interests related to the findings, which is relevant given that Perplexity is a startup evaluating its own defense technology."
    215       }
    216     },
    217     "contamination": {
    218       "training_cutoff_stated": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "The paper evaluates multiple pre-trained models (GPT-5, Sonnet 4.5, Haiku 4.5, etc.) on the BrowseSafe-Bench benchmark but does not state training data cutoff dates for any of these models. This is relevant because the benchmark uses content derived from production data that these models may have been trained on."
    222       },
    223       "train_test_overlap_discussed": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "No discussion of whether the HTML content used in the benchmark overlaps with training data of the evaluated frontier models. The benchmark is constructed from anonymized production data, but there is no analysis of whether similar content appeared in the training sets of GPT-5, Sonnet 4.5, etc."
    227       },
    228       "benchmark_contamination_addressed": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "BrowseSafe-Bench is a new benchmark, which reduces contamination risk. However, the paper does not discuss whether the attack patterns and HTML structures in the benchmark resemble training data used to build the safety capabilities of the models being evaluated. Since system cards for GPT-5 and Claude models reference internal prompt injection benchmarks, there could be indirect overlap."
    232       }
    233     },
    234     "human_studies": {
    235       "pre_registered": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "No human participants in this study. The evaluation is entirely automated using benchmark classification metrics."
    239       },
    240       "irb_or_ethics_approval": {
    241         "applies": false,
    242         "answer": false,
    243         "justification": "No human participants in this study."
    244       },
    245       "demographics_reported": {
    246         "applies": false,
    247         "answer": false,
    248         "justification": "No human participants in this study."
    249       },
    250       "inclusion_exclusion_criteria": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "randomization_described": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "blinding_described": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "attrition_reported": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       }
    270     },
    271     "cost_and_practicality": {
    272       "inference_cost_reported": {
    273         "applies": true,
    274         "answer": true,
    275         "justification": "Figure 11 reports median inference latency (P50) for all evaluated models. BrowseSafe achieves <1 second, GPT-5 family ~2 seconds, Sonnet 4.5 family 23-36 seconds, PromptGuard-2 0.19s on CPU. The paper explicitly analyzes the performance-latency tradeoff (Section 5.5)."
    276       },
    277       "compute_budget_stated": {
    278         "applies": true,
    279         "answer": false,
    280         "justification": "No total computational budget is stated. The paper does not report GPU hours for fine-tuning, total API costs for evaluating 23 models across 3,680 test samples, hardware used for training, or total wall-clock time for the experiments."
    281       }
    282     }
    283   },
    284   "claims": [
    285     {
    286       "claim": "BrowseSafe achieves state-of-the-art F1 score of 0.904 on BrowseSafe-Bench, outperforming all 23 evaluated frontier models.",
    287       "evidence": "Table 2 shows BrowseSafe at F1=0.904, Precision=0.978, Recall=0.841, Balanced Accuracy=0.912. The best baseline is Sonnet 4.5 (32K) at F1=0.863.",
    288       "supported": "moderate"
    289     },
    290     {
    291       "claim": "Even the most capable frontier AI models remain vulnerable to complex and realistic prompt injection payloads in BrowseSafe-Bench.",
    292       "evidence": "Table 2 shows that no off-the-shelf model exceeds F1=0.863 on the benchmark. Figure 2 shows F1 scores ranging from 35% to 90.4% across all models.",
    293       "supported": "strong"
    294     },
    295     {
    296       "claim": "Current detection models rely on shallow linguistic heuristics rather than robust semantic comprehension, as evidenced by degraded performance on stealth and indirect attacks.",
    297       "evidence": "Figure 9 shows explicit attacks at 84.6% balanced accuracy vs. indirect at 78.1% and stealth at 75.3%. Figure 7 shows multilanguage attacks at 76.0% (lowest) vs. system prompt exfiltration at 85.0% (highest).",
    298       "supported": "strong"
    299     },
    300     {
    301       "claim": "Fine-tuning a specialized detection model on BrowseSafe-Bench yields a distinct performance advantage over general-purpose models.",
    302       "evidence": "Section 5.3 and Table 2: BrowseSafe (fine-tuned Qwen3-30B-A3B) achieves F1=0.904 vs. best general-purpose Sonnet 4.5 (32K) at F1=0.863. Precision improves to 0.978 vs 0.935.",
    303       "supported": "moderate"
    304     },
    305     {
    306       "claim": "Distractor elements cause a precipitous drop in detection accuracy for existing models.",
    307       "evidence": "Figure 10 shows average balanced accuracy drops from 90.5% with zero distractors to 81.3% with three distractors, then stabilizes in the 79.6%-83.0% range for higher distractor counts.",
    308       "supported": "strong"
    309     },
    310     {
    311       "claim": "BrowseSafe achieves sub-1-second latency while maintaining the highest F1 score.",
    312       "evidence": "Figure 11 plots F1 vs. P50 latency, showing BrowseSafe in the top-left quadrant with F1>90% and latency <1 second, compared to Sonnet 4.5 at 23-36 seconds.",
    313       "supported": "moderate"
    314     },
    315     {
    316       "claim": "Unseen injection strategies present the greatest generalization challenge for the fine-tuned model.",
    317       "evidence": "Table 3: Held-out injection strategies F1=0.788 vs. baseline 0.905, while held-out attack types only dropped to 0.863 and held-out URLs improved to 0.935.",
    318       "supported": "strong"
    319     }
    320   ],
    321   "methodology_tags": ["benchmark-eval"],
    322   "key_findings": "BrowseSafe-Bench is a 14,719-sample benchmark for evaluating prompt injection detection in browser agents, featuring 11 attack types, 9 injection strategies, and realistic HTML with distractor elements. Evaluation of 23 frontier models shows that even the best off-the-shelf models achieve at most F1=0.863, with particular weakness against stealth linguistic styles, visible content rewrites, and multilanguage attacks. A fine-tuned Qwen3-30B-A3B model (BrowseSafe) achieves F1=0.904 with sub-1-second latency, outperforming all general-purpose models on precision and balanced accuracy while maintaining practical deployment speed.",
    323   "red_flags": [
    324     {
    325       "flag": "Company evaluating its own defense product",
    326       "detail": "Four of six authors are Perplexity AI employees, and the paper evaluates BrowseSafe, a defense mechanism designed for Perplexity's production browser agent (Comet). The benchmark and defense were developed by the same team, creating a potential self-evaluation bias. No external or independent evaluation is included."
    327     },
    328     {
    329       "flag": "No statistical tests or variance reporting",
    330       "detail": "All results are single-run point estimates without confidence intervals, error bars, significance tests, or multi-seed variance. The claim that BrowseSafe outperforms baselines (F1=0.904 vs 0.863) is not accompanied by any statistical test to determine if this difference is significant."
    331     },
    332     {
    333       "flag": "No model version IDs for closed-weight models",
    334       "detail": "The evaluated models (GPT-5, Sonnet 4.5, Haiku 4.5) are identified only by marketing names without API version strings or snapshot dates. Since model behavior changes across versions, exact reproduction is impossible."
    335     },
    336     {
    337       "flag": "Small sample sizes in some benchmark categories",
    338       "detail": "Some injection strategy categories have very few samples: table cell rewrite (n=2), footer rewrite (n=15), list item rewrite (n=9). Per-category balanced accuracy results for these categories are likely unreliable, as the paper itself acknowledges for footer and table cell rewrite."
    339     },
    340     {
    341       "flag": "Benchmark constructed by same team as defense",
    342       "detail": "BrowseSafe-Bench was designed and constructed by the same team that built the BrowseSafe defense. The defense model was trained on the benchmark's training split. While a held-out test set is used, there is a risk that benchmark design decisions inadvertently favor the defense approach."
    343     }
    344   ],
    345   "cited_papers": [
    346     {
    347       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    348       "authors": ["Q. Zhan", "Z. Liang", "Z. Ying", "D. Kang"],
    349       "year": 2024,
    350       "arxiv_id": "2403.02691",
    351       "relevance": "Prompt injection benchmark for tool-integrated LLM agents, directly compared in BrowseSafe-Bench evaluation."
    352     },
    353     {
    354       "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents",
    355       "authors": ["E. Debenedetti", "J. Zhang", "M. Balunovic", "L. Beurer-Kellner", "M. Fischer", "F. Tramer"],
    356       "year": 2024,
    357       "relevance": "Dynamic benchmark for prompt injection attack/defense evaluation in LLM agents, one of the key prior benchmarks compared against."
    358     },
    359     {
    360       "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-based Agents",
    361       "authors": ["H. Zhang", "J. Huang", "K. Mei"],
    362       "year": 2024,
    363       "arxiv_id": "2410.02644",
    364       "relevance": "Benchmark for agent security covering prompt injection, memory poisoning, and backdoor attacks in LLM-based agents."
    365     },
    366     {
    367       "title": "WASP: Benchmarking Web Agent Security Against Prompt Injection Attacks",
    368       "authors": ["I. Evtimov", "A. Zharmagambetov", "A. Grattafiori", "C. Guo", "K. Chaudhuri"],
    369       "year": 2025,
    370       "arxiv_id": "2504.18575",
    371       "relevance": "Web agent security benchmark built on VisualWebArena, directly compared in the paper's benchmark comparison table."
    372     },
    373     {
    374       "title": "Defeating Prompt Injections by Design",
    375       "authors": ["E. Debenedetti", "I. Shumailov", "T. Fan", "J. Hayes", "N. Carlini"],
    376       "year": 2025,
    377       "arxiv_id": "2503.18813",
    378       "relevance": "Proposes CaMeL, a protective system layer for LLM security against prompt injection, representing an architectural defense approach."
    379     },
    380     {
    381       "title": "Securing AI Agents with Information-Flow Control",
    382       "authors": ["M. Costa", "B. Kopf", "A. Kolluri", "A. Paverd"],
    383       "year": 2025,
    384       "arxiv_id": "2505.23643",
    385       "relevance": "Explores formal security guarantees for AI agents using information-flow control (FIDES system), a defense approach complementary to BrowseSafe."
    386     },
    387     {
    388       "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses",
    389       "authors": ["Y. Liu", "Y. Jia", "R. Geng", "J. Jia", "N. Z. Gong"],
    390       "year": 2024,
    391       "relevance": "Provides formal treatment of prompt injection attacks and defenses, published at USENIX Security 2024."
    392     },
    393     {
    394       "title": "LLM Agents Should Employ Security Principles",
    395       "authors": ["K. Zhang", "Z. Su", "P.-Y. Chen", "E. Bertino", "X. Zhang", "N. Li"],
    396       "year": 2025,
    397       "arxiv_id": "2505.24019",
    398       "relevance": "Proposes AgentSandbox enforcing security principles (defense-in-depth, least privilege) for agentic systems, by the same first author."
    399     },
    400     {
    401       "title": "StruQ: Defending against prompt injection with structured queries",
    402       "authors": ["S. Chen", "J. Piet", "C. Sitawarin", "D. Wagner"],
    403       "year": 2025,
    404       "relevance": "Defense approach separating prompts and data into two channels, published at USENIX Security 2025."
    405     },
    406     {
    407       "title": "The attacker moves second: Stronger adaptive attacks bypass defenses against LLM jailbreaks and prompt injections",
    408       "authors": ["M. Nasr", "N. Carlini", "C. Sitawarin"],
    409       "year": 2025,
    410       "arxiv_id": "2510.09023",
    411       "relevance": "Demonstrates that adaptive attacks can bypass prompt injection defenses, directly relevant to evaluating defense robustness."
    412     },
    413     {
    414       "title": "WAInjectBench: Benchmarking Prompt Injection Detections for Web Agents",
    415       "authors": ["Y. Liu", "R. Xu", "X. Wang", "Y. Jia", "N. Z. Gong"],
    416       "year": 2025,
    417       "arxiv_id": "2510.01354",
    418       "relevance": "Prompt injection detection benchmark for web agents with malicious/benign samples, one of the key prior benchmarks compared against."
    419     },
    420     {
    421       "title": "EIA: Environmental Injection Attack on Generalist Web Agents for Privacy Leakage",
    422       "authors": ["Z. Liao", "L. Mo", "C. Xu"],
    423       "year": 2024,
    424       "arxiv_id": "2409.11295",
    425       "relevance": "Demonstrates environmental injection attacks for privacy leakage against web agents, a key attack type addressed by BrowseSafe."
    426     }
    427   ]
    428 }

Impressum · Datenschutz