ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27023B)


      1 {
      2   "paper": {
      3     "title": "AgentTypo: Adaptive Typographic Prompt Injection Attacks against Black-box Multimodal Agents",
      4     "authors": [
      5       "Yanjie Li",
      6       "Yiming Cao",
      7       "Dong Wang",
      8       "Bin Xiao"
      9     ],
     10     "year": 2025,
     11     "venue": "arXiv preprint (submitted to IEEE, manuscript received Oct 1, 2025)",
     12     "arxiv_id": "2510.04257"
     13   },
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No GitHub repository or archive URL is provided in the paper. The paper describes code artifacts (the ATPI algorithm, optuna-based TPE, RAG module) but provides no link or archive for reproduction."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The paper uses the publicly available VWA-Adv benchmark and VisualWebArena benchmark, which are referenced with links to their public repositories. The authors did not collect new proprietary data beyond these public benchmarks."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions 'eight NVIDIA RTX 3090 GPUs' and the 'optuna' library for TPE, but provides no requirements file, Dockerfile, or comprehensive list of library versions sufficient to recreate the environment."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No README, script, or step-by-step instructions are provided. The paper describes the algorithm abstractly (Algorithm 1) but does not provide executable reproduction instructions."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All results in Tables III and IV are reported as point estimates (e.g., '0.45', '0.68') with no confidence intervals, error bars, or uncertainty measures. Multiple trials per task are mentioned but only aggregate ASR is reported."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper makes comparative claims (e.g., 'AgentTypo significantly outperforms') but applies no statistical tests — no p-values, bootstrapped confidence intervals, or any other significance measure."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "Only raw ASR differences are reported (e.g., 23% to 45%). No standardized effect sizes (Cohen's d, etc.) or relative improvement normalized to a meaningful baseline are provided."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The VWA-Adv benchmark comprises 77 tasks. No justification is given for whether 77 tasks provide adequate statistical power to support the comparative claims made. No power analysis is discussed."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "The paper states 'each prompt is executed with three independent trials' but only aggregate ASR is reported. No standard deviation, variance, or spread across the three trials is provided for any result."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Four baselines are included: Raw Prompt Injection, InjecAgent, AdvAgent, and AgentAttack. These cover both text-based and image-based prior attack methods."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include AgentAttack (ICLR 2025), AdvAgent (ICML 2025), and InjecAgent (ACL 2024), which are contemporary to the 2025 submission. The baselines represent the current state of the art in multimodal agent attacks."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Section VI.E presents an ablation across four configurations: ATPI alone, Strategy Library alone, Strategy+RAG, and the full AgentTypo-pro (ATPI+Strategy+RAG), clearly isolating the contribution of each component."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Only Attack Success Rate (ASR) is reported as the evaluation metric. No secondary metrics such as query cost, number of iterations to success, stealthiness scores, or defense evasion rates are reported alongside ASR in the main results tables."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The paper mentions 'manual human review' to validate the LLM-based scorer's accuracy, but does not conduct human evaluation of actual attack outputs or agent behavior. The primary evaluation pipeline is automated (LLM-based scorer)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "Section VI.G tunes hyperparameters (lambda, step budget, number of retrieved examples) by measuring ASR on the same 77 VWA-Adv tasks (Figures 6, 7, 8). No separate dev/test split is mentioned. The reported numbers are on data used for hyperparameter selection decisions."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table III provides per-domain breakdowns (Classifieds, Shopping, Reddit) and per-model breakdowns (GPT-4V, GPT-4o, GPT-4o-mini, Gemini-1.5-Pro, Claude-3-Opus) for both image+text and image-only settings."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The limitations section (Section VIII) discusses the trade-off between stealth and effectiveness: 'achieving high attack success rates currently requires the embedded text to be relatively conspicuous.' Section VI.D also discusses why AgentAttack fails on complex tasks (0% ASR on email)."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The ablation (Table IV) shows that using Strategy Library alone without ATPI yields lower ASR than the full system, and Section VI.G shows ASR degrades when using too many retrieved examples (beyond 5), reporting negative effects of component choices."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The abstract claims 'raises the success rate from 23% to 45%' on GPT-4o image-only attacks, and 'achieves 68% ASR in image+text settings.' Table III confirms these numbers for GPT-4o. The abstract's main quantitative claims are backed by Table III results."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The paper makes causal claims such as 'Incorporating the Strategy Library shows a substantial improvement in ASR' and attributes performance gains to specific components via ablation. The ablation controls for single variables, which is methodologically appropriate for causal attribution, but the absence of statistical tests means the claimed causal effect sizes cannot be distinguished from noise over 77 tasks."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract and conclusion claim AgentTypo 'poses a practical and potent threat to multimodal agents' broadly, but experiments are only on three simulated websites (Classifieds, Shopping, Reddit) using VWA-Adv. The paper does not bound claims to this specific setting — Section VIII acknowledges limited benchmark diversity only briefly."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not consider alternative explanations for its results. For example, it does not discuss whether the observed improvement over AgentAttack is partly due to AgentAttack's known CLIP surrogate mismatch (which it states as fact rather than discussing as a confound), or whether the improvements might be task-specific. The limitations section addresses stealth vs. effectiveness but not alternative explanations for the observed gains."
    131       }
    132     },
    133     "setup_transparency": {
    134       "model_versions_specified": {
    135         "applies": true,
    136         "answer": false,
    137         "justification": "Models are cited as 'GPT-4V', 'GPT-4o', 'GPT-4o-mini', 'Gemini-1.5-Pro', and 'Claude-3-Opus' without snapshot dates or API version strings. Per the schema, marketing names without version identifiers do not count. The attacker/scorer/summarizer LLMs are described only as 'GPT-4' without version."
    138       },
    139       "prompts_provided": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Prompts for the Attacker LLM, Scorer LLM, and Summarizer LLM are described only in natural language (e.g., 'the Attacker LLM formulates a hijacking prompt according to a specified adversarial goal'). Table V shows example injection prompts but not the system prompts used for the LLM components. No full prompt text is provided."
    143       },
    144       "hyperparameters_reported": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Section VI.C reports key hyperparameters: stealthiness weight lambda=10.0, maximum 20 optimization steps, top-k=5 retrieval examples, success threshold=0.8. Table I lists the ATPI parameter ranges (font size, color, position, etc.)."
    148       },
    149       "scaffolding_described": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "AgentTypo-pro's multi-LLM scaffolding is described in detail in Section V: Attacker LLM, Scorer LLM, Summarizer LLM, and RAG module are each described with their roles, inputs, outputs, and interaction logic (Equations 7-10, Figure 4)."
    153       },
    154       "data_preprocessing_documented": {
    155         "applies": true,
    156         "answer": true,
    157         "justification": "Section VI.A describes the data setup: VWA-Adv's 77 tasks across three sites, and Section III.A describes how webpages are preprocessed (JavaScript annotation of interactable elements, SoM parsing, captioning). The data pipeline from webpage to agent input is documented."
    158       }
    159     },
    160     "limitations_and_scope": {
    161       "limitations_section_present": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Section VIII 'Limitations' is a dedicated section discussing two limitations: the stealth-effectiveness trade-off and the restricted benchmark scope (three websites)."
    165       },
    166       "threats_to_validity_specific": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The limitations mentioned ('achieving high attack success rates currently requires the embedded text to be relatively conspicuous' and 'limited availability of benchmarks') are partially specific but do not address core validity threats such as the small benchmark size (77 tasks), potential cherry-picking of the threshold (0.8), or the reliability of the LLM-based scorer itself."
    170       },
    171       "scope_boundaries_stated": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "The paper does not explicitly state what results do NOT show. Section VIII mentions the benchmark is limited to three websites, but does not make explicit statements like 'these results do not imply effectiveness against text-only agents' or enumerate claims they are not making."
    175       }
    176     },
    177     "data_integrity": {
    178       "raw_data_available": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Raw experimental results (the individual task outcomes, LLM scorer responses, agent action logs) are not released. Only aggregate ASR tables are published. Independent verification of the reported numbers is not possible."
    182       },
    183       "data_collection_described": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "The VWA-Adv benchmark is a well-described public dataset. The paper uses this existing benchmark without collecting new data. The benchmark's composition (77 tasks, three domains) is described in Section VI.A."
    187       },
    188       "recruitment_methods_described": {
    189         "applies": false,
    190         "answer": false,
    191         "justification": "No human participants are involved. The study uses automated benchmark evaluation with LLM-based agents; no participant recruitment is needed."
    192       },
    193       "data_pipeline_documented": {
    194         "applies": true,
    195         "answer": true,
    196         "justification": "The pipeline from benchmark task to attack execution to scoring is documented: task selection from VWA-Adv -> image modification via ATPI -> agent execution (3 trials per task) -> LLM-based scoring -> ASR calculation. The flow is clear in Sections VI.A-C."
    197       }
    198     },
    199     "conflicts_of_interest": {
    200       "funding_disclosed": {
    201         "applies": true,
    202         "answer": false,
    203         "justification": "There is no acknowledgments section and no mention of funding sources anywhere in the paper. The absence of any funding disclosure is a concern."
    204       },
    205       "affiliations_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "All authors are affiliated with the Computing Department, Hong Kong Polytechnic University. Affiliations are listed in the author footnote. No author is affiliated with any of the evaluated commercial model providers."
    209       },
    210       "funder_independent_of_outcome": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No funding is disclosed. The paper does not state it is unfunded, so we cannot confirm the 'NA if unfunded' exception applies. Absence of funding disclosure means independence cannot be verified."
    214       },
    215       "financial_interests_declared": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "There is no competing interests statement anywhere in the paper. Absence of disclosure is not the same as absence of conflict."
    219       }
    220     },
    221     "contamination": {
    222       "training_cutoff_stated": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "The paper uses GPT-4V, GPT-4o, GPT-4o-mini, Gemini-1.5-Pro, and Claude-3-Opus as target agents but states no training cutoff dates for any of these models. This matters for assessing whether models could have seen VWA-Adv tasks."
    226       },
    227       "train_test_overlap_discussed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper does not discuss whether any VWA-Adv benchmark tasks or VisualWebArena underlying data could have been in the training data of the evaluated LVLMs. This is particularly relevant since VisualWebArena was published in 2024."
    231       },
    232       "benchmark_contamination_addressed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "The VWA-Adv benchmark is based on VisualWebArena (2024). Several evaluated models (GPT-4o, Gemini-1.5-Pro, Claude-3-Opus) have training cutoffs that may include this benchmark. Contamination is not discussed."
    236       }
    237     },
    238     "human_studies": {
    239       "pre_registered": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants are involved. This is a purely automated benchmark evaluation study."
    243       },
    244       "irb_or_ethics_approval": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants are involved. IRB approval is not applicable."
    248       },
    249       "demographics_reported": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants are involved. Demographics are not applicable."
    253       },
    254       "inclusion_exclusion_criteria": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants are involved. Inclusion/exclusion criteria are not applicable."
    258       },
    259       "randomization_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants are involved. Randomization of participant assignment is not applicable."
    263       },
    264       "blinding_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants are involved. Blinding is not applicable."
    268       },
    269       "attrition_reported": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants are involved. Attrition is not applicable."
    273       }
    274     },
    275     "cost_and_practicality": {
    276       "inference_cost_reported": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "The paper calls GPT-4o, GPT-4V, and other commercial APIs extensively (3 trials per task × 77 tasks × multiple attack iterations up to 20 steps) but provides no cost estimates, token counts, or API expenditure figures."
    280       },
    281       "compute_budget_stated": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "Only 'eight NVIDIA RTX 3090 GPUs' is mentioned for ATPI optimization. Total wall-clock time, GPU hours, or API budget for the full experimental suite are not reported."
    285       }
    286     }
    287   },
    288   "claims": [
    289     {
    290       "claim": "AgentTypo-pro raises the image-only attack success rate against GPT-4o agents from 23% (AgentAttack) to 45%.",
    291       "evidence": "Table III, Section VI.D: GPT-4o image-only column shows AgentAttack ASR=0.23 average vs. AgentTypo-pro ASR=0.45 average across Classifieds/Shopping/Reddit.",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "In image+text settings, AgentTypo-pro achieves 68% ASR on GPT-4o agents, outperforming all baselines.",
    296       "evidence": "Table III, GPT-4o image+text column: AgentTypo-pro=0.68 vs. AdvAgent=0.60, InjecAgent=0.35, AgentAttack=0.26, RawInject=0.05.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "ATPI alone (without prompt content optimization) already outperforms AgentAttack on image-only attacks.",
    301       "evidence": "Table IV ablation: AgentTypo-base (ATPI) achieves 0.45 vs. AgentAttack 0.24 on GPT-4V in image+text setting. However, this is shown in the high-permission setting, not a direct image-only comparison.",
    302       "supported": "moderate"
    303     },
    304     {
    305       "claim": "The proposed defense (captioning-based detection) reduces GPT-4o ASR from 0.68 to 0.21.",
    306       "evidence": "Section VII states this result, but no detailed experimental setup (which images, what captioner threshold) is provided for the defense evaluation.",
    307       "supported": "weak"
    308     },
    309     {
    310       "claim": "AgentTypo achieves 65% ASR on 'wrong email' tasks where AgentAttack achieves 0%.",
    311       "evidence": "Section VI.D mentions these figures in the text but does not break out per-task results in a table; only per-domain aggregates are tabulated.",
    312       "supported": "moderate"
    313     }
    314   ],
    315   "methodology_tags": [
    316     "benchmark-eval",
    317     "case-study"
    318   ],
    319   "key_findings": "AgentTypo introduces typographic prompt injection attacks against multimodal LVLM agents, embedding adversarial text into webpage images via Bayesian optimization to maximize attack success while maintaining visual stealth. The full AgentTypo-pro system (combining ATPI, strategy learning, and RAG) achieves 45-75% attack success rates across five commercial models (GPT-4V, GPT-4o, GPT-4o-mini, Gemini-1.5-Pro, Claude-3-Opus) in image+text settings, substantially outperforming prior baselines including AgentAttack. The image-only variant is particularly novel, demonstrating that LVLM agents can be compromised through visual channels alone, with 31-45% ASR across models. A proposed captioning-based defense reduces ASR on GPT-4o from 68% to 21% but at significant computational cost.",
    320   "red_flags": [
    321     {
    322       "flag": "No statistical tests for comparative claims",
    323       "detail": "The paper claims AgentTypo 'significantly outperforms' baselines across all models based on comparing point estimates over only 77 tasks, with no significance tests, confidence intervals, or effect-size quantification. With 77 tasks and no uncertainty quantification, differences on the order of 5-10 percentage points may not be reliable."
    324     },
    325     {
    326       "flag": "LLM-based scorer reliability",
    327       "detail": "The primary evaluation metric (ASR) is computed by an LLM-based scorer, which is itself a proprietary model (GPT-4) with no version specified. The scorer's 'threshold of 0.8' is described as 'empirically determined.' Human validation is mentioned but only as a spot-check, not a systematic audit with inter-rater reliability."
    328     },
    329     {
    330       "flag": "Unspecified model versions",
    331       "detail": "All evaluated models (GPT-4V, GPT-4o, GPT-4o-mini, Gemini-1.5-Pro, Claude-3-Opus) and the attacker/scorer LLMs are identified by marketing names without API snapshot dates, making results unreproducible as model versions change."
    332     },
    333     {
    334       "flag": "No funding or competing interests disclosure",
    335       "detail": "The paper contains no acknowledgments section, no funding disclosure, and no competing interests statement. This omission is notable for a paper submitted through standard IEEE channels."
    336     },
    337     {
    338       "flag": "Overgeneralized threat claim",
    339       "detail": "The paper concludes that AgentTypo 'poses a practical and potent threat to multimodal agents' based on three simulated websites in a controlled benchmark setting. Real-world web agents face different webpage structures, security controls, and human oversight that are not captured in VWA-Adv."
    340     },
    341     {
    342       "flag": "Defense evaluation underspecified",
    343       "detail": "The captioning-based defense result (ASR reduction from 68% to 21%) is reported in Section VII with minimal experimental detail — the exact captioner version (Qwen2.5, with no specific version given), the detection threshold, the subset of tasks evaluated, and whether the attacker adapts to the defense are not stated."
    344     }
    345   ],
    346   "cited_papers": [
    347     {
    348       "title": "Dissecting adversarial robustness of multimodal LM agents",
    349       "authors": [
    350         "C. H. Wu",
    351         "R. R. Shah",
    352         "J. Y. Koh",
    353         "R. Salakhutdinov",
    354         "D. Fried",
    355         "A. Raghunathan"
    356       ],
    357       "year": 2025,
    358       "relevance": "Introduces VWA-Adv benchmark and AgentAttack (image-based adversarial attack against multimodal agents), which is the primary baseline and evaluation dataset used in AgentTypo."
    359     },
    360     {
    361       "title": "AdvAgent: Controllable blackbox red-teaming on web agents",
    362       "authors": [
    363         "C. Xu",
    364         "M. Kang",
    365         "J. Zhang",
    366         "Z. Liao",
    367         "L. Mo",
    368         "M. Yuan",
    369         "H. Sun",
    370         "B. Li"
    371       ],
    372       "year": 2025,
    373       "relevance": "Proposes AdvAgent, a fine-tuning-based text prompt injection attack on web agents; used as a key baseline for comparison with AgentTypo."
    374     },
    375     {
    376       "title": "AgentDojo: A dynamic environment to evaluate prompt injection attacks and defenses for LLM agents",
    377       "authors": [
    378         "E. Debenedetti",
    379         "J. Zhang",
    380         "M. Balunovic",
    381         "L. Beurer-Kellner",
    382         "M. Fischer",
    383         "F. Tramer"
    384       ],
    385       "year": 2024,
    386       "relevance": "Provides a benchmark environment for evaluating prompt injection attacks against LLM agents, directly relevant to the survey's focus on agentic AI security evaluation."
    387     },
    388     {
    389       "title": "VisualWebArena: Evaluating multimodal agents on realistic visual web tasks",
    390       "authors": [
    391         "J. Y. Koh",
    392         "R. Lo",
    393         "L. Jang",
    394         "V. Duvvur",
    395         "M. C. Lim",
    396         "P.-Y. Huang",
    397         "G. Neubig",
    398         "S. Zhou",
    399         "R. Salakhutdinov",
    400         "D. Fried"
    401       ],
    402       "year": 2024,
    403       "relevance": "The base benchmark on which VWA-Adv is built; directly evaluates multimodal agent performance on realistic web tasks, core to the survey scope."
    404     },
    405     {
    406       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    407       "authors": [
    408         "K. Greshake",
    409         "S. Abdelnabi",
    410         "S. Mishra",
    411         "C. Endres",
    412         "T. Holz",
    413         "M. Fritz"
    414       ],
    415       "year": 2023,
    416       "relevance": "Proposes the first indirect prompt injection attack against LLM applications; foundational reference for prompt injection threat models in agentic systems."
    417     },
    418     {
    419       "title": "EIA: Environmental injection attack on generalist web agents for privacy leakage",
    420       "authors": [
    421         "Z. Liao",
    422         "L. Mo",
    423         "C. Xu",
    424         "M. Kang",
    425         "J. Zhang",
    426         "C. Xiao",
    427         "Y. Tian",
    428         "B. Li",
    429         "H. Sun"
    430       ],
    431       "year": 2025,
    432       "relevance": "Another attack on generalist web agents via HTML injection targeting privacy leakage; part of the growing literature on agentic AI security that this survey covers."
    433     },
    434     {
    435       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    436       "authors": [
    437         "Q. Zhan",
    438         "Z. Liang",
    439         "Z. Ying",
    440         "D. Kang"
    441       ],
    442       "year": 2024,
    443       "relevance": "Provides a benchmark specifically for indirect prompt injection in tool-integrated agents; directly relevant to methodology evaluation of agentic AI security research."
    444     },
    445     {
    446       "title": "UDora: A unified red teaming framework against LLM agents by dynamically hijacking their own reasoning",
    447       "authors": [
    448         "J. Zhang",
    449         "S. Yang",
    450         "B. Li"
    451       ],
    452       "year": 2025,
    453       "relevance": "Another red-teaming approach against LLM agents that hijacks reasoning; part of the emerging body of work on adversarial evaluation of agentic AI systems."
    454     },
    455     {
    456       "title": "Tree of attacks: Jailbreaking black-box LLMs automatically",
    457       "authors": [
    458         "A. Mehrotra",
    459         "M. Zampetakis",
    460         "P. Kassianik",
    461         "B. Nelson",
    462         "H. Anderson",
    463         "Y. Singer",
    464         "A. Karbasi"
    465       ],
    466       "year": 2024,
    467       "relevance": "Adaptive jailbreak method that AgentTypo-pro's strategy learning is inspired by; relevant to automated red-teaming methodology in agentic AI evaluation."
    468     },
    469     {
    470       "title": "AutoDAN-turbo: A lifelong agent for strategy self-exploration to jailbreak LLMs",
    471       "authors": [
    472         "X. Liu",
    473         "P. Li",
    474         "G. E. Suh",
    475         "Y. Vorobeychik",
    476         "Z. Mao",
    477         "S. Jha",
    478         "P. McDaniel",
    479         "H. Sun",
    480         "B. Li",
    481         "C. Xiao"
    482       ],
    483       "year": 2025,
    484       "relevance": "Combines strategy learning with RAG for LLM jailbreaking; directly informs AgentTypo-pro's design and relevant to automated evaluation methodology in the agentic AI space."
    485     }
    486   ]
    487 }

Impressum · Datenschutz