ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31251B)


      1 {
      2   "paper": {
      3     "title": "'Do as I say not as I do': A Semi-Automated Approach for Jailbreak Prompt Attack against Multimodal LLMs",
      4     "authors": [
      5       "Chun Wai Chiu",
      6       "Linghan Huang",
      7       "Bo Li",
      8       "Huaming Chen",
      9       "Kim-Kwang Raymond Choo"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv",
     13     "arxiv_id": "2502.00735",
     14     "doi": "10.48550/arXiv.2502.00735"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "The paper introduces Flanking Attack, a voice-based jailbreak method that embeds adversarial audio prompts between benign queries to bypass multimodal LLM safety filters. Tested against Gemini 1.5 Flash across 7 forbidden scenarios (2,100 prompts), the full attack achieves 0.81 average ASR, compared to 0.57 without the flanking technique and 0.12 with plot-only prompts. Ablation shows that both the text prompt framing and the flanking technique are critical components, with flanking contributing approximately 24 percentage points to ASR.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No code repository, notebook, or archive URL is provided despite describing a Google Colab-based pipeline. The paper claims to establish 'a replicable testing framework' but releases no code."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The forbidden question set is partially shown in Table III (21 questions across 7 scenarios), but the full 2,100 prompts, audio files (MP3), and model responses are not released."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": true,
     35         "justification": "Section VI.A specifies: Google Colab (Python 3.10, Ubuntu 22.04), 2 vCPUs, 12 GB RAM, no GPU, google-generativeai 0.4.1, python-docx 1.0. Sufficient detail to recreate the environment."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The experimental setup (Section VI.A) describes the pipeline conceptually but provides no step-by-step reproduction instructions, scripts, or notebook. A researcher would need to reverse-engineer the methodology from the paper text."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Table I reports only point estimates for ASR (e.g., 0.93, 0.80). No confidence intervals, error bars, or uncertainty measures are provided for any result."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims differences between configurations (e.g., Config 1 at 0.81 vs Config 2 at 0.57) without any statistical significance tests. All comparisons are based on raw number differences."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": false,
     57         "justification": "No formal effect sizes (Cohen's d, odds ratios, etc.) are reported. The paper shows ASR values for each configuration but does not compute or report standardized effect sizes."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "The paper uses 50 requests per run and mentions 2,100 total prompts, but provides no justification for these sample sizes. No power analysis or rationale for why 50 per run is sufficient."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Table I states 'averages across multiple runs' but reports no standard deviation, variance, or spread measures. The reader cannot assess result stability across runs."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Table I presents 4 configurations as progressive ablations: full method (Config 1), without Flanking Attack (Config 2), Setting+Character+Plot only (Config 3), and Plot only (Config 4). These serve as baselines against the full method."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "No comparison with other published jailbreak attack methods from the literature (e.g., Voice Jailbreak [8], Sandwich Attack [9]). All baselines are ablated variants of the authors' own method."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Table I and Section VI.E present a systematic ablation across 4 configurations, progressively removing components (Flanking Attack, Text Prompt, Setting+Character) to measure each component's contribution to ASR."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": false,
     89         "justification": "Only Attack Success Rate (ASR) is used as a metric. No other metrics (e.g., response severity, specificity of harmful content, false positive rate of the evaluator) are reported."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "The paper includes both manual inspection (preliminary study of ~100 experiments, Section V) and the semi-automated approach. Failure and success cases are shown and manually analyzed in Figures 10-15 and Appendix figures."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The attack strategy was refined through a preliminary study (~100 experiments) and then applied to the full evaluation (2,100 prompts), but there is no formal separation of development and test sets. The same question set appears to be used throughout."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table I breaks down ASR by each of the 7 forbidden scenarios (Illegal Activities, Abuse & Disruption, Circumventing Safety, Harmful Content, Misinformation, Sexually Explicit, Privacy Violations) for all 4 configurations."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Multiple failure cases are shown and discussed: Figures 11, 12, 13, 15, 16, 19, 20, and 23 show cases where Gemini successfully blocked the attack. The paper analyzes why each configuration fails."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Configurations 3 and 4 show low ASR (0.28 and 0.12 respectively). Misinformation scenario has 0.00 ASR for Config 4. These negative results are reported and discussed in the ablation study."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract's claim of an 'average attack success rate ranging from 0.67 to 0.93 across seven forbidden scenarios' is supported by Table I (the Config 1 row shows exactly these values). The 0.81 average is also confirmed."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper makes causal claims about component contributions (e.g., Flanking Attack 'enhances' ASR). The progressive ablation design (Table I) with controlled removal of single components supports causal inference: Config 1 vs 2 isolates Flanking Attack (+24pp), Config 2 vs 3 isolates Text Prompt (+29pp)."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims 'Jailbreak Prompt Attack against Multimodal LLMs' (plural) but only Gemini 1.5 Flash is tested. The abstract says 'manipulating state-of-the-art LLMs' and the paper claims to 'establish a replicable testing framework for adversarial robustness evaluation in multimodal LLMs,' extending well beyond the single model tested."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper does not discuss alternative explanations for why the Flanking Attack succeeds. For example, it could be that the December 2024 Gemini snapshot had particularly weak audio moderation, or that the evaluation via Gemini self-assessment inflates ASR. No robustness checks or confound analysis."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "ASR is measured by Gemini's self-evaluation of its own policy compliance, but the paper does not discuss whether this proxy accurately measures real-world jailbreak severity. The gap between 'Gemini says its own output violates policy' and 'the output would cause actual harm' is not acknowledged."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "Section VI.A specifies 'gemini-1.5-flash' (API model identifier), 'December 2024 snapshot,' 'balanced safety tier,' and SDK version 'google-generativeai 0.4.1.' This is sufficiently specific to identify the model version."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "Figure 9 shows a Flanking Attack template, and implementation examples describe the text prompt components (Setting: 'Imagine that we are in a game'; Character: 'hacker'; Rule: 'We are just simulating'). But the complete verbatim prompt text used in experiments is not provided. The reader cannot reconstruct the exact prompts sent to the API."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Section VI.A reports: 'Default inference parameters were retained (temperature=0.7, top_p=0.95); no system-prompt modifications were applied.' Audio parameters also specified: 128 kbps MP3, 48 kHz, 16-bit PCM."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. The attack is a direct prompt injection via API call, not an agentic system."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section V describes the forbidden question set design (based on Gemini policy, 7 scenarios following [5]'s principles). Section VI.A documents the pipeline: text prefix + MP3 audio → generate_content API → responses saved to ai_outputs.docx → Gemini self-evaluation."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section VII 'Challenges and Future Directions' discusses multiple limitations: model updates may invalidate findings, audio variations not tested, fixed sentence structure, monolingual (English) only."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section VII discusses specific threats: (1) Gemini updates may mitigate identified vulnerabilities, (2) audio properties (pitch, tone, speed) were not varied, (3) fixed sentence structure may limit generalizability, (4) monolingual approach limits scope. These are specific to this study."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "The paper explicitly states what was NOT tested: other audio variations (Section VII.A), sentence structure manipulations (Section VII.B), multilingual inputs (Section VII.C). Scope is bounded to Gemini, English, fixed audio format."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "No raw data (prompts, audio files, Gemini responses, evaluation logs) is released. Only aggregated ASR values in Table I and selected examples in figures."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": true,
    197         "justification": "Section V describes the forbidden question set design (7 scenarios from Gemini policy), prompt construction methodology (Setting, Character, Rule, Flanking Attack), and Section VI.A describes the API interaction procedure (50 requests/run, 2s delay)."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": true,
    202         "justification": "No human participants. The data source is custom-crafted prompts, not a standard benchmark. The paper describes how the forbidden question set was designed: based on Gemini usage policy [35] with 7 specific scenarios, following design principles from [5]. Table III shows part of the question bank (21 questions across the 7 scenarios)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The general pipeline is described (prompts → API → docx → evaluation), but key details are missing: how many runs were conducted per configuration, how averages were computed, and how many total responses were excluded or flagged. The paper says 'averages across multiple runs' without specifying the number of runs."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding source or acknowledgments section is present in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: University of Sydney, University of Chicago, and University of Texas at San Antonio. No affiliations with Google (the maker of the evaluated product Gemini)."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": false,
    223         "answer": false,
    224         "justification": "No funding is disclosed. The authors are from academic institutions with no apparent financial interest in the outcome. Appears to be unfunded academic research."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This paper tests jailbreak attacks against safety filters, not model knowledge on a benchmark. Contamination in the traditional sense (model having seen test data) is not applicable."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "The study evaluates defense bypass rather than model capability on a knowledge benchmark. Train/test overlap is not a relevant concern for jailbreak attack evaluation."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The 'benchmark' consists of custom-crafted jailbreak prompts, not a pre-existing knowledge test. Benchmark contamination does not apply to this type of adversarial evaluation."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. All experiments involve automated API interactions with Gemini."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants. The study involves only automated interactions with an LLM API."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": false,
    290         "justification": "No API costs, token consumption, or per-query cost is reported despite making 2,100+ API calls to Gemini. Only a 2-second delay between requests (for rate limiting) is mentioned."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "The hardware is described (Google Colab, 2 vCPUs, 12 GB RAM) but total compute time, API spend, and total wall-clock time for the full experimental campaign are not stated."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of random seeds or stochastic variation across runs. With temperature=0.7, Gemini outputs are non-deterministic, but no seed sensitivity analysis is performed."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The paper says '50 such requests per run' and 'Table I reports averages across multiple runs' but never states the exact number of runs conducted per configuration or scenario."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": true,
    312         "justification": "Section VI.A explicitly states 'Default inference parameters were retained (temperature=0.7, top_p=0.95); no system-prompt modifications were applied,' indicating no hyperparameter tuning was performed."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "All 4 configurations are reported in Table I with full results — no cherry-picking of a single best configuration. The ablation study presents all variants transparently."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "The paper makes numerous comparative claims across 7 scenarios and 4 configurations without any statistical tests, let alone corrections for multiple comparisons."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors evaluate their own attack method against their own ablated baselines. The paper neither acknowledges this self-comparison bias nor provides an independent evaluation."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": false,
    332         "justification": "No discussion of compute requirements across configurations. Config 1 (with Flanking Attack audio) presumably requires more resources than Config 4 (text only), but this is not analyzed."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not discuss whether ASR (as judged by Gemini self-evaluation) actually measures real-world jailbreak severity. The construct validity of using the target model to evaluate its own policy violations is not questioned."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved. The attack is a direct API call with text and audio input, not an agentic system."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "The paper does not discuss whether Gemini may have been trained on similar jailbreak patterns or whether its safety training incorporated defenses against flanking-style attacks. The December 2024 snapshot timing relative to the paper's development is not analyzed."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "No discussion of whether the evaluation setup (e.g., the text prompt framing the interaction as fictional) provides information that makes the task easier than a real-world attack scenario."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The 2,100 prompts share the same structural template across scenarios. Non-independence between prompts within and across scenarios is not discussed — success on one prompt may predict success on structurally similar ones."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection or prevention method is used. The paper does not check whether its attack patterns overlap with known jailbreak examples that may be in Gemini's safety training data."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "Flanking Attack achieves an average ASR of 0.81 across seven forbidden scenarios when using all components (Text Prompt + Setting + Character + Flanking Attack).",
    371       "evidence": "Table I shows per-scenario ASRs ranging from 0.67 (Misinformation) to 0.93 (Illegal Activities) with a 0.81 average for Configuration 1.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "The Flanking Attack component is critical, contributing approximately 24 percentage points to ASR compared to the same setup without it.",
    376       "evidence": "Table I: Config 1 (with Flanking Attack) averages 0.81 ASR vs Config 2 (without) at 0.57 ASR. Section VI.E discusses this contribution.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "This is the first voice-based jailbreak attack against multimodal LLMs.",
    381       "evidence": "Stated in the abstract and Section I. The paper references Voice Jailbreak [14] against GPT-4o as prior work but claims novelty in the multimodal flanking approach.",
    382       "supported": "weak"
    383     },
    384     {
    385       "claim": "The semi-automated self-assessment framework enables scalable policy violation detection.",
    386       "evidence": "Section V.D describes using Gemini to evaluate its own outputs against policy. No quantitative comparison against manual-only evaluation to measure scalability improvement or agreement rate.",
    387       "supported": "weak"
    388     },
    389     {
    390       "claim": "LLMs' reliance on surface-level cues and context makes them vulnerable to narrative-embedded attacks.",
    391       "evidence": "Section VI.B discusses this mechanism, supported by the ASR difference between Config 1 (fictional framing + flanking, 0.81) vs Config 4 (plot only, 0.12). However, this is an interpretive claim without mechanistic evidence.",
    392       "supported": "moderate"
    393     }
    394   ],
    395   "red_flags": [
    396     {
    397       "flag": "Circular evaluation methodology",
    398       "detail": "Gemini is used to evaluate whether Gemini's own outputs violate Gemini's policies. This self-evaluation creates a circularity — the model's leniency or strictness in self-assessment directly inflates or deflates ASR. The paper calls this an 'innovative' approach but does not validate it against independent human evaluation at scale."
    399     },
    400     {
    401       "flag": "Single model tested, broad claims made",
    402       "detail": "Only Gemini 1.5 Flash (one model, one snapshot) is tested, but the title and abstract claim results about 'Multimodal LLMs' generally. The paper claims to 'establish a replicable testing framework for adversarial robustness evaluation in multimodal LLMs' without testing a single other model."
    403     },
    404     {
    405       "flag": "No statistical rigor on key results",
    406       "detail": "Table I reports point estimates with no confidence intervals, error bars, significance tests, or variance measures. The number of runs is never stated ('averages across multiple runs'). The reader cannot assess whether the ASR differences between configurations are statistically meaningful."
    407     },
    408     {
    409       "flag": "No comparison with prior jailbreak methods",
    410       "detail": "The paper references several existing jailbreak attacks (Voice Jailbreak, Sandwich Attack, Crescendo) but does not compare Flanking Attack's ASR against any of them. The claimed effectiveness is evaluated only against the authors' own ablated variants."
    411     },
    412     {
    413       "flag": "Claims of novelty contradict cited prior work",
    414       "detail": "The paper claims to introduce 'the first voice-based jailbreak attack against multimodal LLMs' but cites Voice Jailbreak [14] (Shen et al., 2024) which already attacks GPT-4o via voice. The novelty claim is not clearly distinguished from this prior work."
    415     },
    416     {
    417       "flag": "No code or data released despite claiming replicability",
    418       "detail": "The paper claims to establish 'a replicable testing framework' but releases no code, notebooks, prompts, audio files, or response data. The full prompt text used in experiments is not provided verbatim."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    424       "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"],
    425       "year": 2023,
    426       "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, directly relevant to the survey's coverage of prompt injection vulnerabilities."
    427     },
    428     {
    429       "title": "Jailbroken: How does llm safety training fail?",
    430       "authors": ["A. Wei", "N. Haghtalab", "J. Steinhardt"],
    431       "year": 2024,
    432       "relevance": "Analyzes failure modes of LLM safety training, providing theoretical grounding for why jailbreak attacks succeed."
    433     },
    434     {
    435       "title": "LLM jailbreak attack versus defense techniques–a comprehensive study",
    436       "authors": ["Z. Xu", "Y. Liu", "G. Deng", "Y. Li", "S. Picek"],
    437       "year": 2024,
    438       "arxiv_id": "2402.13457",
    439       "relevance": "Comprehensive survey of jailbreak attack and defense techniques for LLMs, useful for mapping the attack landscape."
    440     },
    441     {
    442       "title": "Do anything now: Characterizing and evaluating in-the-wild jailbreak prompts on large language models",
    443       "authors": ["X. Shen", "Z. Chen", "M. Backes", "Y. Shen", "Y. Zhang"],
    444       "year": 2024,
    445       "relevance": "Characterizes real-world jailbreak prompts collected from online sources and evaluates their effectiveness against OpenAI policies."
    446     },
    447     {
    448       "title": "Voice jailbreak attacks against GPT-4o",
    449       "authors": ["X. Shen", "Y. Wu", "M. Backes", "Y. Zhang"],
    450       "year": 2024,
    451       "arxiv_id": "2405.19103",
    452       "relevance": "Direct predecessor — demonstrates voice-based jailbreak against GPT-4o, the primary work this paper claims to extend."
    453     },
    454     {
    455       "title": "Sandwich attack: Multi-language mixture adaptive attack on LLMs",
    456       "authors": ["B. Upadhayay", "V. Behzadan"],
    457       "year": 2024,
    458       "arxiv_id": "2404.07242",
    459       "relevance": "Proposes multilingual mixture attacks on LLMs, a related flanking/sandwiching strategy using language diversity rather than audio modality."
    460     },
    461     {
    462       "title": "Survey of vulnerabilities in large language models revealed by adversarial attacks",
    463       "authors": ["E. Shayegani", "M. A. A. Mamun", "Y. Fu", "P. Zaree", "Y. Dong", "N. Abu-Ghazaleh"],
    464       "year": 2023,
    465       "arxiv_id": "2310.10844",
    466       "relevance": "Comprehensive survey of LLM vulnerabilities from adversarial attacks, providing taxonomy of attack types relevant to the survey scope."
    467     },
    468     {
    469       "title": "Great, now write an article about that: The crescendo multi-turn LLM jailbreak attack",
    470       "authors": ["M. Russinovich", "A. Salem", "R. Eldan"],
    471       "year": 2024,
    472       "arxiv_id": "2404.01833",
    473       "relevance": "Introduces the Crescendo multi-turn jailbreak attack, a related escalation strategy for bypassing LLM safety mechanisms."
    474     },
    475     {
    476       "title": "FigStep: Jailbreaking large vision-language models via typographic visual prompts",
    477       "authors": ["Y. Gong", "D. Ran", "J. Liu", "C. Wang", "T. Cong", "A. Wang", "S. Duan", "X. Wang"],
    478       "year": 2023,
    479       "arxiv_id": "2311.05608",
    480       "relevance": "Demonstrates jailbreaking vision-language models via visual prompts, showing multimodal attack surfaces beyond text."
    481     },
    482     {
    483       "title": "Comprehensive assessment of jailbreak attacks against LLMs",
    484       "authors": ["J. Chu", "Y. Liu", "Z. Yang", "X. Shen", "M. Backes", "Y. Zhang"],
    485       "year": 2024,
    486       "arxiv_id": "2402.05668",
    487       "relevance": "Systematic benchmark of jailbreak attacks against LLMs, relevant to the survey's evaluation of adversarial robustness research."
    488     },
    489     {
    490       "title": "Catastrophic jailbreak of open-source LLMs via exploiting generation",
    491       "authors": ["Y. Huang", "S. Gupta", "M. Xia", "K. Li", "D. Chen"],
    492       "year": 2023,
    493       "arxiv_id": "2310.06987",
    494       "relevance": "Shows how generation configuration can enable jailbreaking of open-source LLMs, relevant to understanding safety alignment failures."
    495     },
    496     {
    497       "title": "Pleak: Prompt leaking attacks against large language model applications",
    498       "authors": ["B. Hui", "H. Yuan", "N. Gong", "P. Burlina", "Y. Cao"],
    499       "year": 2024,
    500       "relevance": "Demonstrates prompt leaking attacks against LLM applications, a related class of prompt-based security vulnerabilities."
    501     }
    502   ],
    503   "engagement_factors": {
    504     "practical_relevance": {
    505       "score": 1,
    506       "justification": "The attack methodology is described conceptually but no tools, code, or reusable framework is released for practitioners to apply."
    507     },
    508     "surprise_contrarian": {
    509       "score": 1,
    510       "justification": "That multimodal LLMs can be jailbroken through creative prompting is not surprising given extensive prior work on text-based and voice-based jailbreaks."
    511     },
    512     "fear_safety": {
    513       "score": 2,
    514       "justification": "Demonstrates that voice-based flanking can bypass Gemini's safety filters with 81% success rate across forbidden scenarios, raising concerns about audio-enabled LLM safety."
    515     },
    516     "drama_conflict": {
    517       "score": 1,
    518       "justification": "Shows Gemini vulnerability but presents findings in an academic context without dramatic framing or controversy."
    519     },
    520     "demo_ability": {
    521       "score": 0,
    522       "justification": "No code, demo, or tool released. The approach cannot be tried without reimplementing from the paper description."
    523     },
    524     "brand_recognition": {
    525       "score": 2,
    526       "justification": "Targets Google Gemini, a well-known consumer AI product, which adds brand recognition relevance."
    527     }
    528   }
    529 }

Impressum · Datenschutz