scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26305B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "'Do as I say not as I do': A Semi-Automated Approach for Jailbreak Prompt Attack against Multimodal LLMs",
      6     "authors": [
      7       "Chun Wai Chiu",
      8       "Linghan Huang",
      9       "Bo Li",
     10       "Huaming Chen"
     11     ],
     12     "year": 2025,
     13     "venue": "arXiv.org",
     14     "arxiv_id": "2502.00735",
     15     "doi": "10.48550/arXiv.2502.00735"
     16   },
     17   "checklist": {
     18     "claims_and_evidence": {
     19       "abstract_claims_supported": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The abstract's core claims (first voice-based jailbreak attack, ASR ranging 0.67–0.93, semi-automated framework) are verified in Table I and the methodology section.",
     23         "source": "haiku"
     24       },
     25       "causal_claims_justified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses a 4-configuration ablation study to isolate the contribution of each component (Text Prompt, Setting/Character/Plot, Flanking Attack), which supports the causal claim that each element additively improves ASR.",
     29         "source": "haiku"
     30       },
     31       "generalization_bounded": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "The title and abstract claim results against 'multimodal LLMs' generally, but all experiments are conducted exclusively on a single model—Gemini 1.5 Flash (December 2024 snapshot)—with no testing on other multimodal LLMs such as GPT-4o.",
     35         "source": "haiku"
     36       },
     37       "alternative_explanations_discussed": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "The paper does not consider alternative explanations for the high ASR, such as the possibility that Gemini's safety filters are simply weaker for these specific topic categories, or that the evaluation method (Gemini self-evaluating its own outputs) inflates the ASR.",
     41         "source": "haiku"
     42       },
     43       "proxy_outcome_distinction": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The primary evaluation metric (ASR) is measured by asking Gemini to evaluate whether its own outputs violated policy—a circular proxy. The paper does not systematically compare this proxy to human judgments or discuss the potential for self-evaluation bias.",
     47         "source": "haiku"
     48       }
     49     },
     50     "limitations_and_scope": {
     51       "limitations_section_present": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "Section VII is titled 'Challenges and Future Directions' and discusses some limitations (fixed sentence structure, monolingual scope, model updates), but there is no dedicated limitations or threats-to-validity section.",
     55         "source": "haiku"
     56       },
     57       "threats_to_validity_specific": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The future directions section mentions that model updates may mitigate vulnerabilities, but no formal validity threats are enumerated (e.g., single-model generalizability, circular self-evaluation, unvalidated sample sizes).",
     61         "source": "haiku"
     62       },
     63       "scope_boundaries_stated": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The paper does not explicitly state what the results do NOT show. The single-model scope, English-only constraint, and reliance on a single API snapshot are not framed as explicit scope limits.",
     67         "source": "haiku"
     68       }
     69     },
     70     "conflicts_of_interest": {
     71       "funding_disclosed": {
     72         "applies": true,
     73         "answer": false,
     74         "justification": "No funding disclosure appears anywhere in the paper.",
     75         "source": "haiku"
     76       },
     77       "affiliations_disclosed": {
     78         "applies": true,
     79         "answer": true,
     80         "justification": "Author affiliations (University of Sydney, University of Chicago, University of Texas at San Antonio) are disclosed on the title page.",
     81         "source": "haiku"
     82       },
     83       "funder_independent_of_outcome": {
     84         "applies": false,
     85         "answer": false,
     86         "justification": "No funding source is mentioned, so funder independence cannot be assessed.",
     87         "source": "haiku"
     88       },
     89       "financial_interests_declared": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No competing interests statement or financial interests declaration appears in the paper.",
     93         "source": "haiku"
     94       }
     95     },
     96     "scope_and_framing": {
     97       "key_terms_defined": {
     98         "applies": true,
     99         "answer": true,
    100         "justification": "Key terms are defined: 'jailbreak attack' is explained in the introduction, 'Flanking Attack' is named and operationalized in Section V, and 'Attack Success Rate (ASR)' is defined in the results section.",
    101         "source": "haiku"
    102       },
    103       "intended_contribution_clear": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "The introduction explicitly lists three numbered contributions: a systematic benchmarking of audio-based jailbreak attacks, a novel attack framework (Flanking Attack), and a semi-automated evaluation approach.",
    107         "source": "haiku"
    108       },
    109       "engagement_with_prior_work": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Section III provides a dedicated related work section covering adversarial attacks, multimodal attacks, and jailbreak prompt attacks, with the proposed work explicitly positioned against Shen et al. (voice jailbreak) and Upadhayay et al. (multilingual attacks).",
    113         "source": "haiku"
    114       }
    115     }
    116   },
    117   "type_checklist": {
    118     "empirical": {
    119       "artifacts": {
    120         "code_released": {
    121           "applies": true,
    122           "answer": false,
    123           "justification": "No code repository is linked. The paper claims to establish 'a replicable testing framework' but provides no public code release.",
    124           "source": "haiku"
    125         },
    126         "data_released": {
    127           "applies": true,
    128           "answer": false,
    129           "justification": "The forbidden question set is partially shown in Table III (21 examples), but no complete dataset is publicly released. The full set of 2,100 prompts used in experiments is not available.",
    130           "source": "haiku"
    131         },
    132         "environment_specified": {
    133           "applies": true,
    134           "answer": true,
    135           "justification": "The experimental setup specifies Google Colab (Python 3.10, Ubuntu 22.04), exact package versions (google-generativeai 0.4.1, python-docx 1.0), audio format (128 kbps MP3, 48 kHz, 16-bit PCM), and inference parameters (temperature=0.7, top_p=0.95).",
    136           "source": "haiku"
    137         },
    138         "reproduction_instructions": {
    139           "applies": true,
    140           "answer": false,
    141           "justification": "The pipeline is described at a high level (50 requests per run, 2s delay, logging to docx) but no step-by-step instructions or code are provided. The text prompt template (Figure 9) is shown but the audio file (breakAuthorisation.mp3) is not released.",
    142           "source": "haiku"
    143         }
    144       },
    145       "statistical_methodology": {
    146         "confidence_intervals_or_error_bars": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "Table I reports single ASR values with no confidence intervals, error bars, or standard deviations. The paper mentions 'averages across multiple runs' but no variance is reported.",
    150           "source": "haiku"
    151         },
    152         "significance_tests": {
    153           "applies": true,
    154           "answer": false,
    155           "justification": "No statistical significance tests are applied to the ASR comparisons across configurations, despite these being the central comparative claims of the paper.",
    156           "source": "haiku"
    157         },
    158         "effect_sizes_reported": {
    159           "applies": true,
    160           "answer": true,
    161           "justification": "Effect sizes are implicitly reported as absolute ASR differences (e.g., Config 1 at 0.81 vs Config 2 at 0.57 vs Config 4 at 0.12), which provides magnitude context relative to a baseline.",
    162           "source": "haiku"
    163         },
    164         "sample_size_justified": {
    165           "applies": true,
    166           "answer": false,
    167           "justification": "The choice of 50 requests per run and 2,100 total prompts is not statistically justified through power analysis or prior work establishing minimum detectable effect sizes.",
    168           "source": "haiku"
    169         },
    170         "variance_reported": {
    171           "applies": true,
    172           "answer": false,
    173           "justification": "Table I reports single point estimates per configuration per scenario. Multiple runs are alluded to ('averages across multiple runs') but no variance, standard deviation, or range is reported.",
    174           "source": "haiku"
    175         }
    176       },
    177       "evaluation_design": {
    178         "baselines_included": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "Configuration 4 (Plot only) serves as a baseline, and the ablation progressively builds from this baseline to the full attack (Configuration 1).",
    182           "source": "haiku"
    183         },
    184         "baselines_contemporary": {
    185           "applies": true,
    186           "answer": false,
    187           "justification": "There are no comparisons against other contemporary jailbreak methods (e.g., Crescendo, sandwich attack, FigStep). The 'baselines' are only ablations of the authors' own method, not competitive comparisons.",
    188           "source": "haiku"
    189         },
    190         "ablation_study": {
    191           "applies": true,
    192           "answer": true,
    193           "justification": "A 4-configuration ablation systematically removes components (Text Prompt, Setting/Character/Plot, Flanking Attack) to isolate the contribution of each element, reported in Table I.",
    194           "source": "haiku"
    195         },
    196         "multiple_metrics": {
    197           "applies": true,
    198           "answer": false,
    199           "justification": "Attack Success Rate (ASR) is the sole reported metric. No secondary metrics such as response quality, specificity of harmful content, or false positive rate of the evaluator are reported.",
    200           "source": "haiku"
    201         },
    202         "human_evaluation": {
    203           "applies": true,
    204           "answer": false,
    205           "justification": "Manual inspection is mentioned as complementary to the automated approach, but no systematic human evaluation results are reported—no inter-rater agreement, no sample sizes, no comparison with automated ASR.",
    206           "source": "haiku"
    207         },
    208         "held_out_test_set": {
    209           "applies": false,
    210           "answer": false,
    211           "justification": "This is a security evaluation demonstrating a vulnerability, not a predictive modeling task; a train/test split is not applicable.",
    212           "source": "haiku"
    213         },
    214         "per_category_breakdown": {
    215           "applies": true,
    216           "answer": true,
    217           "justification": "Table I breaks down ASR across all 7 forbidden scenarios (Illegal Activities, Abuse & Disruption, Circumventing Safety, Harmful Content, Misinformation, Sexually Explicit, Privacy Violation) for each configuration.",
    218           "source": "haiku"
    219         },
    220         "failure_cases_discussed": {
    221           "applies": true,
    222           "answer": true,
    223           "justification": "Multiple failure cases are shown explicitly (Figures 11, 12, 13, 15, 19, 20, 23) with explanations of why Gemini's defenses succeeded in those instances.",
    224           "source": "haiku"
    225         },
    226         "negative_results_reported": {
    227           "applies": true,
    228           "answer": true,
    229           "justification": "Configurations 3 and 4 with low ASR (0.28 and 0.12 respectively) are analyzed and discussed as negative results, with explanations for why those approaches fail.",
    230           "source": "haiku"
    231         }
    232       },
    233       "setup_transparency": {
    234         "model_versions_specified": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "The target model is specified as 'gemini-1.5-flash, December 2024 snapshot, balanced safety tier'—including model name, version, snapshot date, and safety configuration.",
    238           "source": "haiku"
    239         },
    240         "prompts_provided": {
    241           "applies": true,
    242           "answer": true,
    243           "justification": "The Flanking Attack prompt template is shown in Figure 9, the text prompt structure is illustrated in Figure 8, and concrete examples (e.g., bank heist scenario) are provided throughout the paper.",
    244           "source": "haiku"
    245         },
    246         "hyperparameters_reported": {
    247           "applies": true,
    248           "answer": true,
    249           "justification": "Inference parameters are reported: temperature=0.7, top_p=0.95, 30 QPM rate limit, 2s delay between requests, 50 requests per run.",
    250           "source": "haiku"
    251         },
    252         "scaffolding_described": {
    253           "applies": true,
    254           "answer": true,
    255           "justification": "The pipeline is described: fixed text prefix + MP3 audio file submitted via generate_content, outputs logged to docx, then second Gemini instance evaluates compliance. The dual-model architecture is clear.",
    256           "source": "haiku"
    257         },
    258         "data_preprocessing_documented": {
    259           "applies": true,
    260           "answer": true,
    261           "justification": "Audio specifications are documented (128 kbps MP3, 48 kHz, 16-bit PCM). The forbidden question construction follows Shen et al.'s design principles across 7 categories.",
    262           "source": "haiku"
    263         }
    264       },
    265       "data_integrity": {
    266         "raw_data_available": {
    267           "applies": true,
    268           "answer": false,
    269           "justification": "Raw prompts and Gemini outputs are not publicly released. Only selected examples are shown in figures. The full ai_outputs.docx log files are not shared.",
    270           "source": "haiku"
    271         },
    272         "data_collection_described": {
    273           "applies": true,
    274           "answer": true,
    275           "justification": "The methodology section describes how prompts were constructed (Setting + Character + Plot + Flanking Attack), how audio was formatted, and how outputs were collected via API with specific rate-limiting procedures.",
    276           "source": "haiku"
    277         },
    278         "recruitment_methods_described": {
    279           "applies": false,
    280           "answer": false,
    281           "justification": "No human participants; all data is generated via API calls to Gemini.",
    282           "source": "haiku"
    283         },
    284         "data_pipeline_documented": {
    285           "applies": true,
    286           "answer": false,
    287           "justification": "The pipeline steps are described qualitatively but the code generating the prompts, making API calls, and processing responses is not released, making independent replication difficult.",
    288           "source": "haiku"
    289         }
    290       },
    291       "contamination": {
    292         "training_cutoff_stated": {
    293           "applies": true,
    294           "answer": false,
    295           "justification": "The Gemini 1.5 Flash snapshot date (December 2024) is stated, but no training data cutoff is provided. The forbidden question set was adapted from Shen et al. (2024), which could have been in Gemini's training data.",
    296           "source": "haiku"
    297         },
    298         "train_test_overlap_discussed": {
    299           "applies": true,
    300           "answer": false,
    301           "justification": "No discussion of whether the forbidden question templates (derived from prior published work by Shen et al. 2024 and others) may have been seen during Gemini's training, which could affect baseline defense behavior.",
    302           "source": "haiku"
    303         },
    304         "benchmark_contamination_addressed": {
    305           "applies": true,
    306           "answer": false,
    307           "justification": "The forbidden question bank is based on Shen et al.'s dataset published before Gemini's December 2024 snapshot. No discussion of whether familiarity with these question types affects the reported ASR.",
    308           "source": "haiku"
    309         }
    310       },
    311       "human_studies": {
    312         "pre_registered": {
    313           "applies": false,
    314           "answer": false,
    315           "justification": "No human participants in this study.",
    316           "source": "haiku"
    317         },
    318         "irb_or_ethics_approval": {
    319           "applies": false,
    320           "answer": false,
    321           "justification": "No human participants in this study.",
    322           "source": "haiku"
    323         },
    324         "demographics_reported": {
    325           "applies": false,
    326           "answer": false,
    327           "justification": "No human participants in this study.",
    328           "source": "haiku"
    329         },
    330         "inclusion_exclusion_criteria": {
    331           "applies": false,
    332           "answer": false,
    333           "justification": "No human participants in this study.",
    334           "source": "haiku"
    335         },
    336         "randomization_described": {
    337           "applies": false,
    338           "answer": false,
    339           "justification": "No human participants in this study.",
    340           "source": "haiku"
    341         },
    342         "blinding_described": {
    343           "applies": false,
    344           "answer": false,
    345           "justification": "No human participants in this study.",
    346           "source": "haiku"
    347         },
    348         "attrition_reported": {
    349           "applies": false,
    350           "answer": false,
    351           "justification": "No human participants in this study.",
    352           "source": "haiku"
    353         }
    354       },
    355       "cost_and_practicality": {
    356         "inference_cost_reported": {
    357           "applies": true,
    358           "answer": false,
    359           "justification": "No API cost or latency information is reported despite the study relying entirely on paid API calls to Gemini across 2,100 prompts plus the secondary evaluator calls.",
    360           "source": "haiku"
    361         },
    362         "compute_budget_stated": {
    363           "applies": true,
    364           "answer": false,
    365           "justification": "Google Colab hardware specs are mentioned (2 vCPUs, 12 GB RAM) but no total compute cost, API usage cost, or wall-clock time is reported.",
    366           "source": "haiku"
    367         }
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Flanking Attack is the first voice-based jailbreak attack against multimodal LLMs.",
    374       "evidence": "Claimed in abstract and introduction; comparison is drawn to text-based and multilingual attacks, and the paper cites Shen et al.'s voice jailbreak against GPT-4o as related but distinct prior work.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Flanking Attack achieves an average ASR of 0.81 across seven forbidden scenarios against Gemini 1.5 Flash.",
    379       "evidence": "Table I reports per-scenario ASRs for Configuration 1 ranging from 0.67 (Misinformation) to 0.93 (Illegal Activities), with an average of 0.81 across 2,100 prompts.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Each component of the attack (Text Prompt, Setting/Character/Plot, Flanking Attack) contributes independently to ASR improvement.",
    384       "evidence": "Ablation study (Table I) shows ASR drops from 0.81 (full) → 0.57 (no Flanking) → 0.28 (no Text Prompt) → 0.12 (Plot only), supporting additive contribution.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "The semi-automated Gemini self-evaluation approach is an effective substitute for manual inspection of policy violations.",
    389       "evidence": "The paper asserts self-evaluation provides 'more subjective and compatible results for policy violation detection' but presents no systematic comparison with human ground-truth labels.",
    390       "supported": "weak"
    391     },
    392     {
    393       "claim": "Fictional framing reduces Gemini's sensitivity to harmful content by exploiting surface-level context cues rather than deep semantic analysis.",
    394       "evidence": "Qualitative examples show Gemini adding fictional disclaimers while still producing detailed harmful content; no formal mechanism analysis is provided.",
    395       "supported": "weak"
    396     }
    397   ],
    398   "methodology_tags": [
    399     "benchmark-eval",
    400     "case-study"
    401   ],
    402   "key_findings": "The Flanking Attack embeds adversarial queries within sequences of benign voice prompts, achieving a 0.81 mean ASR against Gemini 1.5 Flash across seven forbidden content categories. A 4-configuration ablation confirms each attack component (fictional text framing, character/plot context, audio flanking) contributes additively to bypassing Gemini's content moderation. The primary evaluation relies on Gemini self-assessing its own outputs for policy compliance, which introduces a circularity concern. All results are from a single model (Gemini 1.5 Flash, December 2024) tested in English only, limiting generalizability despite broad claims about 'multimodal LLMs.'",
    403   "red_flags": [
    404     {
    405       "flag": "Single-model overgeneralization",
    406       "detail": "All experiments run exclusively on Gemini 1.5 Flash (one snapshot), yet the paper's title, abstract, and conclusions refer to 'multimodal LLMs' generally, implying broader applicability that is not demonstrated."
    407     },
    408     {
    409       "flag": "Circular self-evaluation",
    410       "detail": "The primary metric (ASR) is measured by asking Gemini to evaluate whether its own outputs violated Gemini's usage policy. This creates a fundamental validity concern: the same model's safety behavior determines both the attack and the evaluation outcome."
    411     },
    412     {
    413       "flag": "No competitive baselines",
    414       "detail": "No comparisons to other published jailbreak methods (Crescendo, sandwich attack, FigStep, etc.) are included. The only comparisons are ablations of the authors' own method, making relative effectiveness claims unsupported."
    415     },
    416     {
    417       "flag": "No statistical rigor",
    418       "detail": "Point estimates only in Table I, with no confidence intervals, standard deviations, or significance tests despite the paper making comparative claims across configurations and categories."
    419     },
    420     {
    421       "flag": "No code or data release despite 'replicable framework' claim",
    422       "detail": "The paper claims to establish a 'replicable testing framework' but releases no code, no full forbidden question set, and no raw output logs. The core audio file (breakAuthorisation.mp3) is not shared."
    423     },
    424     {
    425       "flag": "Inconsistent attack name",
    426       "detail": "The paper alternates between 'Flanking Attack' and 'Franking Attack' in multiple places (e.g., 'the Franking Attack' in the contributions section), suggesting the paper was not carefully proofread and may have methodological inconsistencies."
    427     },
    428     {
    429       "flag": "Contamination risk unaddressed",
    430       "detail": "The forbidden question bank is adapted from Shen et al. (2024), published before the December 2024 Gemini snapshot. The training data cutoff is not stated, and there is no discussion of whether the model's baseline defense behavior reflects familiarity with these specific question patterns."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "'Do Anything Now': Characterizing and Evaluating In-the-Wild Jailbreak Prompts on Large Language Models",
    436       "relevance": "Primary methodological inspiration for the forbidden question set design; the reviewed paper builds directly on this work's approach"
    437     },
    438     {
    439       "title": "Jailbroken: How does LLM safety training fail?",
    440       "relevance": "Foundational work on LLM jailbreak mechanisms cited as motivation"
    441     },
    442     {
    443       "title": "Voice Jailbreak Attacks against GPT-4o",
    444       "relevance": "Closest prior work; the reviewed paper directly extends it to Gemini and adds the Flanking Attack structure"
    445     },
    446     {
    447       "title": "Sandwich Attack: Multi-language Mixture Adaptive Attack on LLMs",
    448       "relevance": "Prior work on flanking-style multilingual attacks that inspired the Flanking Attack's sequential structure"
    449     },
    450     {
    451       "title": "Comprehensive Assessment of Jailbreak Attacks against LLMs",
    452       "relevance": "Benchmark paper providing systematic evaluation framework for jailbreak methods"
    453     },
    454     {
    455       "title": "Survey of Vulnerabilities in Large Language Models Revealed by Adversarial Attacks",
    456       "relevance": "Broad taxonomy of LLM attack surfaces used to contextualize multimodal vulnerabilities"
    457     },
    458     {
    459       "title": "FigStep: Jailbreaking Large Vision-Language Models via Typographic Visual Prompts",
    460       "relevance": "Prior work on visual modality jailbreaks that motivates the question of whether audio similarly exposes new attack surfaces"
    461     },
    462     {
    463       "title": "Jailbreak Attacks and Defenses against Multimodal Generative Models: A Survey",
    464       "relevance": "Survey contextualizing this work's contribution within multimodal jailbreak research"
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 2,
    470       "justification": "Security researchers and AI developers can use the Flanking Attack framework to red-team audio-capable LLM deployments, though limited to the Gemini API."
    471     },
    472     "surprise_contrarian": {
    473       "score": 1,
    474       "justification": "Voice-based attacks on multimodal LLMs are a novel angle, but the underlying insight (fictional framing weakens moderation) is well-established in the jailbreak literature."
    475     },
    476     "fear_safety": {
    477       "score": 3,
    478       "justification": "Demonstrates that a production AI system (Gemini) can be manipulated into producing detailed instructions for bank robbery, terrorism, and other harms with a simple audio framing trick that achieves up to a 93% success rate (0.81 on average across categories)."
    479     },
    480     "drama_conflict": {
    481       "score": 2,
    482       "justification": "Direct empirical attack on Google's Gemini showing real harmful outputs (Figures 10, 17, 21) is newsworthy and frames Google's safety claims unfavorably."
    483     },
    484     "demo_ability": {
    485       "score": 2,
    486       "justification": "Anyone with a Gemini API key and Python could replicate the basic setup from the described methodology, though the specific audio file is not released."
    487     },
    488     "brand_recognition": {
    489       "score": 2,
    490       "justification": "The attack specifically targets Google's Gemini, a high-profile product, lending name recognition; authors are from recognized universities."
    491     }
    492   },
    493   "hn_data": {
    494     "threads": [],
    495     "top_points": 0,
    496     "total_points": 0,
    497     "total_comments": 0
    498   }
    499 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs