ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30444B)


      1 {
      2   "paper": {
      3     "title": "Safeguarding Vision-Language Models Against Patched Visual Prompt Injectors",
      4     "authors": [
      5       "Jiachen Sun",
      6       "Changsheng Wang",
      7       "Jiongxiao Wang",
      8       "Yiwei Zhang",
      9       "Chaowei Xiao"
     10     ],
     11     "year": 2024,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2405.10529",
     14     "doi": "10.48550/arXiv.2405.10529"
     15   },
     16   "scan_version": 3,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval", "theoretical"],
     19   "key_findings": "SmoothVLM applies randomized pixel masking and majority voting to defend VLMs against adversarial patch-based visual prompt injection. The defense reduces attack success rates to 0–5% on LLaVA-1.5 and miniGPT4 while recovering 67.3–95% of benign image context. The paper identifies a 'q-instability' property of adversarial patches—sensitivity to pixel-wise masking—that persists even under adaptive (EOT) attacks, and provides a formal probability guarantee for defense success.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": false,
     25         "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": false,
     30         "justification": "The 300 adversarial examples used for evaluation are not released. The source images are not identified or made available."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper mentions A100 GPU, openai/clip-vit-base-patch32, Vicuna-30B, LLaVA-v1.5-13b, and miniGPT4, but provides no requirements.txt, Dockerfile, or detailed environment setup with library versions."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions, README, or scripts for replicating experiments are provided."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "All ASR and distortion rate results are reported as point estimates in figures (Figures 2, 3, 5–9) with no confidence intervals, error bars, or uncertainty bounds."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims SmoothVLM 'significantly' reduces ASR and 'significantly outperforms existing defense strategies' but provides no statistical significance tests (no p-values, t-tests, or bootstrap tests)."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "The paper reports absolute ASR values (from 100% baseline down to 0–5%) and distortion rates with specific percentages. The abstract states 'lowers the attack success rate to a range between 0% and 5.0%' and '67.3% to 95.0% context recovery,' providing baseline-relative context."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "300 adversarial examples are used for evaluation (§3.2 and §4.1) with no justification for why this number was chosen and no power analysis."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs. Results appear to be from single evaluations over the 300-example set."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The undefended VLM (100% ASR) serves as the baseline. Three perturbation methods (mask, swap, replace) are compared. However, no competing defense methods from the literature (e.g., DiffPure) are included as baselines."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": false,
     79         "justification": "The paper claims to 'significantly outperform existing defense strategies' (abstract) but includes no comparison against any competing defense method. DiffPure (Nie et al., 2022), PatchCleanser (Xiang et al., 2022), and other methods are mentioned in related work but never evaluated against."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "The paper systematically varies perturbation type (mask, swap, replace), perturbation percentage q ∈ {5, 10, 15, 20}, and number of samples N ∈ {2, 4, 6, 8, 10}, effectively ablating the key design choices of SmoothVLM (Figures 5–9)."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Two metrics are used: Attack Success Rate (ASR) for defense effectiveness (§4.1) and distortion rate for visual prompt recovery / context preservation (§4.2)."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation of defense outputs. Attack success is judged by an oracle LLM (Vicuna-30B), and context recovery is also measured via Vicuna-30B similarity scores."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": false,
     99         "justification": "The 300 adversarial examples serve as the evaluation set, but there is no description of a separate dev/validation set used for tuning hyperparameters (q and N). The same examples appear to be used for both exploration and reporting."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Results are broken down by attack method (JIP vs VAE), perturbation type (mask, swap, replace), VLM (LLaVA-1.5 vs miniGPT4), and across different q and N values in Figures 5–9."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": true,
    109         "justification": "Section 6 discusses failure modes: vulnerability to ℓp-based attacks. Section 4.2 notes that small q=5% 'is insufficient to eliminate the concealed harmful context.' The paper also shows swap perturbation is less effective."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper reports that swap perturbation has 'significantly higher' ASR than mask and replace (§4.1), that q=5% with low N has high distortion rates (§4.2), and that the defense fails against ℓp attacks (§6)."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": false,
    121         "justification": "The abstract claims SmoothVLM 'significantly outperforms existing defense strategies, achieving state-of-the-art results in both detection accuracy and model performance retention.' However, no competing defense methods are compared against in the experiments. The ASR and recovery numbers are supported by the figures, but the state-of-the-art claim is unsubstantiated."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper's main causal claim—that random pixel masking disrupts adversarial patches—is supported by controlled experiments varying q and N (Figures 2, 5), theoretical analysis (Proposition 4, Appendix A), and adaptive attack experiments (Figure 3, §3.3). The ablation-style design isolates the effect of perturbation type and rate."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The title claims protection for 'Vision-Language Models' generally, but evaluation is limited to two VLMs (LLaVA-1.5 and miniGPT4) with one image size (224×224). The paper does not bound its claims to these specific models."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper argues q-instability arises from VLM next-token prediction characteristics (§3.3) but does not consider alternative explanations such as the adversarial examples being inherently fragile regardless of model architecture, or whether the oracle LLM's judgment is reliable."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Attack success is determined by an oracle LLM (Vicuna-30B) judging semantic similarity between model output and target adversarial content. The paper does not discuss limitations of this proxy or validate that Vicuna-30B's judgments align with actual attack success as perceived by humans."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": true,
    148         "justification": "LLaVA-v1.5-13b is specified (§3.2), miniGPT4 is named (though no version suffix), Vicuna-30B is specified as the oracle, and openai/clip-vit-base-patch32 is specified as the attack model. The primary models have sufficient specificity."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": false,
    153         "justification": "The textual prompts used with the VLMs are not provided. The paper uses notation '[xadv, ∅]' where ∅ appears to denote an empty or default prompt, but the actual text sent to the VLMs is not disclosed."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": true,
    158         "justification": "Key hyperparameters are reported: image size 224×224, perturbation percentages q ∈ {5, 10, 15, 20}, number of samples N ∈ {2, 4, 6, 8, 10}, 8000 optimization iterations for attacks, patch size parameters m×n (§3.1)."
    159       },
    160       "scaffolding_described": {
    161         "applies": false,
    162         "answer": false,
    163         "justification": "No agentic scaffolding is used. SmoothVLM is a defense wrapper that applies random perturbation and majority voting, not an agentic system."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper states '300 adversarial examples' were used and 'ensure that the attacks successfully launch on the images' (§3.2, §4.1), but does not describe the source of the base images, how they were selected, or the full pipeline from source images to adversarial examples."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": true,
    175         "justification": "Section 6 is titled 'Limitations' and provides substantive discussion of the defense's scope and known weaknesses."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": true,
    180         "justification": "Section 6 identifies specific threats: 'defense mechanism primarily addresses patch-based visual prompt injections and remains vulnerable to ℓp based adversarial attacks' and 'potential risk that our SmoothVLM may fail under stronger attacks beyond our threat model.'"
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "Section 6 explicitly states SmoothVLM only addresses patch-based attacks with ℓ0 constraints and acknowledges vulnerability to ℓp attacks. The authors frame the work as 'an initial step toward establishing certified robustness in VLMs.'"
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "Neither the source images nor the generated adversarial examples are available for independent verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The paper states 300 adversarial examples were generated using JIP and VAE optimization methods, but the source/origin of the base images is not described. The attack optimization process is documented (§3.1) but the input data pipeline is not."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": true,
    201         "answer": false,
    202         "justification": "The study involves no human participants, so the closest analogue to recruitment is the selection of base images for adversarial example generation, which is not described. The source dataset or image selection criteria are not mentioned."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The adversarial optimization process is described (JIP and VAE methods in §3.1), but the full pipeline from source images to final evaluation results has gaps: the image source is unknown, and the filtering criteria for the 300 examples are not specified beyond the statement that the authors 'ensure that the attacks successfully launch on the images' (§3.2, §4.1)."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding sources, grants, or sponsorship are mentioned anywhere in the paper."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: University of Michigan Ann Arbor, Michigan State University, and University of Wisconsin Madison. No commercial product is being evaluated."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "No funding is disclosed, making it impossible to assess funder independence."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": false,
    235         "answer": false,
    236         "justification": "This paper evaluates a defense mechanism against adversarial attacks, not a pre-trained model's capability on a knowledge benchmark. Contamination is not a relevant concern."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": false,
    240         "answer": false,
    241         "justification": "The paper tests a non-parametric defense against adversarial patches, not model knowledge on a benchmark. Train/test overlap is not applicable."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "The adversarial examples are crafted specifically for the evaluation; benchmark contamination in the traditional sense does not apply."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Section 4.3 reports defense inference time: 'less than 1 minute under N=10' using Vicuna-30B, compared to 30 minutes per adversarial example for the JIP attack on one A100. This makes the defense '30x faster than the fastest attack method.'"
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Section 4.3 and §3.3 state: JIP optimization takes ~30 minutes on one A100, EOT training takes ~50 minutes for 8000 epochs on one A100. The defense requires N=10–20 VLM inference calls, compared to 100,000 for classic randomized smoothing."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No results across multiple random seeds are reported. The random masking in SmoothVLM involves stochasticity, but seed sensitivity is not analyzed."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of complete evaluation runs is not stated. 300 adversarial examples are evaluated, and N samples per input are drawn, but it is unclear whether the full evaluation was repeated multiple times."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The paper explores q and N values systematically but does not report a hyperparameter search budget or describe how the final recommended settings were determined."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "The paper reports results across all tested configurations (q ∈ {5, 10, 15, 20}, N ∈ {2, 4, 6, 8, 10}) in Figures 5–9 rather than cherry-picking a single best configuration."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": false,
    321         "answer": false,
    322         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors evaluate their own defense without acknowledging potential evaluation bias. No independent evaluation or discussion of author-evaluation bias is present."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "Section 4.3 explicitly discusses the compute cost of attack (30 min) vs defense (<1 min) and compares SmoothVLM's 10–20 runs per input against classic randomized smoothing's 100,000 runs."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "The paper does not discuss whether the ASR metric (determined by Vicuna-30B oracle) validly measures defense success, or whether the 300 adversarial examples are representative of real-world visual prompt injection threats."
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "No scaffolding is involved in the evaluation. SmoothVLM is a direct defense wrapper, not a scaffold."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": false,
    348         "answer": false,
    349         "justification": "Adversarial examples are crafted specifically for the target models; temporal leakage is not a meaningful concern for defense evaluation."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": false,
    353         "answer": false,
    354         "justification": "The defense processes adversarial images at inference time; feature leakage between train/test is not applicable to this non-parametric defense."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "The paper does not discuss whether the 300 adversarial examples are independent of each other or whether they share structural similarities (e.g., same base images, similar optimization trajectories)."
    360       },
    361       "leakage_detection_method": {
    362         "applies": false,
    363         "answer": false,
    364         "justification": "Not applicable to adversarial defense evaluation where test examples are specifically crafted, not drawn from a pre-existing dataset."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "SmoothVLM reduces the attack success rate to 0–5% on LLaVA-1.5 and miniGPT4.",
    371       "evidence": "Figures 5 and 8 show ASR across q ∈ {5, 10, 15, 20} and N ∈ {2, 4, 6, 8, 10} for both JIP and VAE attacks on both VLMs. With sufficient q and N, ASR consistently drops below 5%.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "SmoothVLM achieves 67.3% to 95.0% context recovery of benign images.",
    376       "evidence": "Figures 6 and 9 show distortion rates for various q and N. Recovery rates improve with higher q and N. The specific range is stated in the abstract and supported by the figures.",
    377       "supported": "moderate"
    378     },
    379     {
    380       "claim": "Patched adversarial prompts exhibit visual q-instability — sensitivity to pixel-wise randomization.",
    381       "evidence": "Figure 2 demonstrates ASR drops substantially when q% of pixels in the adversarial patch are randomly perturbed with mask, swap, or replace operations on 300 verified adversarial examples (§3.2).",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Adaptive attacks (Expectation over Transformation) are ineffective at breaking the q-instability.",
    386       "evidence": "Figure 3 shows EOT attack ASR remains low after masking, and the right subplot shows the optimization loss fails to reach the success threshold (0.4) with masking. However, only one adaptive attack formulation was tested (§3.3).",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "SmoothVLM is more than 30x faster than the fastest attack method.",
    391       "evidence": "Section 4.3 states defense takes <1 minute with N=10 using Vicuna-30B, while JIP attack optimization requires ~30 minutes per example on one A100.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "SmoothVLM significantly outperforms existing defense strategies, achieving state-of-the-art results.",
    396       "evidence": "Stated in abstract and contributions, but no competing defense methods are evaluated or compared against in the experiments.",
    397       "supported": "unsupported"
    398     }
    399   ],
    400   "red_flags": [
    401     {
    402       "flag": "Missing baseline comparisons",
    403       "detail": "The abstract and contributions claim SmoothVLM 'significantly outperforms existing defense strategies' and achieves 'state-of-the-art,' but no competing defense methods (DiffPure, PatchCleanser, etc.) are included in the evaluation. The only baseline is the undefended model."
    404     },
    405     {
    406       "flag": "No error bars or uncertainty quantification",
    407       "detail": "All ASR and distortion results are reported as point estimates without confidence intervals, error bars, or variance across runs. With random masking as a core component, result variability should be quantified."
    408     },
    409     {
    410       "flag": "Unvalidated oracle metric",
    411       "detail": "Attack success is determined by Vicuna-30B as an oracle LLM judge (§3.2, §4). The validity of this proxy is not assessed against human judgment, and no inter-rater agreement is reported."
    412     },
    413     {
    414       "flag": "Undisclosed image sources",
    415       "detail": "The base images from which the 300 adversarial examples were generated are never identified. Their number, source, selection criteria, and characteristics are unknown, making it impossible to assess whether the evaluation is representative."
    416     },
    417     {
    418       "flag": "Unjustified sample size",
    419       "detail": "300 adversarial examples are used with no justification for this number. Without power analysis or variance estimates, it is unclear whether 300 is sufficient for the claims being made."
    420     },
    421     {
    422       "flag": "Claims outrun evidence",
    423       "detail": "The paper tests on two VLMs (LLaVA-1.5, miniGPT4) with 224×224 images but the title and claims address 'Vision-Language Models' generally. The generalization to other VLMs, image sizes, or attack types is unsubstantiated."
    424     }
    425   ],
    426   "cited_papers": [
    427     {
    428       "title": "GPT-4 Technical Report",
    429       "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"],
    430       "year": 2023,
    431       "arxiv_id": "2303.08774",
    432       "relevance": "Foundational LLM whose multimodal capabilities motivate VLM security research."
    433     },
    434     {
    435       "title": "SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks",
    436       "authors": ["Alexander Robey", "Eric Wong", "Hamed Hassani", "George J Pappas"],
    437       "year": 2023,
    438       "arxiv_id": "2310.03684",
    439       "relevance": "Direct precursor defense method for text-based LLM jailbreaks that SmoothVLM adapts to the visual domain."
    440     },
    441     {
    442       "title": "Visual Adversarial Examples Jailbreak Large Language Models",
    443       "authors": ["Xiangyu Qi", "Kaixuan Huang", "Ashwinee Panda", "Mengdi Wang", "Prateek Mittal"],
    444       "year": 2023,
    445       "arxiv_id": "2306.13213",
    446       "relevance": "One of two attack methods (VAE) directly evaluated in this paper; demonstrates visual prompt injection on VLMs."
    447     },
    448     {
    449       "title": "Jailbreak in Pieces: Compositional Adversarial Attacks on Multi-Modal Language Models",
    450       "authors": ["Erfan Shayegani", "Yue Dong", "Nael Abu-Ghazaleh"],
    451       "year": 2023,
    452       "arxiv_id": "2307.14539",
    453       "relevance": "The other attack method (JIP) evaluated in this paper; proposes compositional adversarial attacks on VLMs."
    454     },
    455     {
    456       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    457       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    458       "year": 2023,
    459       "arxiv_id": "2307.15043",
    460       "relevance": "Foundational work on adversarial attacks against aligned LLMs, directly relevant to jailbreak robustness."
    461     },
    462     {
    463       "title": "Jailbreaking Black Box Large Language Models in Twenty Queries",
    464       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J Pappas", "Eric Wong"],
    465       "year": 2023,
    466       "arxiv_id": "2310.08419",
    467       "relevance": "Demonstrates automated jailbreak discovery for LLMs, contributing to understanding of LLM vulnerabilities."
    468     },
    469     {
    470       "title": "Image Hijacks: Adversarial Images Can Control Generative Models at Runtime",
    471       "authors": ["Luke Bailey", "Euan Ong", "Stuart Russell", "Scott Emmons"],
    472       "year": 2023,
    473       "arxiv_id": "2309.00236",
    474       "relevance": "Demonstrates that adversarial images can hijack VLM behavior at runtime, directly relevant to visual prompt injection."
    475     },
    476     {
    477       "title": "Visual Instruction Tuning",
    478       "authors": ["Haotian Liu", "Chunyuan Li", "Qingyang Wu", "Yong Jae Lee"],
    479       "year": 2024,
    480       "relevance": "LLaVA model paper — one of two VLMs evaluated as a target in this defense work."
    481     },
    482     {
    483       "title": "MiniGPT-4: Enhancing Vision-Language Understanding with Advanced Large Language Models",
    484       "authors": ["Deyao Zhu", "Jun Chen", "Xiaoqian Shen", "Xiang Li", "Mohamed Elhoseiny"],
    485       "year": 2023,
    486       "arxiv_id": "2304.10592",
    487       "relevance": "Second VLM evaluated as a defense target; combines visual encoders with LLMs for multimodal understanding."
    488     },
    489     {
    490       "title": "Prompt Injection Attack Against LLM-Integrated Applications",
    491       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li", "Kailong Wang"],
    492       "year": 2023,
    493       "arxiv_id": "2306.05499",
    494       "relevance": "Directly relevant attack methodology for LLM-integrated applications that motivates defenses like SmoothVLM."
    495     },
    496     {
    497       "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection",
    498       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    499       "year": 2023,
    500       "relevance": "Demonstrates indirect prompt injection in real-world LLM applications, broadening the attack surface discussion."
    501     },
    502     {
    503       "title": "DiffPure: Diffusion Models for Adversarial Purification",
    504       "authors": ["Weili Nie", "Brandon Guo", "Yujia Huang", "Chaowei Xiao", "Arash Vahdat", "Anima Anandkumar"],
    505       "year": 2022,
    506       "arxiv_id": "2205.07460",
    507       "relevance": "Alternative defense approach using diffusion purification against adversarial attacks, discussed as motivation for patch-based threat model."
    508     },
    509     {
    510       "title": "Certified Adversarial Robustness via Randomized Smoothing",
    511       "authors": ["Jeremy Cohen", "Elan Rosenfeld", "Zico Kolter"],
    512       "year": 2019,
    513       "relevance": "Foundational certified defense technique that SmoothVLM builds upon, requiring 100,000 runs vs SmoothVLM's 10–20."
    514     },
    515     {
    516       "title": "Optimization-Based Prompt Injection Attack to LLM-as-a-Judge",
    517       "authors": ["Jiawen Shi", "Zenghui Yuan", "Yinuo Liu", "Yue Huang", "Pan Zhou", "Lichao Sun", "Neil Zhenqiang Gong"],
    518       "year": 2024,
    519       "arxiv_id": "2403.17710",
    520       "relevance": "Attacks on LLM-as-judge paradigm, relevant to understanding vulnerabilities in automated evaluation used by SmoothVLM."
    521     }
    522   ],
    523   "engagement_factors": {
    524     "practical_relevance": {
    525       "score": 2,
    526       "justification": "Proposes a defense mechanism applicable to VLMs in production, though no code release limits immediate adoption."
    527     },
    528     "surprise_contrarian": {
    529       "score": 1,
    530       "justification": "Extends known randomized smoothing techniques to VLMs; the q-instability finding is incremental rather than paradigm-shifting."
    531     },
    532     "fear_safety": {
    533       "score": 2,
    534       "justification": "Addresses visual prompt injection attacks that could manipulate VLM outputs in safety-critical applications."
    535     },
    536     "drama_conflict": {
    537       "score": 0,
    538       "justification": "No controversy or dramatic claims; straightforward defense proposal and evaluation."
    539     },
    540     "demo_ability": {
    541       "score": 0,
    542       "justification": "No code repository, demo, or tool released."
    543     },
    544     "brand_recognition": {
    545       "score": 1,
    546       "justification": "University research teams; paper references well-known models (GPT-4, LLaVA) but is from relatively lesser-known labs."
    547     }
    548   }
    549 }

Impressum · Datenschutz