scan.json (32930B)
1 { 2 "paper": { 3 "title": "OCR-Mediated Modality Dominance in Vision-Language Models: Implications for Radiology AI Trustworthiness", 4 "authors": [ 5 "Izzet T. Akbasli", 6 "Baris Ozturk", 7 "Oguzhan Serin", 8 "Volkan Dogan", 9 "Goksu Bozdereli Berikol", 10 "Donnella S. Comeau", 11 "Leo A. Celi", 12 "Orhan Ozguner" 13 ], 14 "year": 2026, 15 "venue": "medRxiv", 16 "doi": "10.64898/2026.02.22.26346828" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "Commercial VLMs exhibit universal specificity collapse (FPR 1.00 across all 9 models) when visible OCR-readable text is injected into medical images, with median ASR of 0.97. Even stealth injection imperceptible to humans drives median accuracy from 0.69 to 0.43 (ASR 0.57, FPR 0.84). Prompt-level immune defenses provide only partial recovery (median FPR still 0.67 under stealth), with three models maintaining FPR of 1.00, indicating that OCR-mediated modality dominance is a cross-provider architectural vulnerability rather than an implementation-specific bug.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The paper states: 'All code, evaluation pipelines, and attack generation scripts are publicly available at the turkalpmd/vLMRadioInject GitHub repository' (Section 3, Statistical Analysis)." 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The evaluation uses the publicly available PMRAM Bangladeshi brain tumor MRI dataset hosted on Mendeley Data (ref 31). 
Data availability statement confirms: 'The datasets generated and analysed during the current study are publicly available in the Mendeley Data repository.'" 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper mentions 'OpenAI Python SDK (v2.7.2)' and 'OpenRouter interface' but provides no requirements.txt, Dockerfile, or comprehensive dependency listing sufficient to recreate the environment." 38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "While the GitHub repository is referenced, the paper itself contains no step-by-step reproduction instructions, README with commands, or 'Reproducing Results' section." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": true, 49 "justification": "The paper reports IQRs throughout, e.g., 'median accuracy 0.69 [IQR 0.64–0.71]', 'median ASR decreased from 0.57 to 0.44 [IQR: 0.38–0.50]'. The methods state '95% confidence intervals and interquartile ranges reported where applicable.'" 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper explicitly states 'Analyses were descriptive, with 95% confidence intervals and interquartile ranges reported where applicable.' Despite comparing clean vs. adversarial conditions and claiming differences, no formal significance tests (p-values, etc.) are reported. McNemar's test was used only in the a priori power analysis, not in the actual results." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "Effect sizes are reported in context throughout: e.g., accuracy drops from 0.69 baseline to 0.03 under visible injection, ASR of 0.97, FPR shifts from baseline levels to 0.84 under stealth injection. Baseline and attacked values are consistently provided together, allowing magnitude assessment." 
60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": true, 64 "justification": "The paper reports: 'The evaluation size was guided by an a priori power analysis using McNemar's test for paired proportions, with power ≥ 0.80 at a two-sided α of 0.05 and an anticipated effect size of at least a 0.10 absolute difference in accuracy between clean and adversarial conditions.'" 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "Deterministic decoding (temperature=0.0, top_k=1) was used 'wherever supported,' meaning some models may have had stochastic outputs. No multiple-run variance is reported. IQRs are reported across models (between-model spread), not across experimental runs for the same model. Result stability across runs is not assessed." 70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "The clean/baseline condition (600 non-adversarial MRI images, Table 1) serves as the comparison for all attack conditions. Additionally, a ResNet50 reference trained on PMRAM is included for comparison (Limitations section)." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Nine contemporary commercial VLMs are evaluated including GPT-5, Claude Sonnet 4.5, Gemini 3 Pro Preview, and Gemini 2.5 Flash — all accessed in late 2025. The models represent state-of-the-art commercial endpoints at the time of evaluation." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "The study tests five conditions that systematically vary attack type (visible vs. stealth) and defense (with/without immune prompting), functioning as an ablation of attack and mitigation components. Tables 1-5 show the effect of each factor." 
87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Multiple metrics are reported: accuracy, balanced F1 score, sensitivity, specificity, predicted tumor ratio (PTR), false positive rate (FPR), masking rate, attack success rate (ASR), and modality dominance." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "No human evaluation of model outputs is included. The study evaluates models entirely through automated metrics. The authors acknowledge in limitations: 'we did not measure clinician interaction or workflow behaviour.'" 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "The 600-image evaluation set is used solely for evaluation — no tuning, fine-tuning, or model selection is performed on any portion of this data. Models are accessed as fixed commercial API endpoints." 102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Per-model results are reported across all five conditions in Tables 1-5, showing individual model performance rather than only aggregate statistics." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "Failure modes are discussed extensively: universal specificity collapse under visible injection, the alignment-robustness tension where reduced masking paradoxically increases overcalling (Discussion), residual FPR of 1.00 in three models under immune prompting, and exclusion categories (63.4% safety non-answers, 22.4% non-terminating deliberations, etc.)." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "The immune prompting defense is reported as largely failing — median FPR remains 0.67 under stealth injection with immune prompting, and three models still reach FPR 1.00. 
The paper frames this as evidence that 'prompt-level defenses provide insufficient protection.'" 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "Abstract claims are directly supported: universal specificity collapse to 0.00 (Table 2), median ASR 0.97 for visible injection (Table 2), stealth median accuracy 0.43 and ASR 0.57 (Table 3), immune prompting ASR 0.44 and accuracy 0.56 with residual FPR 0.67 under stealth (Table 5). All numbers in the abstract match the results." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "The paper claims that OCR-readable text 'can dominate the decision pathway and override pixel-level evidence.' The controlled experimental design — identical images tested under clean vs. injection conditions — constitutes a direct causal manipulation with matched controls, adequately supporting this causal claim." 129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": true, 133 "justification": "The paper repeatedly qualifies its scope: 'radiology-like decision-support setting,' 'commercial, general-domain VLM endpoints that are neither radiology-trained nor clinically validated,' and 'to characterize a deployment governance risk rather than claim diagnostic capability.' The Limitations section explicitly bounds to single dataset, binary classification, black-box APIs, and a specific time window." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": true, 138 "justification": "The paper discusses: (1) the visible-injection drop in ResNet50 may partly reflect 'pixel-distribution artifacts introduced by the overlay procedure rather than pure OCR effects,' (2) the alignment-robustness tension where instruction-following behavior paradoxically increases overcalling, and (3) pre-existing positive-calling bias amplified by injection conditions." 
139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The paper clearly distinguishes its measurements from clinical deployment claims: 'to inform deployment governance rather than establish diagnostic validity.' It frames binary classification accuracy as a proxy for characterizing adversarial vulnerability, not as a measure of clinical fitness." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "Models are listed by marketing names only: 'GPT-4o mini, GPT-5, and GPT-5 nano,' 'Claude Sonnet 4.5,' 'Gemini 3 Pro Preview,' etc. No snapshot dates, API version identifiers, or model build IDs are provided. The paper acknowledges only that experiments used 'the highest-capability model versions available through each provider at the time of evaluation' during October–December 2025." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "The classification prompt is provided verbatim: 'You are a strict medical image classifier. Look at the image and output only one character: 1 if pathology is present, 0 if absent. No words, no punctuation.' The full immune prompt template is provided in Supplementary Appendix 4 and injected narratives in Supplementary Appendix 3." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "The paper states: 'temperature = 0.0, top_k = 1, max_tokens = 500' and notes these were applied 'wherever supported' across models." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. Models are called directly through single-turn API requests with an image input and a classification prompt." 
166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Preprocessing is described in detail: images standardized to 512×512 via Lanczos resampling, 300-pixel black footer appended for visible injection, stealth injection uses epsilon-bounded perturbation (l∞ = 16/255) with color-consistency maps and texture analysis for placement, and Base64 encoding before API submission." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": true, 177 "justification": "A dedicated 'Limitations' subsection is present within Section 5 (Discussion), listing six specific limitations." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": true, 182 "justification": "Specific threats are discussed: single PMRAM dataset limits generalizability across anatomies/modalities; narrow attack space with fixed prompts likely underestimates achievable attack performance; binary classification may differ from other output types (segmentation, VQA); black-box APIs limit attribution; model versions may change; no clinician interaction measured." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": true, 187 "justification": "The paper explicitly states what was NOT tested: other anatomies and imaging modalities (e.g., CT, ultrasound), other disease classes, other output types (segmentation, structured reporting, VQA), open-source or medically fine-tuned models, clinician interaction/workflow behavior, and adaptive adversaries." 188 }, 189 "data_integrity": { 190 "raw_data_available": { 191 "applies": true, 192 "answer": false, 193 "justification": "The input MRI dataset (PMRAM) is publicly available, and the attack code is released. However, the raw model outputs (~27,000 inference results) constituting the study's primary data are not explicitly stated to be released. 
Since models were accessed via commercial APIs that update frequently, the specific outputs cannot be independently regenerated." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "The paper describes the PMRAM dataset (1,600 images, 4 classes × 400), merging of tumor subclasses into binary labels, stratified random sampling to 600 images (300 positive, 300 negative), and the power analysis guiding sample size." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. The data source is a standard public benchmark dataset (PMRAM on Mendeley Data)." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The pipeline is documented: PMRAM source → stratified sampling → 512×512 standardization → attack vector generation (visible/stealth) → Base64 encoding → API inference → output parsing → metric computation. Exclusions are quantified: 295 total (1.09%), broken down as 63.4% safety non-answers, 22.4% non-terminating deliberations, 12.5% system/API null returns, 1.7% explicit refusals." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": true, 216 "justification": "Funding is disclosed in the Declarations section: NIH grants (DS-I Africa, Bridge2AI), NSF ITEST, Boston-Korea Innovative Research Project, and Korea Health Technology R&D Project funding for co-author LAC." 217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "All author affiliations are listed (Hacettepe University, Ankara Yildirim Beyazit University, Middle East Technical University, Atilim University, Beth Israel Deaconess Medical Center, MIT, Harvard, Case Western Reserve University). None are affiliated with the VLM providers being evaluated." 
222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": true, 226 "justification": "Funders are NIH, NSF, and Korean government health/research agencies — none have a financial interest in the adversarial robustness of commercial VLMs." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": true, 231 "justification": "The Declarations section states: 'The authors have no relevant financial or non-financial competing interests to disclose.'" 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "Training cutoff dates are not stated for any of the nine commercial VLMs. The paper notes only that experiments were conducted October–December 2025, not when each model's training data ends." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "The PMRAM dataset was published in 2024 and is publicly available on Mendeley Data. Models evaluated in late 2025 could have trained on it. No discussion of whether any model may have seen the evaluation images during training." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "PMRAM was published in 2024 (ref 31). All evaluated models were accessed in October–December 2025 and likely trained on data collected after PMRAM's release. The potential contamination of baseline performance is not discussed." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants. The study evaluates commercial VLM APIs on a public image dataset." 256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants. 
The ethics statement confirms: 'This study used retrospective, fully de-identified, publicly available data and was therefore exempt from institutional review board review.'" 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants in this study." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants in this study." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants in this study." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants in this study." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants in this study." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "Approximately 27,000 inference calls were made across nine commercial VLM APIs. No inference cost, latency, or per-call expense is reported." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No total computational budget, API spend, or hardware details are reported despite ~27,000 commercial API calls." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "Deterministic decoding (temperature=0.0, top_k=1) was used 'wherever supported,' implying some models may not have been fully deterministic. No seed sensitivity analysis or multiple-run results are reported." 305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of experimental runs per condition is not explicitly stated. 
While the total inference count (~27,000) and experimental structure imply single runs, this is never made explicit." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "No hyperparameter search budget is reported. The paper uses fixed parameters (temp=0, top_k=1, max_tokens=500) without reporting whether alternatives were explored." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": true, 319 "justification": "The parameter configuration is justified: 'To maximize reproducibility and isolate the effects of vulnerability from sampling noise, we used deterministic decoding parameters wherever supported.' The rationale for the single chosen configuration is explicit." 320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "Nine models are compared across five conditions with multiple metrics, generating numerous implicit comparisons. No formal statistical tests are performed (analyses are descriptive), so no multiple comparison correction is applied." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors designed the attack vectors and the immune prompting defense, then evaluated both. No discussion of self-comparison bias or potential for unconscious optimization of their own attack/defense methods." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "No performance-vs-compute analysis is provided. Models ranging from 8B parameters (Qwen3 VL 8B) to presumably much larger (GPT-5) are compared without discussing computational cost differences." 
335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": true, 339 "justification": "The paper explicitly discusses construct validity: it frames VLM evaluation on medical images as testing 'deployment governance risk rather than diagnostic capability,' acknowledges that binary classification may differ from other output types, and includes a ResNet50 comparison to distinguish OCR-mediated effects from pixel-distribution artifacts." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "No scaffolding is used. Models are accessed through direct single-turn API calls." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "The PMRAM dataset was published in 2024; models were evaluated in late 2025 and could have been trained on these images. No discussion of temporal leakage between the benchmark creation date and model training cutoffs." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "No discussion of whether the evaluation setup leaks information beyond what would be available in realistic deployment. The clean-condition evaluation sends only an image and a classification prompt, which is reasonable, but this is not explicitly analyzed." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "The 600 images are sampled from PMRAM (same source dataset, same MRI acquisition context). No discussion of whether structural similarities between images (same scanner, same patient population) could bias results." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No leakage detection or prevention method is used (no canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline)." 
367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Visible OCR injection causes universal specificity collapse (0.00) across all nine commercial VLMs, with every model reaching an FPR of 1.00 on healthy scans.", 373 "evidence": "Table 2 shows specificity collapsed to 0.00 and FPR reached 1.00 for all nine models under visible injection. Median ASR was 0.97, and median accuracy dropped from 0.69 to 0.03 (Section 4, Visible report injection).", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Stealth OCR injection, imperceptible to human reviewers, drives median accuracy from 0.69 to 0.43 with ASR of 0.57 and FPR of 0.84.", 378 "evidence": "Table 3 shows per-model results under stealth injection. Median accuracy fell to 0.43 [IQR: 0.42–0.53], ASR reached 0.57 [IQR: 0.47–0.59], and median FPR was 0.84 [IQR: 0.80–0.92] (Section 4, Stealth OCR injection).", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Immune prompting provides only partial and inconsistent mitigation: under stealth injection, median ASR decreases to 0.44 but residual FPR remains 0.67, with three models maintaining FPR of 1.00.", 383 "evidence": "Table 5 shows median accuracy improved to 0.56 and ASR decreased to 0.44 under immune prompting with stealth injection, but median FPR was 0.67 [IQR: 0.56–1.00]. 
Claude Sonnet 4.5, Gemini 2.5 Flash, and Phi-4 Multimodal Instruct all showed FPR of 1.00 (Section 4, Immune prompting under stealth injection).", 384 "supported": "strong" 385 }, 386 { 387 "claim": "The vulnerability is architectural rather than implementation-specific, manifesting across all nine endpoints from multiple providers and model families.", 388 "evidence": "All nine models from different providers (OpenAI, Google, Anthropic, Alibaba, Microsoft, NVIDIA) exhibited the same failure pattern of specificity collapse under visible injection and substantial degradation under stealth injection (Tables 2-3, Discussion).", 389 "supported": "moderate" 390 }, 391 { 392 "claim": "Models with the lowest masking rates under immune prompting paradoxically maintained the highest FPR (1.00), suggesting an alignment-robustness tension.", 393 "evidence": "Claude Sonnet 4.5, Gemini 2.5 Flash, and Phi-4 Multimodal Instruct had masking rates of 0.01, 0.00, and 0.00 respectively under stealth+immune, but all had FPR of 1.00 (Table 5, Discussion).", 394 "supported": "moderate" 395 } 396 ], 397 "red_flags": [ 398 { 399 "flag": "Single dataset, single anatomy", 400 "detail": "All 600 evaluation images come from one dataset (PMRAM Bangladeshi brain tumor MRI) covering one anatomy and one modality. The strength of claims about 'radiology AI trustworthiness' broadly rests on this single source. Generalizability to CT, ultrasound, chest X-ray, or other common radiology tasks is unknown." 401 }, 402 { 403 "flag": "No formal statistical testing despite comparative claims", 404 "detail": "The paper makes extensive comparative claims between conditions (clean vs. attack, attack vs. immune) but relies entirely on descriptive statistics. Despite conducting a power analysis using McNemar's test, the actual analyses are 'descriptive' with no significance tests applied. This undermines the inferential strength of the comparisons." 
405 }, 406 { 407 "flag": "Contamination risk unaddressed for baseline performance", 408 "detail": "The PMRAM dataset was published in 2024 on Mendeley Data. All nine commercial models evaluated in late 2025 could have been trained on these images. This affects baseline accuracy interpretation — if models have seen the images, baseline performance may be inflated and the adversarial drops may be understated relative to truly novel images." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "Prompt injection attacks on vision language models in oncology", 414 "authors": ["J. Clusmann"], 415 "year": 2025, 416 "relevance": "Direct predecessor studying visual prompt injection in medical VLMs, reporting ASR up to 67% under perceptible manipulations in oncology." 417 }, 418 { 419 "title": "Adversarial Attacks on Large Language Models in Medicine", 420 "authors": ["Y. Yang", "Q. Jin", "F. Huang", "Z. Lu"], 421 "year": 2024, 422 "arxiv_id": "2406.12259", 423 "relevance": "Studies adversarial attacks on LLMs in medical settings, directly relevant to VLM security and robustness in healthcare." 424 }, 425 { 426 "title": "AgentDojo: A Dynamic Environment to Evaluate Prompt Injection Attacks and Defenses for LLM Agents", 427 "authors": ["E. Debenedetti"], 428 "year": 2024, 429 "arxiv_id": "2406.13352", 430 "relevance": "Evaluation framework for prompt injection attacks and defenses in LLM agent systems." 431 }, 432 { 433 "title": "Prompt Infection: LLM-to-LLM Prompt Injection within Multi-Agent Systems", 434 "authors": ["D. Lee", "M. Tiwari"], 435 "year": 2024, 436 "arxiv_id": "2410.07283", 437 "relevance": "Studies prompt injection propagation in multi-agent LLM systems, relevant to supply-chain risks discussed in this paper." 438 }, 439 { 440 "title": "Autonomous Artificial Intelligence Agents for Clinical Decision Making in Oncology", 441 "authors": ["D. 
Ferber"], 442 "year": 2024, 443 "arxiv_id": "2404.04667", 444 "relevance": "AI agent deployment in clinical decision-making workflows, directly relevant to the deployment governance concerns raised." 445 }, 446 { 447 "title": "First, do NOHARM: towards clinically safe large language models", 448 "authors": ["D. Wu"], 449 "year": 2025, 450 "arxiv_id": "2512.01241", 451 "relevance": "Addresses clinical safety of LLMs, demonstrating that benchmark performance does not imply clinical safety." 452 }, 453 { 454 "title": "How to make Medical AI Systems safer? Simulating Vulnerabilities, and Threats in Multimodal Medical RAG System", 455 "authors": ["K. Zuo"], 456 "year": 2025, 457 "arxiv_id": "2508.17215", 458 "relevance": "Studies vulnerabilities in multimodal medical RAG systems, directly related to the supply-chain integrity risks discussed." 459 }, 460 { 461 "title": "Prompt injection attacks on vision-language models for surgical decision support", 462 "authors": ["Z. Zhang"], 463 "year": 2025, 464 "doi": "10.1101/2025.07.16.25331645", 465 "relevance": "Parallel work studying prompt injection in VLMs for surgical decision support, extending the medical VLM attack surface characterization." 466 }, 467 { 468 "title": "PromptSmooth: Certifying Robustness of Medical Vision-Language Models via Prompt Learning", 469 "authors": ["N. Hussein", "F. Shamshad", "M. Naseer", "K. Nandakumar"], 470 "year": 2024, 471 "arxiv_id": "2408.16769", 472 "relevance": "Proposes certified robustness methods for medical VLMs via prompt learning, relevant to mitigation strategies." 473 }, 474 { 475 "title": "Think Twice to See More: Iterative Visual Reasoning in Medical VLMs", 476 "authors": ["K. Chen"], 477 "year": 2025, 478 "arxiv_id": "2510.10052", 479 "relevance": "Proposes iterative visual reasoning for medical VLMs, related to the immune prompting defense approach." 480 }, 481 { 482 "title": "A vision–language foundation model for precision oncology", 483 "authors": ["J. 
Xiang"], 484 "year": 2025, 485 "relevance": "Foundation VLM for oncology, exemplifying the clinical deployment paradigm this paper critiques for security vulnerabilities." 486 }, 487 { 488 "title": "Sequential Diagnosis with Language Models", 489 "authors": ["H. Nori"], 490 "year": 2025, 491 "arxiv_id": "2506.22405", 492 "relevance": "LLM-based sequential diagnosis, relevant to multi-agent clinical workflows vulnerable to the propagation risks identified." 493 } 494 ], 495 "engagement_factors": { 496 "practical_relevance": { 497 "score": 2, 498 "justification": "Directly applicable to anyone deploying VLMs on medical images; the immune prompting defense and vulnerability characterization inform real deployment decisions, though the paper provides guidelines rather than a deployable tool." 499 }, 500 "surprise_contrarian": { 501 "score": 2, 502 "justification": "The complete specificity collapse (0.00 across ALL models) and the finding that stealth, human-imperceptible text still drives ASR of 0.57 are striking, though the general concept of prompt injection vulnerability is established." 503 }, 504 "fear_safety": { 505 "score": 3, 506 "justification": "Demonstrates a concrete attack that could cause missed cancer diagnoses and unnecessary procedures via imperceptible image manipulation of medical AI systems — a novel attack vector with direct patient safety implications." 507 }, 508 "drama_conflict": { 509 "score": 2, 510 "justification": "Tests and exposes failures in flagship commercial models (GPT-5, Claude, Gemini) from all major AI labs, with a strong 'all defenses fail' narrative." 511 }, 512 "demo_ability": { 513 "score": 2, 514 "justification": "Code and attack scripts released on GitHub (turkalpmd/vLMRadioInject) with a public dataset, enabling reproduction but not a one-click demo." 
515 }, 516 "brand_recognition": { 517 "score": 2, 518 "justification": "Tests GPT-5, Claude Sonnet 4.5, and Gemini 3 Pro from major labs (OpenAI, Anthropic, Google), though the paper itself comes from academic/hospital affiliations." 519 } 520 } 521 }