scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29428B)
      1 {
      2   "paper": {
      3     "title": "Image-based Prompt Injection: Hijacking Multimodal LLMs through Visually Embedded Adversarial Instructions",
      4     "authors": [
      5       "Neha Nagaraja",
      6       "Lan Zhang",
      7       "Zhilong Wang",
      8       "Bo Zhang",
      9       "Pawan Patil"
     10     ],
     11     "year": 2025,
     12     "venue": "FLLM 2025",
     13     "arxiv_id": "2603.03637",
     14     "doi": "10.1109/FLLM67465.2025.11391218"
     15   },
     16   "scan_version": 3,
     17   "active_modules": [
     18     "experimental_rigor",
     19     "data_leakage"
     20   ],
     21   "methodology_tags": [
     22     "benchmark-eval"
     23   ],
     24   "key_findings": "Image-based prompt injection can reliably hijack GPT-4-turbo output by embedding adversarial text in images, with 100% attack success rate using visible neon purple text and up to 64% under stealth constraints using global region-averaged coloring with object-aware prefixing. The paper reports a font-scale threshold of about 0.3 for reliable injection (37.88% success at scale 0.30); scales below 0.20 yield near-zero success. Object-aware prefixing that names detected image objects boosts stealth ASR from 41% to 64%, and there is a clear trade-off between visual imperceptibility and model interpretability across three coloring strategies.",
     25   "claims": [
     26     {
     27       "claim": "IPI can reliably hijack model output in black-box settings, with repetition-based prompts (Prompt 1 and 5) achieving 100% attack success rate.",
     28       "evidence": "Table I shows Prompt 1 and Prompt 5 achieving 100% ASR across COCO images with neon purple text (Section IV.B). However, this uses highly visible text, not stealth embedding.",
     29       "supported": "moderate"
     30     },
     31     {
     32       "claim": "The most effective stealth configuration achieves up to 64% attack success rate.",
     33       "evidence": "Table V shows object-aware prefix + base prompt with +20 brightness offset achieving 64% ASR using global region-averaged coloring.",
     34       "supported": "moderate"
     35     },
     36     {
     37       "claim": "A font scale of approximately 0.3 or larger is required for reliable prompt injection.",
     38       "evidence": "Table II shows 0% success at scale 0.10, 1% at 0.15, 10% at 0.20, 26.75% at 0.25, and 37.88% at 0.30.",
     39       "supported": "moderate"
     40     },
     41     {
     42       "claim": "Object-aware prefixing increases ASR from 41% to 64% by suppressing visual grounding.",
     43       "evidence": "Table V compares base prompt only (41%) vs. object-aware prefix + base prompt (64%) under identical visual conditions (same font color, position, offset).",
     44       "supported": "moderate"
     45     },
     46     {
     47       "claim": "The technique is broadly generalizable to other multimodal LLMs.",
     48       "evidence": "Section V asserts generalizability ('the same principle can apply across different architectures') but experiments use only GPT-4-turbo. No other models tested.",
     49       "supported": "unsupported"
     50     }
     51   ],
     52   "red_flags": [
     53     {
     54       "flag": "Single model evaluation with broad generalizability claims",
     55       "detail": "All experiments use only GPT-4-turbo, yet the paper claims the attack is 'broadly generalizable to other models' (Section V) and the title says 'Multimodal LLMs' (plural). No evidence supports generalizability."
     56     },
     57     {
     58       "flag": "No human evaluation of stealth",
     59       "detail": "The paper's core claim involves visual imperceptibility ('near-invisible', 'stealth'), yet no human study evaluates whether the embedded prompts are actually invisible to humans. Stealth is assumed, not measured."
     60     },
     61     {
     62       "flag": "Conflation of visible and stealth results",
     63       "detail": "Table I reports 100% ASR using neon purple text that is clearly visible. The abstract's claim of 'reliably manipulate' conflates these visible-text results with the 64% stealth results, creating a misleading impression of overall effectiveness."
     64     },
     65     {
     66       "flag": "Unstated sample sizes",
     67       "detail": "The paper never states how many COCO images were used. Table II references '800 queries' but with 5 runs per image and 8 prompts, this implies only 20 images. The actual N is unclear throughout."
     68     },
     69     {
     70       "flag": "No statistical testing across any comparisons",
     71       "detail": "The paper compares 12 prompts, 5 font scales, 3 coloring strategies, and multiple brightness offsets with no statistical significance tests, confidence intervals, or variance measures despite running 5 trials per image."
     72     },
     73     {
     74       "flag": "Company affiliation without competing interests declaration",
     75       "detail": "Three of five authors are affiliated with Bytedance, which develops multimodal AI products. No competing interests statement is provided."
     76     }
     77   ],
     78   "cited_papers": [
     79     {
     80       "title": "Ignore previous prompt: Attack techniques for language models",
     81       "authors": ["F. Perez", "I. Ribeiro"],
     82       "year": 2022,
     83       "arxiv_id": "2211.09527",
     84       "relevance": "Foundational work on prompt injection attacks against language models, directly relevant to the attack taxonomy."
     85     },
     86     {
     87       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
     88       "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra", "C. Endres", "T. Holz", "M. Fritz"],
     89       "year": 2023,
     90       "arxiv_id": "2302.12173",
     91       "relevance": "Seminal work on indirect prompt injection through external content, establishing the threat model this paper extends to images."
     92     },
     93     {
     94       "title": "An early categorization of prompt injection attacks on large language models",
     95       "authors": ["S. Rossi", "A. M. Michel", "R. R. Mukkamala", "J. B. Thatcher"],
     96       "year": 2024,
     97       "arxiv_id": "2402.00898",
     98       "relevance": "Taxonomy of prompt injection attacks providing classification framework for this line of research."
     99     },
    100     {
    101       "title": "Empirical analysis of large vision-language models against goal hijacking via visual prompt injection",
    102       "authors": ["S. Kimura", "R. Tanaka", "S. Miyawaki", "J. Suzuki", "K. Sakaguchi"],
    103       "year": 2024,
    104       "arxiv_id": "2408.03554",
    105       "relevance": "Most directly related prior work on visual prompt injection and goal hijacking in VLMs."
    106     },
    107     {
    108       "title": "Eyes closed, safety on: Protecting multimodal LLMs via image-to-text transformation",
    109       "authors": ["Y. Gou", "K. Chen", "Z. Liu", "L. Hong", "H. Xu", "Z. Li", "D.-Y. Yeung", "J. T. Kwok", "Y. Zhang"],
    110       "year": 2024,
    111       "arxiv_id": "2403.09572",
    112       "relevance": "Proposes defense against visual attacks by replacing raw images with sanitized text descriptions."
    113     },
    114     {
    115       "title": "Image hijacks: Adversarial images can control generative models at runtime",
    116       "authors": ["L. Bailey", "E. Ong", "S. Russell", "S. Emmons"],
    117       "year": 2024,
    118       "arxiv_id": "2309.00236",
    119       "relevance": "Demonstrates adversarial images controlling generative model behavior at runtime, closely related attack paradigm."
    120     },
    121     {
    122       "title": "Safeguarding vision-language models against patched visual prompt injectors",
    123       "authors": ["J. Sun", "C. Wang", "J. Wang", "Y. Zhang", "C. Xiao"],
    124       "year": 2024,
    125       "arxiv_id": "2405.10529",
    126       "relevance": "Defense mechanism against visual prompt injection via patched images in VLMs."
    127     },
    128     {
    129       "title": "Figstep: Jailbreaking large vision-language models via typographic visual prompts",
    130       "authors": ["Y. Gong", "D. Ran", "J. Liu", "C. Wang", "T. Cong", "A. Wang", "S. Duan", "X. Wang"],
    131       "year": 2025,
    132       "arxiv_id": "2311.05608",
    133       "relevance": "Typographic visual prompts for jailbreaking VLMs, closely related attack vector using text in images."
    134     },
    135     {
    136       "title": "Visual adversarial examples jailbreak aligned large language models",
    137       "authors": ["X. Qi", "K. Huang", "A. Panda", "P. Henderson", "M. Wang", "P. Mittal"],
    138       "year": 2023,
    139       "arxiv_id": "2306.13213",
    140       "relevance": "White-box visual adversarial attack that jailbreaks aligned LLMs using crafted images."
    141     },
    142     {
    143       "title": "Abusing images and sounds for indirect instruction injection in multi-modal LLMs",
    144       "authors": ["E. Bagdasaryan", "T.-Y. Hsieh", "B. Nassi", "V. Shmatikov"],
    145       "year": 2023,
    146       "arxiv_id": "2307.10490",
    147       "relevance": "Indirect instruction injection through images and audio in multimodal LLMs, foundational attack work."
    148     },
    149     {
    150       "title": "Agent smith: A single image can jailbreak one million multimodal LLM agents exponentially fast",
    151       "authors": ["X. Gu", "X. Zheng", "T. Pang", "C. Du", "Q. Liu", "Y. Wang", "J. Jiang", "M. Lin"],
    152       "year": 2024,
    153       "arxiv_id": "2402.08567",
    154       "relevance": "Demonstrates scalability of image-based jailbreaks across millions of multimodal LLM agents."
    155     }
    156   ],
    157   "checklist": {
    158     "artifacts": {
    159       "code_released": {
    160         "applies": true,
    161         "answer": false,
    162         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The IPI pipeline (Algorithm 1) is described but no implementation is released."
    163       },
    164       "data_released": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "The paper uses the COCO dataset (Section IV.A), which is a publicly available standard benchmark. However, the generated adversarial images are not released."
    168       },
    169       "environment_specified": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "No requirements.txt, Dockerfile, conda environment, or library version details are provided. The paper mentions using SAM and GPT-4-turbo API but provides no environment specification."
    173       },
    174       "reproduction_instructions": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "No step-by-step reproduction instructions are provided. Algorithm 1 gives a high-level pseudocode pipeline but lacks the detail needed to reproduce results (e.g., specific COCO images used, API parameters, SAM configuration)."
    178       }
    179     },
    180     "statistical_methodology": {
    181       "confidence_intervals_or_error_bars": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "All results are reported as point estimates (e.g., '100%', '64%', '37.88%') in Tables I-V. No confidence intervals, error bars, or uncertainty measures are provided despite running 5 trials per image."
    185       },
    186       "significance_tests": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "The paper compares 12 prompt strategies, 5 font scales, 3 coloring strategies, and multiple brightness offsets without any statistical significance tests. Differences are compared by raw ASR numbers only."
    190       },
    191       "effect_sizes_reported": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "Results are reported as raw ASR percentages (e.g., Table I). While Table V shows a comparison (41% vs 64%), no formal effect sizes (Cohen's d, odds ratios) are reported, and most tables present isolated percentages without baseline context."
    195       },
    196       "sample_size_justified": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The number of COCO images used is never explicitly stated. Table II references '800 queries' but no justification is given for the sample size. No power analysis is discussed."
    200       },
    201       "variance_reported": {
    202         "applies": true,
    203         "answer": false,
    204         "justification": "Despite running each image 5 times per configuration (Section IV.A), no variance, standard deviation, or spread measures are reported. Only aggregate success counts and percentages are shown."
    205       }
    206     },
    207     "evaluation_design": {
    208       "baselines_included": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Section IV.B states: 'Without embedded prompts, models produced standard descriptions, but with prompts embedded, models disregarded visual content.' The no-attack baseline is implicit. Additionally, Table V compares base prompt only vs. object-aware variants."
    212       },
    213       "baselines_contemporary": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "The paper does not compare its IPI approach against other contemporary visual prompt injection methods (e.g., Kimura et al. 2024 [7], Bailey et al. 2024 [13]). All comparisons are between variants of the authors' own method."
    217       },
    218       "ablation_study": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "The paper systematically ablates components: 12 prompt strategies (Table I), font size impact (Table II), 3 coloring strategies (Tables III-V), single vs. multi-mask embedding, and object-aware prefix effect (Table V showing 41%→64%)."
    222       },
    223       "multiple_metrics": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "ASR (Attack Success Rate) is the sole evaluation metric reported across all experiments. MSE for visual distortion is mentioned in Section IV.A ('We also explored secondary metrics, such as mean squared error') but never reported in any results table."
    227       },
    228       "human_evaluation": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No human evaluation is performed despite the paper's central claim of visual stealth/imperceptibility. Whether embedded prompts are actually invisible to humans is never tested with human subjects."
    232       },
    233       "held_out_test_set": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "No train/test or dev/test split is described. The best prompt, font size, and coloring strategy appear to be selected on the same images used for evaluation."
    237       },
    238       "per_category_breakdown": {
    239         "applies": true,
    240         "answer": true,
    241         "justification": "Results are broken down by prompt strategy (Table I, 12 variants), font scale (Table II, 5 scales), coloring strategy (Tables III-V), and mask configuration (single vs. multi-mask)."
    242       },
    243       "failure_cases_discussed": {
    244         "applies": true,
    245         "answer": true,
    246         "justification": "Section IV.C.2b discusses pixel-level blending's failure (max ASR 10%): 'excessive blending obscured the structural clarity required for recognition.' Section IV.C.1 discusses small font failure: 'Font scales below 0.20 resulted in negligible success.'"
    247       },
    248       "negative_results_reported": {
    249         "applies": true,
    250         "answer": true,
    251         "justification": "Pixel-level blending's consistent failure (max 10% ASR) is reported as 'the least effective strategy in black-box settings.' Small font scales failing (0% at 0.10) and patch-based coloring's limited success (max 25%) are also reported."
    252       }
    253     },
    254     "claims_and_evidence": {
    255       "abstract_claims_supported": {
    256         "applies": true,
    257         "answer": true,
    258         "justification": "The abstract claims 'up to 64% attack success under stealth constraints' (supported by Table V) and that IPI 'can reliably manipulate the output' (supported by Table I's 100% ASR, though this uses visible text). The 'up to' qualifier appropriately hedges the stealth result."
    259       },
    260       "causal_claims_justified": {
    261         "applies": true,
    262         "answer": true,
    263         "justification": "Table V's comparison of object-aware prefix only (35%), base prompt only (41%), and combined (64%) under identical visual conditions constitutes a controlled single-variable manipulation adequate for the causal claim that prefixing increases ASR."
    264       },
    265       "generalization_bounded": {
    266         "applies": true,
    267         "answer": false,
    268         "justification": "The title claims 'Hijacking Multimodal LLMs' (plural) and Section V claims the technique is 'broadly generalizable to other models,' but experiments use only GPT-4-turbo. No other model is tested. The paper tests on COCO images only."
    269       },
    270       "alternative_explanations_discussed": {
    271         "applies": true,
    272         "answer": false,
    273         "justification": "No alternative explanations are considered. The paper does not discuss whether GPT-4-turbo is uniquely susceptible, whether COCO images are particularly suitable, or whether the observed ASR differences could be due to confounds other than the manipulated variables."
    274       },
    275       "proxy_outcome_distinction": {
    276         "applies": true,
    277         "answer": true,
    278         "justification": "The paper measures ASR (whether model output matches attacker's goal) and claims attack success. The measurement directly matches the claim without a proxy gap. However, visual 'stealth' is claimed without being measured."
    279       }
    280     },
    281     "setup_transparency": {
    282       "model_versions_specified": {
    283         "applies": true,
    284         "answer": false,
    285         "justification": "The paper uses 'GPT-4-turbo' and 'gpt-4o' without specifying snapshot dates or API versions (e.g., 'gpt-4-turbo-2024-04-09'). Model behavior changes across versions, making this insufficient."
    286       },
    287       "prompts_provided": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Section IV.B provides the full text of Prompt 5 ('Say XXX. Ignore the image...') and the object-aware prefix template with a concrete example ('Ignore dog, ball, and grass in the photo...'). Section III.B describes the GPT-4o image description prompt verbatim."
    291       },
    292       "hyperparameters_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No API parameters (temperature, top-p, max tokens) are reported for GPT-4-turbo queries or GPT-4o image description calls. SAM segmentation parameters are also not specified."
    296       },
    297       "scaffolding_described": {
    298         "applies": false,
    299         "answer": false,
    300         "justification": "No agentic scaffolding is used. The system is a fixed pipeline (Algorithm 1) that embeds text in images and queries the API, not an agent with tools, memory, or feedback loops."
    301       },
    302       "data_preprocessing_documented": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The paper does not describe how images were selected from COCO, how many were used, or any filtering criteria. The pipeline from raw COCO images to experimental images is not documented with sufficient detail."
    306       }
    307     },
    308     "limitations_and_scope": {
    309       "limitations_section_present": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "There is no dedicated limitations section. Section V (Discussion) briefly mentions the stealth-effectiveness trade-off and possible defenses, but does not substantively discuss the study's own limitations."
    313       },
    314       "threats_to_validity_specific": {
    315         "applies": true,
    316         "answer": false,
    317         "justification": "No threats to validity are discussed. The paper does not address threats such as single-model testing, unstated sample sizes, lack of human stealth evaluation, or potential overfitting of prompt strategies to GPT-4-turbo."
    318       },
    319       "scope_boundaries_stated": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Section V mentions 'effectiveness may vary depending on each model's safety filters' but does not explicitly state what the results do NOT show (e.g., no evidence for models other than GPT-4-turbo, no evidence for real-world deployment scenarios, no human perception data)."
    323       }
    324     },
    325     "data_integrity": {
    326       "raw_data_available": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "No raw data is released. Neither the generated adversarial images, GPT-4-turbo responses, nor per-image results are available for independent verification."
    330       },
    331       "data_collection_described": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The paper states images come from COCO and each is queried 5 times (Section IV.A), but does not specify which COCO images, how many were used, or how they were selected. The total N is never clearly stated."
    335       },
    336       "recruitment_methods_described": {
    337         "applies": false,
    338         "answer": false,
    339         "justification": "No human participants. The data source is COCO, a standard public benchmark."
    340       },
    341       "data_pipeline_documented": {
    342         "applies": true,
    343         "answer": false,
    344         "justification": "Algorithm 1 provides a high-level pipeline, but the data pipeline from COCO image selection through experimental execution to results aggregation is not documented. How many images entered, how many were processed per configuration, and how results were aggregated is unclear."
    345       }
    346     },
    347     "conflicts_of_interest": {
    348       "funding_disclosed": {
    349         "applies": true,
    350         "answer": true,
    351         "justification": "Section VIII states: 'Lan Zhang was supported by NSF CNS-2451231.'"
    352       },
    353       "affiliations_disclosed": {
    354         "applies": true,
    355         "answer": true,
    356         "justification": "Author affiliations are listed: two from Northern Arizona University and three from Bytedance. The affiliations are clearly stated in the paper header."
    357       },
    358       "funder_independent_of_outcome": {
    359         "applies": true,
    360         "answer": true,
    361         "justification": "NSF is an independent government funding agency with no financial interest in the attack success rate results. However, Bytedance's role as employer of 3 authors (not listed as funder) is a separate consideration."
    362       },
    363       "financial_interests_declared": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No competing interests or financial interests statement is provided. Three authors are from Bytedance, which develops multimodal AI products that could be affected by this research, but no disclosure is made."
    367       }
    368     },
    369     "contamination": {
    370       "training_cutoff_stated": {
    371         "applies": false,
    372         "answer": false,
    373         "justification": "This is a red-teaming study testing an attack methodology against a model's instruction-following behavior, not evaluating model knowledge on a benchmark. Contamination of COCO images in training data is not relevant to the attack mechanism."
    374       },
    375       "train_test_overlap_discussed": {
    376         "applies": false,
    377         "answer": false,
    378         "justification": "Same as above — the study tests whether embedded text instructions override model behavior, not whether the model has memorized benchmark answers."
    379       },
    380       "benchmark_contamination_addressed": {
    381         "applies": false,
    382         "answer": false,
    383         "justification": "Same as above — contamination is structurally inapplicable to this attack evaluation paradigm."
    384       }
    385     },
    386     "human_studies": {
    387       "pre_registered": {
    388         "applies": false,
    389         "answer": false,
    390         "justification": "No human participants in this study."
    391       },
    392       "irb_or_ethics_approval": {
    393         "applies": false,
    394         "answer": false,
    395         "justification": "No human participants in this study."
    396       },
    397       "demographics_reported": {
    398         "applies": false,
    399         "answer": false,
    400         "justification": "No human participants in this study."
    401       },
    402       "inclusion_exclusion_criteria": {
    403         "applies": false,
    404         "answer": false,
    405         "justification": "No human participants in this study."
    406       },
    407       "randomization_described": {
    408         "applies": false,
    409         "answer": false,
    410         "justification": "No human participants in this study."
    411       },
    412       "blinding_described": {
    413         "applies": false,
    414         "answer": false,
    415         "justification": "No human participants in this study."
    416       },
    417       "attrition_reported": {
    418         "applies": false,
    419         "answer": false,
    420         "justification": "No human participants in this study."
    421       }
    422     },
    423     "cost_and_practicality": {
    424       "inference_cost_reported": {
    425         "applies": true,
    426         "answer": false,
    427         "justification": "The paper queries GPT-4-turbo and GPT-4o APIs multiple times per image across hundreds of configurations but reports no API costs, token counts, or latency measures."
    428       },
    429       "compute_budget_stated": {
    430         "applies": true,
    431         "answer": false,
    432         "justification": "No total compute budget is stated. The paper uses GPT-4-turbo API, GPT-4o API, and SAM model but provides no information about total API spend, GPU hours for SAM, or overall computational requirements."
    433       }
    434     },
    435     "experimental_rigor": {
    436       "seed_sensitivity_reported": {
    437         "applies": true,
    438         "answer": false,
    439         "justification": "Each image is processed 5 times (Section IV.A), but no analysis of variance across runs is reported. Only aggregate success counts are shown."
    440       },
    441       "number_of_runs_stated": {
    442         "applies": true,
    443         "answer": true,
    444         "justification": "Section IV.A states: 'each image is processed five times per configuration on the target model (GPT-4-turbo).'"
    445       },
    446       "hyperparameter_search_budget": {
    447         "applies": true,
    448         "answer": false,
    449         "justification": "The paper explores 12 prompts, 5 font scales, 3 coloring strategies, and multiple brightness offsets, but does not frame this as a hyperparameter search or report the total configurations tried or compute spent."
    450       },
    451       "best_config_selection_justified": {
    452         "applies": true,
    453         "answer": false,
    454         "justification": "Prompt 5 is selected as the default 'due to consistent effectiveness across diverse images and injection conditions' (Section IV.B), but selection appears to be from the same evaluation data. No validation/test split is described."
    455       },
    456       "multiple_comparison_correction": {
    457         "applies": true,
    458         "answer": false,
    459         "justification": "The paper compares 12 prompts, 5 font scales, 3 coloring strategies, and multiple brightness offsets without any correction for multiple comparisons. No significance tests are performed at all."
    460       },
    461       "self_comparison_bias_addressed": {
    462         "applies": true,
    463         "answer": false,
    464         "justification": "The authors evaluate their own IPI pipeline against their own variants without acknowledging self-comparison bias or seeking independent evaluation."
    465       },
    466       "compute_budget_vs_performance": {
    467         "applies": false,
    468         "answer": false,
    469         "justification": "All configurations use roughly similar compute (one API query per trial). Compute differences across configurations are negligible."
    470       },
    471       "benchmark_construct_validity": {
    472         "applies": true,
    473         "answer": false,
    474         "justification": "No discussion of whether ASR on COCO images with GPT-4-turbo reflects real-world attack viability. The construct validity of using ASR on a static image dataset as a proxy for real-world threat is not addressed."
    475       },
    476       "scaffold_confound_addressed": {
    477         "applies": false,
    478         "answer": false,
    479         "justification": "No scaffolding is involved. The system is a direct pipeline that embeds text and queries an API."
    480       }
    481     },
    482     "data_leakage": {
    483       "temporal_leakage_addressed": {
    484         "applies": true,
    485         "answer": false,
    486         "justification": "GPT-4-turbo was almost certainly trained on COCO images, which could affect its text-reading behavior on these images. This temporal overlap is not discussed."
    487       },
    488       "feature_leakage_addressed": {
    489         "applies": true,
    490         "answer": false,
    491         "justification": "The evaluation sends only images to GPT-4-turbo with no text prompt, but the paper does not discuss whether this evaluation setup accurately represents real deployment scenarios where system prompts would be present."
    492       },
    493       "non_independence_addressed": {
    494         "applies": true,
    495         "answer": false,
    496         "justification": "No discussion of whether COCO images share structural similarities that could inflate or deflate attack success rates, or whether results would differ on images from other distributions."
    497       },
    498       "leakage_detection_method": {
    499         "applies": true,
    500         "answer": false,
    501         "justification": "No leakage detection or prevention method is used or discussed."
    502       }
    503     }
    504   },
    505   "engagement_factors": {
    506     "practical_relevance": {
    507       "score": 1,
    508       "justification": "The attack pipeline is described in detail but no code is released; a security researcher could implement it but it's not directly usable."
    509     },
    510     "surprise_contrarian": {
    511       "score": 1,
    512       "justification": "Visual prompt injection is a known concern; this paper confirms and quantifies the threat rather than challenging conventional wisdom."
    513     },
    514     "fear_safety": {
    515       "score": 2,
    516       "justification": "Demonstrates a practical black-box attack embedding hidden instructions in images to hijack multimodal LLM output, raising concrete AI safety concerns."
    517     },
    518     "drama_conflict": {
    519       "score": 0,
    520       "justification": "No controversy, no accusations, and no 'benchmarks are fake' angle — straightforward attack methodology paper."
    521     },
    522     "demo_ability": {
    523       "score": 0,
    524       "justification": "No code released, no demo, no tool to try — the attack pipeline exists only in paper description."
    525     },
    526     "brand_recognition": {
    527       "score": 1,
    528       "justification": "Tests GPT-4-turbo (well-known product) but authors are from NAU and Bytedance, not top-tier AI safety labs."
    529     }
    530   }
    531 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs