scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (25054B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Image-based Prompt Injection: Hijacking Multimodal LLMs through Visually Embedded Adversarial Instructions",
      6     "authors": [
      7       "Neha Nagaraja",
      8       "Lan Zhang",
      9       "Zhilong Wang",
     10       "Bo Zhang",
     11       "Pawan Patil"
     12     ],
     13     "year": 2025,
     14     "venue": "Unknown",
     15     "arxiv_id": "2603.03637",
     16     "doi": "10.1109/FLLM67465.2025.11391218"
     17   },
     18   "checklist": {
     19     "claims_and_evidence": {
     20       "abstract_claims_supported": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "All abstract claims (IPI can manipulate output, achieve 64% success, practical threat) are directly supported by experimental results in Tables I-V.",
     24         "source": "haiku"
     25       },
     26       "causal_claims_justified": {
     27         "applies": true,
     28         "answer": true,
     29         "justification": "Causal claims about font size and color effects justified through ablation studies (Tables II-V) holding other variables constant while varying single parameters.",
     30         "source": "haiku"
     31       },
     32       "generalization_bounded": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "Paper claims technique is 'broadly generalizable to other multimodal LLMs' but tests only GPT-4-turbo. Extrapolates COCO results to general 'natural images' without bounded scope.",
     36         "source": "haiku"
     37       },
     38       "alternative_explanations_discussed": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Doesn't explore why embedded prompts work (reading text vs. visual artifacts?) or why coloring strategies differ mechanistically. No discussion of alternative interpretations.",
     42         "source": "haiku"
     43       },
     44       "proxy_outcome_distinction": {
     45         "applies": true,
     46         "answer": true,
     47         "justification": "Attack Success Rate (ASR) directly measures whether model output matches injected instructions, which is the exact phenomenon claimed.",
     48         "source": "haiku"
     49       }
     50     },
     51     "limitations_and_scope": {
     52       "limitations_section_present": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No dedicated limitations or threats-to-validity section. Limitations scattered in Discussion (Section V) without comprehensive organization.",
     56         "source": "haiku"
     57       },
     58       "threats_to_validity_specific": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Only generic statements are made, such as 'effectiveness may vary depending on each model's safety filters'. Specific threats (e.g., 'single-model evaluation') are not articulated.",
     62         "source": "haiku"
     63       },
     64       "scope_boundaries_stated": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "Scope implicit in methodology (GPT-4-turbo, COCO, 12 prompts) but never explicitly stated as limitations on generalizability of findings.",
     68         "source": "haiku"
     69       }
     70     },
     71     "conflicts_of_interest": {
     72       "funding_disclosed": {
     73         "applies": true,
     74         "answer": false,
     75         "justification": "NSF grant (CNS-2451231) disclosed for one author. Three Bytedance-affiliated authors' funding sources not disclosed.",
     76         "source": "haiku"
     77       },
     78       "affiliations_disclosed": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "Bytedance affiliations listed but not discussed as potential conflicts. Bytedance develops competing multimodal models; no COI discussion.",
     82         "source": "haiku"
     83       },
     84       "funder_independent_of_outcome": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "NSF is independent (appropriate), but Bytedance funding unclear. If Bytedance co-funded, it would benefit from demonstrating OpenAI vulnerabilities.",
     88         "source": "haiku"
     89       },
     90       "financial_interests_declared": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "No competing interests statement or disclosure of patents, equity, or consulting related to adversarial ML or prompt injection.",
     94         "source": "haiku"
     95       }
     96     },
     97     "scope_and_framing": {
     98       "key_terms_defined": {
     99         "applies": true,
    100         "answer": true,
    101         "justification": "IPI defined as black-box attack embedding adversarial text in images. ASR defined as Nsuccess/N. MLLM and black-box access clearly explained.",
    102         "source": "haiku"
    103       },
    104       "intended_contribution_clear": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Four explicit contributions stated: novel attack method, end-to-end pipeline, empirical parameter evaluation, black-box feasibility demonstration.",
    108         "source": "haiku"
    109       },
    110       "engagement_with_prior_work": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Section VI positions work relative to text/visual prompt injection literature, explaining this contribution as systematic evaluation of visibility-stealth trade-offs.",
    114         "source": "haiku"
    115       }
    116     }
    117   },
    118   "type_checklist": {
    119     "empirical": {
    120       "artifacts": {
    121         "code_released": {
    122           "applies": true,
    123           "answer": false,
    124           "justification": "No source code, GitHub, or implementation released. Algorithm 1 pseudocode insufficient for reproduction.",
    125           "source": "haiku"
    126         },
    127         "data_released": {
    128           "applies": true,
    129           "answer": true,
    130           "justification": "COCO dataset is publicly available. Adversarial images not released, but foundational dataset is standard and accessible.",
    131           "source": "haiku"
    132         },
    133         "environment_specified": {
    134           "applies": true,
    135           "answer": false,
    136           "justification": "SAM, GPT-4-turbo, ChatGPT mentioned without versions, API dates, library versions, requirements.txt, or Dockerfile.",
    137           "source": "haiku"
    138         },
    139         "reproduction_instructions": {
    140           "applies": true,
    141           "answer": false,
    142           "justification": "Algorithm 1 gives high-level flow but lacks step-by-step instructions for setting up, running, parameterizing, or validating attacks.",
    143           "source": "haiku"
    144         }
    145       },
    146       "statistical_methodology": {
    147         "confidence_intervals_or_error_bars": {
    148           "applies": true,
    149           "answer": false,
    150           "justification": "Tables I-V report point-estimate percentages only. Paper mentions 5 repetitions but reports no variance, CIs, or error bars.",
    151           "source": "haiku"
    152         },
    153         "significance_tests": {
    154           "applies": true,
    155           "answer": false,
    156           "justification": "Comparative claims (Prompt 5 superior, object-aware + base > base alone) made without p-values, t-tests, or significance testing.",
    157           "source": "haiku"
    158         },
    159         "effect_sizes_reported": {
    160           "applies": true,
    161           "answer": true,
    162           "justification": "Absolute percentages and percentage-point differences shown (Table V: 41% to 64% = 23pp improvement). Implicit effect size reporting.",
    163           "source": "haiku"
    164         },
    165         "sample_size_justified": {
    166           "applies": true,
    167           "answer": false,
    168           "justification": "5 repetitions per config mentioned; some tables show 'out of 800'. No power analysis or justification for sample size choices.",
    169           "source": "haiku"
    170         },
    171         "variance_reported": {
    172           "applies": true,
    173           "answer": false,
    174           "justification": "Point estimates only. Standard deviations, ranges, or confidence bounds not reported despite multiple runs per configuration.",
    175           "source": "haiku"
    176         }
    177       },
    178       "evaluation_design": {
    179         "baselines_included": {
    180           "applies": true,
    181           "answer": true,
    182           "justification": "12 prompt variants compared (Table I). Font sizes 0.10–0.30 ablated (Table II). Coloring strategies compared (Tables III-V).",
    183           "source": "haiku"
    184         },
    185         "baselines_contemporary": {
    186           "applies": true,
    187           "answer": false,
    188           "justification": "No experimental comparison to prior visual prompt injection methods cited in Section VI (Kimura 2024, Bailey 2024, etc.).",
    189           "source": "haiku"
    190         },
    191         "ablation_study": {
    192           "applies": true,
    193           "answer": true,
    194           "justification": "Tables II-V systematically ablate font size, coloring strategy, object-aware prefixing, and brightness offset. Each isolates one parameter.",
    195           "source": "haiku"
    196         },
    197         "multiple_metrics": {
    198           "applies": true,
    199           "answer": false,
    200           "justification": "Only ASR reported in results. MSE for visual distortion mentioned in text but not shown. Single metric dominates.",
    201           "source": "haiku"
    202         },
    203         "human_evaluation": {
    204           "applies": true,
    205           "answer": false,
    206           "justification": "No human subjects evaluated imperceptibility. Claim that prompts are 'nearly imperceptible' lacks human validation.",
    207           "source": "haiku"
    208         },
    209         "held_out_test_set": {
    210           "applies": true,
    211           "answer": true,
    212           "justification": "COCO is public benchmark not used to design attacks. Each image tested 5 times. Held-out from attack design process.",
    213           "source": "haiku"
    214         },
    215         "per_category_breakdown": {
    216           "applies": true,
    217           "answer": false,
    218           "justification": "Breakdowns by prompt and parameter shown (Tables I-V). No breakdown by image type, scene complexity, or other COCO categories.",
    219           "source": "haiku"
    220         },
    221         "failure_cases_discussed": {
    222           "applies": true,
    223           "answer": false,
    224           "justification": "Low-performing configs shown (Prompts 11-12, pixel-level blending at 10%). No analysis of WHY they failed or failure modes.",
    225           "source": "haiku"
    226         },
    227         "negative_results_reported": {
    228           "applies": true,
    229           "answer": true,
    230           "justification": "Pixel-level blending achieved only 10% ASR despite visual stealth. Font <0.20 failed. Low-performing prompts reported.",
    231           "source": "haiku"
    232         }
    233       },
    234       "setup_transparency": {
    235         "model_versions_specified": {
    236           "applies": true,
    237           "answer": false,
    238           "justification": "Marketing names only (GPT-4-turbo, gpt-4o, ChatGPT). No snapshot dates, API versions, training cutoffs, or system prompts.",
    239           "source": "haiku"
    240         },
    241         "prompts_provided": {
    242           "applies": true,
    243           "answer": false,
    244           "justification": "Prompt 5 shown in full. Other 11 prompts only referenced by ID ('Prompt 1'–'Prompt 12'). Most prompts unavailable for inspection.",
    245           "source": "haiku"
    246         },
    247         "hyperparameters_reported": {
    248           "applies": true,
    249           "answer": false,
    250           "justification": "Attack parameters (font sizes, offsets) reported. Model inference hyperparameters (temperature, top-p, max_tokens) not specified.",
    251           "source": "haiku"
    252         },
    253         "scaffolding_described": {
    254           "applies": false,
    255           "answer": false,
    256           "justification": "No agentic scaffolding (ReAct, chain-of-thought, tool use). Single image, single query. Not applicable to this work.",
    257           "source": "haiku"
    258         },
    259         "data_preprocessing_documented": {
    260           "applies": true,
    261           "answer": false,
    262           "justification": "COCO images used as-is. SAM segmentation applied but settings/parameters not documented. Resizing, color space handling not mentioned.",
    263           "source": "haiku"
    264         }
    265       },
    266       "data_integrity": {
    267         "raw_data_available": {
    268           "applies": true,
    269           "answer": false,
    270           "justification": "COCO images public. Adversarial images and model responses not released. Cannot independently verify attack outputs.",
    271           "source": "haiku"
    272         },
    273         "data_collection_described": {
    274           "applies": true,
    275           "answer": true,
    276           "justification": "COCO selection justified for diversity. Algorithm 1 describes attack generation pipeline. Collection transparent at high level.",
    277           "source": "haiku"
    278         },
    279         "recruitment_methods_described": {
    280           "applies": false,
    281           "answer": false,
    282           "justification": "No human participants. Not applicable.",
    283           "source": "haiku"
    284         },
    285         "data_pipeline_documented": {
    286           "applies": true,
    287           "answer": false,
    288           "justification": "High-level pipeline in Algorithm 1 (image → SAM → prompt → embedding → query → ASR). Detailed preprocessing, storage, tracking absent.",
    289           "source": "haiku"
    290         }
    291       },
    292       "contamination": {
    293         "training_cutoff_stated": {
    294           "applies": false,
    295           "answer": false,
    296           "justification": "Not a knowledge evaluation. Testing prompt injection behavior. Training cutoff is not stated, but it is not critical for this attack type.",
    297           "source": "haiku"
    298         },
    299         "train_test_overlap_discussed": {
    300           "applies": false,
    301           "answer": false,
    302           "justification": "Not evaluating memorization or knowledge. Behavioral test, not knowledge test. Overlap not discussed.",
    303           "source": "haiku"
    304         },
    305         "benchmark_contamination_addressed": {
    306           "applies": true,
    307           "answer": false,
    308           "justification": "COCO is public. No discussion of whether GPT-4-turbo training included COCO, which could affect text-detection capability.",
    309           "source": "haiku"
    310         }
    311       },
    312       "human_studies": {
    313         "pre_registered": {
    314           "applies": false,
    315           "answer": false,
    316           "justification": "No human participants. Not applicable.",
    317           "source": "haiku"
    318         },
    319         "irb_or_ethics_approval": {
    320           "applies": false,
    321           "answer": false,
    322           "justification": "No human subjects. Not applicable.",
    323           "source": "haiku"
    324         },
    325         "demographics_reported": {
    326           "applies": false,
    327           "answer": false,
    328           "justification": "No human participants. Not applicable.",
    329           "source": "haiku"
    330         },
    331         "inclusion_exclusion_criteria": {
    332           "applies": false,
    333           "answer": false,
    334           "justification": "No human subjects. Not applicable.",
    335           "source": "haiku"
    336         },
    337         "randomization_described": {
    338           "applies": false,
    339           "answer": false,
    340           "justification": "No human participants. Not applicable.",
    341           "source": "haiku"
    342         },
    343         "blinding_described": {
    344           "applies": false,
    345           "answer": false,
    346           "justification": "No human subjects. Not applicable.",
    347           "source": "haiku"
    348         },
    349         "attrition_reported": {
    350           "applies": false,
    351           "answer": false,
    352           "justification": "No human participants. Not applicable.",
    353           "source": "haiku"
    354         }
    355       },
    356       "cost_and_practicality": {
    357         "inference_cost_reported": {
    358           "applies": true,
    359           "answer": false,
    360           "justification": "Uses GPT-4-turbo and gpt-4o APIs but no cost, latency, or token counts reported. Practical constraints not discussed.",
    361           "source": "haiku"
    362         },
    363         "compute_budget_stated": {
    364           "applies": true,
    365           "answer": false,
    366           "justification": "Total computational budget (API calls, tokens, dollars) not stated. Scale and expense of evaluation unclear.",
    367           "source": "haiku"
    368         }
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "Image-based prompt injection can achieve up to 64% attack success rate against GPT-4-turbo under stealth constraints",
    375       "evidence": "Table V: object-aware prefix + base prompt with +20 brightness offset yields 64% ASR",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "Repetition-based prompts (Prompt 5) are most robust across font sizes and configurations",
    380       "evidence": "Table I: Prompt 5 achieves 100% baseline ASR; Table II: 'remained most robust across all font sizes'",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "Font size below 0.20 results in <10% success; 0.30 scale most effective",
    385       "evidence": "Table II: 0.10=0%, 0.15=1%, 0.20=10%, 0.30=37.88%",
    386       "supported": "strong"
    387     },
    388     {
    389       "claim": "Global region-averaged coloring outperforms patch-based and pixel-level blending",
    390       "evidence": "Tables III-V: patch coloring max 25%, pixel blending 10%, global region 64% (with object-aware prefix)",
    391       "supported": "strong"
    392     },
    393     {
    394       "claim": "Object-aware prefixes increase attack success from 41% to 64%",
    395       "evidence": "Table V: base prompt 41% vs. object-aware prefix + base prompt 64% (both +20 offset)",
    396       "supported": "strong"
    397     },
    398     {
    399       "claim": "The technique is broadly generalizable to other multimodal LLMs",
    400       "evidence": "Discussion claims 'broadly generalizable' but only GPT-4-turbo tested. Claim unsupported.",
    401       "supported": "weak"
    402     },
    403     {
    404       "claim": "Embedded prompts remain imperceptible to human observers",
    405       "evidence": "Figures 2-4 show visual examples, but no human perceptibility study conducted to validate.",
    406       "supported": "weak"
    407     }
    408   ],
    409   "methodology_tags": [
    410     "benchmark-eval",
    411     "case-study"
    412   ],
    413   "key_findings": "Image-based prompt injection successfully hijacks GPT-4-turbo in black-box settings, achieving 64% attack success when combining object-aware prompting with global region-averaged text coloring and +20 brightness offset. Repetition-based prompts prove most robust across configurations. A critical trade-off exists between stealth and effectiveness: font scales below 0.20 drop success to <10%, while imperceptible text (pixel-level blending) achieves only 10% success. The systematic pipeline integrating SAM segmentation, adaptive rendering, and prompt engineering provides a methodology for adversarial image attacks on vision-language models.",
    414   "red_flags": [
    415     {
    416       "flag": "No human evaluation of stealth",
    417       "detail": "Paper claims embeddings are 'imperceptible to humans' but provides no human study. Visual inspection alone cannot validate imperceptibility."
    418     },
    419     {
    420       "flag": "Non-reproducible without code",
    421       "detail": "No source code, API setup, parameter configs, or detailed instructions released. Algorithm 1 pseudocode alone insufficient for reproduction."
    422     },
    423     {
    424       "flag": "Single-model generalization claims",
    425       "detail": "Only GPT-4-turbo evaluated, yet the Discussion claims the technique is 'broadly generalizable to other multimodal LLMs.' No evidence for transferability."
    426     },
    427     {
    428       "flag": "No statistical significance tests",
    429       "detail": "All results are point estimates. Despite 5 repetitions per config, no variance, confidence intervals, or significance tests reported."
    430     },
    431     {
    432       "flag": "No baseline comparison",
    433       "detail": "Discusses prior visual prompt injection works (Kimura 2024, Bailey 2024) but doesn't experimentally compare IPI to these baseline methods."
    434     },
    435     {
    436       "flag": "Limited evaluation scope",
    437       "detail": "Only COCO images tested. No evaluation across other domains (documents, videos, webpages) despite claiming 'practical threat.'"
    438     },
    439     {
    440       "flag": "Incomplete model specification",
    441       "detail": "GPT-4-turbo, gpt-4o, ChatGPT used without version dates, system prompts, or inference hyperparameters (temperature, top-p)."
    442     },
    443     {
    444       "flag": "Overgeneralized threat framing",
    445       "detail": "Labels IPI as 'systemic vulnerability' and 'practical threat' without addressing embedding difficulty, deployment constraints, or detection methods."
    446     },
    447     {
    448       "flag": "Mechanism unexplored",
    449       "detail": "Why does embedding work? Is model reading text or reacting to visual artifacts? Why do coloring strategies differ mechanistically? No analysis."
    450     },
    451     {
    452       "flag": "Undisclosed conflict of interest",
    453       "detail": "Three Bytedance-affiliated authors. No disclosure of whether company funded research. Potential incentive to show competitor (OpenAI) vulnerabilities."
    454     }
    455   ],
    456   "cited_papers": [
    457     {
    458       "title": "Ignore previous prompt: Attack techniques for language models",
    459       "authors": "Perez & Ribeiro",
    460       "year": 2022,
    461       "relevance": "Foundational work establishing direct prompt injection as attack vector against text LLMs"
    462     },
    463     {
    464       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    465       "authors": "Greshake et al.",
    466       "year": 2023,
    467       "relevance": "Introduces indirect prompt injection threat model via external content retrieval"
    468     },
    469     {
    470       "title": "Empirical analysis of large vision-language models against goal hijacking via visual prompt injection",
    471       "authors": "Kimura et al.",
    472       "year": 2024,
    473       "relevance": "Direct prior work on visual prompt injection against MLLMs; directly comparable methodology"
    474     },
    475     {
    476       "title": "Image hijacks: Adversarial images can control generative models at runtime",
    477       "authors": "Bailey et al.",
    478       "year": 2024,
    479       "relevance": "Related image-based adversarial attack approach for controlling generative models"
    480     },
    481     {
    482       "title": "Jailbreak in pieces: Compositional adversarial attacks on multi-modal language models",
    483       "authors": "Shayegani et al.",
    484       "year": 2023,
    485       "relevance": "Compositional attack strategy for bypassing safety alignment in multimodal systems"
    486     },
    487     {
    488       "title": "Eyes closed, safety on: Protecting multimodal LLMs via image-to-text transformation",
    489       "authors": "Gou et al.",
    490       "year": 2024,
    491       "relevance": "Defense mitigation strategy: sanitizing images through structured description"
    492     },
    493     {
    494       "title": "Segment Anything",
    495       "authors": "Kirillov et al.",
    496       "year": 2023,
    497       "relevance": "Technical foundation: SAM segmentation model enabling region-based prompt placement"
    498     },
    499     {
    500       "title": "Vision-language models for vision tasks: A survey",
    501       "authors": "Zhang et al.",
    502       "year": 2024,
    503       "relevance": "Architecture survey of multimodal LLMs and their vision processing components"
    504     }
    505   ],
    506   "engagement_factors": {
    507     "practical_relevance": {
    508       "score": 2,
    509       "justification": "Attack requires embedding adversarial images in target applications (uploads, databases). Deployment scenarios exist but not trivial without system access."
    510     },
    511     "surprise_contrarian": {
    512       "score": 2,
    513       "justification": "Visual prompt injection is established (Kimura 2024); this work provides systematic parameter analysis and visibility-effectiveness trade-offs but not fundamentally novel attack class."
    514     },
    515     "fear_safety": {
    516       "score": 2,
    517       "justification": "Demonstrates multimodal LLM vulnerability relevant to image captioning and autonomous perception. Impact depends on deployment context and real-world feasibility."
    518     },
    519     "drama_conflict": {
    520       "score": 1,
    521       "justification": "Standard adversarial ML methodology paper. No particular controversy, human victims, or novel ethical dimension. Technical contribution without narrative drama."
    522     },
    523     "demo_ability": {
    524       "score": 1,
    525       "justification": "No public code or interactive demo. Requires implementing SAM segmentation, rendering pipeline, and GPT-4-turbo API access. High reproduction barrier."
    526     },
    527     "brand_recognition": {
    528       "score": 2,
    529       "justification": "Evaluates OpenAI's famous GPT-4-turbo model (high), but authors primarily from Northern Arizona University and Bytedance (moderate institutional prestige)."
    530     }
    531   },
    532   "hn_data": {
    533     "threads": [
    534       {
    535         "hn_id": "46894924",
    536         "title": "Accelerating Scientific Research with Gemini: Case Studies and Common Techniques",
    537         "points": 4,
    538         "comments": 0,
    539         "url": "https://news.ycombinator.com/item?id=46894924"
    540       },
    541       {
    542         "hn_id": "22705219",
    543         "title": "Twitter, growing echo chamber: More retweets that original content since 2018",
    544         "points": 3,
    545         "comments": 0,
    546         "url": "https://news.ycombinator.com/item?id=22705219"
    547       },
    548       {
    549         "hn_id": "39715108",
    550         "title": "Junctiond: Extending FaaS Runtimes with Kernel-Bypass",
    551         "points": 2,
    552         "comments": 0,
    553         "url": "https://news.ycombinator.com/item?id=39715108"
    554       },
    555       {
    556         "hn_id": "46987729",
    557         "title": "Accelerating Scientific Research with Gemini: Case Studies and Common Techniques",
    558         "points": 1,
    559         "comments": 0,
    560         "url": "https://news.ycombinator.com/item?id=46987729"
    561       }
    562     ],
    563     "top_points": 4,
    564     "total_points": 10,
    565     "total_comments": 0
    566   }
    567 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs