ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (21091B)


      1 {
      2   "paper": {
      3     "title": "Multimodal Prompt Injection Attacks: Risks and Defenses for Modern LLMs",
      4     "authors": ["Andrew Yeo", "Daeseon Choi"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2509.05883"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Tested four types of prompt injection (direct, external, image-based, prompt leakage) on eight commercial LLMs. Claude 3 was relatively most robust, resisting direct and external injection but partially susceptible to image-based injection. No model resisted all attack types. The paper proposes a staged vulnerability-defense mapping (Table 2) but provides no quantitative defense evaluation.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, code link, or archive is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset or test prompts are released. The controlled website and scraped content are not made available."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "The paper mentions Python, requests, BeautifulSoup, and Visual Studio Code but provides no requirements file, library versions, or environment specification."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The methodology section describes the setup conceptually but lacks runnable commands or scripts."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No confidence intervals or error bars reported. Results in Table 1 are ternary (success/failure/partial) with no quantification of uncertainty."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are used. The paper claims Claude 3 is 'most resilient' based on comparing categorical outcomes without any statistical test."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No effect sizes reported. Results are qualitative success/fail/partial symbols with no quantitative effect measurement."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The number of trials per model per attack type is never stated, let alone justified. The paper admits 'limited logging in early runs' (Section 4.1)."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance or multiple-run results reported. It is unclear how many times each attack was attempted."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No baselines are included. The paper tests models against attacks but does not compare against prior attack frameworks, defense methods, or other evaluation approaches."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No baselines are included, so contemporaneity cannot be assessed."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study. The paper applies regex sanitization but does not systematically ablate defense components or attack variations."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "Only a single ternary outcome (success/partial/failure) is used. No quantitative metrics like attack success rate, response deviation score, or time-to-exploit."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Section 3.4 states 'Ambiguous cases were manually adjudicated' and outcomes were classified by the researchers based on inspection of model outputs."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Not a benchmark evaluation with train/test splits. The study tests attack prompts against live LLM APIs."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 1 provides per-model, per-attack-type breakdown of results across all four injection categories."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "The paper notes which attacks failed per model but does not analyze why specific attacks failed or provide qualitative error analysis of model refusals."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports failed attacks (e.g., Claude 3 resisting direct and external injection, Kimi-K2 resisting leakage), not just successes."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims Claude 3 showed 'relatively greater robustness' and that 'additional defenses... remain necessary.' Table 1 supports the first claim; the second is supported by the finding that all models failed at least one category."
    115       },
    116       "causal_claims_justified": {
    117         "applies": false,
    118         "answer": false,
    119         "justification": "The paper does not make causal claims. It reports observational attack outcomes without claiming causal mechanisms."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'Modern LLMs' broadly but tests only 8 models with a single attack prompt per category. The paper does not bound its findings to the specific models, prompt variants, or API versions tested."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No alternative explanations are discussed. For example, the paper does not consider whether model version differences, API-level safety filters, or the specific prompt phrasing could explain differential outcomes."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper measures ternary attack success (success/partial/failure) on a single contrived scenario (shuriken crafting instructions) and frames it as general prompt injection vulnerability. No discussion of how this proxy relates to the real-world attack surface or whether the specific test prompt is representative."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are listed as 'GPT-4o', 'Claude 3', 'Kimi-K2', 'Mistral-Saba-24B', 'GPT-3.5-Turbo', 'LLaMA-3-8B', 'LLaMA-3-70B', 'Gemma' — all marketing names without snapshot dates or API versions."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The system prompt is provided verbatim in Section 3.2 and the attack prompt is shown in Figure 4. The actual text used for testing is given."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Section 3.2 states 'Default decoding parameters... were preserved unless otherwise specified' and mentions max token limits for some models, but does not report temperature, top-p, or other sampling parameters."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The experiment is a direct API call pipeline."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Section 3.2 describes the preprocessing: Python scraper with requests/BeautifulSoup, regex sanitization removing HTML comments and adversarial phrases, truncation to 4,000 characters."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no dedicated limitations section. A single paragraph in the Discussion (Section 5) mentions 'restricted access to newer commercial LLMs' but this is not a substantive limitations section."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "The only specific limitation mentioned is paywall access restrictions. No discussion of threats like single-prompt testing, manual adjudication bias, or temporal instability of API models."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No explicit scope boundaries stated. The paper does not clarify what its results do NOT show — e.g., that testing one prompt per category does not establish general vulnerability rates."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data (model responses, logs, adjudication records) are made available. The paper admits 'limited logging in early runs.'"
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Section 3.2 describes the data collection procedure: web scraping a controlled Korean cultural heritage portal, sanitization, truncation, and API calls to each model."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Models were selected but this is not a recruitment process for human subjects."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from scraping to API call is described, but the adjudication pipeline for classifying outcomes is not documented. The paper acknowledges 'partial reconstructions' of results due to logging gaps."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding source is disclosed. No acknowledgments section is present."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: Andrew Yeo at Ranchview High School, Daeseon Choi at Soongsil University. Neither is affiliated with the model providers tested."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This study tests defense resilience against adversarial prompts, not model knowledge on a benchmark. Contamination is not relevant to attack success."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Same as above — the study evaluates prompt injection defense, not model capability on memorizable benchmark tasks."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Same as above — contamination does not apply to testing whether models resist adversarial instructions."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No API costs, token counts, or latency measurements reported despite using commercial APIs."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No compute budget stated. The total number of API calls and associated costs are not reported."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No multiple seeds or repeated runs are reported. Temperature and sampling seeds are not stated either."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of trials per model per attack is never stated. The paper admits results are 'based on partial reconstructions' from limited logging."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": false,
    304         "answer": false,
    305         "justification": "No hyperparameter tuning was performed; default API parameters were used."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": false,
    309         "answer": false,
    310         "justification": "No configuration selection was performed."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical tests are performed, so multiple comparison correction does not apply."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "The paper tests existing commercial models without proposing a new system, so self-comparison bias is not relevant."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "No performance-compute tradeoff relevant here; all models were called under similar conditions."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper uses a single contrived attack prompt (shuriken instructions in a castle narrative) to measure 'prompt injection vulnerability' but never discusses whether this test has construct validity or represents real-world attack patterns."
    331       }
    332     }
    333   },
    334   "claims": [
    335     {
    336       "claim": "Claude 3 demonstrated relatively greater robustness to prompt injection among the eight models tested.",
    337       "evidence": "Table 1 shows Claude 3 resisted direct and external injection, with only partial susceptibility to image-based injection. It was vulnerable to prompt leakage.",
    338       "supported": "moderate"
    339     },
    340     {
    341       "claim": "Image-based injection is the most concerning attack type.",
    342       "evidence": "Sections 5 and 6 state this, but only two models were tested with images (GPT-4o and Claude 3). GPT-4o was successfully injected; Claude 3 partially.",
    343       "supported": "weak"
    344     },
    345     {
    346       "claim": "No model can reliably defend against prompt injection through alignment alone.",
    347       "evidence": "Table 1 shows all 8 models were vulnerable to at least one attack category (if prompt leakage is included). However, sample size and number of trials are unstated.",
    348       "supported": "weak"
    349     },
    350     {
    351       "claim": "Additional sanitization measures such as regex filtering significantly strengthen model defenses.",
    352       "evidence": "Section 4.2 asserts this but provides no quantitative comparison of performance with and without sanitization. The claim is unsupported by the experimental data.",
    353       "supported": "unsupported"
    354     }
    355   ],
    356   "red_flags": [
    357     {
    358       "flag": "Unreported sample sizes",
    359       "detail": "The number of trials per model per attack type is never stated. The paper admits results are 'based on partial reconstructions' from 'limited logging in early runs.' This makes results unreliable and unreproducible."
    360     },
    361     {
    362       "flag": "Single attack prompt",
    363       "detail": "All models appear to have been tested with a single base attack prompt (the castle/shuriken narrative in Figure 4). Conclusions about model vulnerability are drawn from what may be a single or very few trials per category."
    364     },
    365     {
    366       "flag": "Claims significantly outrun evidence",
    367       "detail": "The paper claims sanitization 'significantly strengthens' defenses but provides no with/without comparison. The claim that image-based injection is 'most concerning' is based on testing only 2 of 8 models."
    368     },
    369     {
    370       "flag": "No quantitative metrics",
    371       "detail": "Results are reported as ternary symbols (✓/×/△) with no attack success rates, no confidence intervals, and no statistical analysis. This is insufficient for a paper making comparative claims about model robustness."
    372     },
    373     {
    374       "flag": "First author affiliation",
    375       "detail": "First author is affiliated with a high school (Ranchview High School), which is unusual for a security research paper and may explain the methodological gaps."
    376     }
    377   ],
    378   "cited_papers": [
    379     {
    380       "title": "A Survey on Large Language Model Security and Privacy",
    381       "authors": ["Y. Yao", "X. Liu", "L. Wang", "Z. Zhang"],
    382       "year": 2024,
    383       "relevance": "Comprehensive survey on LLM security vulnerabilities relevant to understanding prompt injection attack surface."
    384     },
    385     {
    386       "title": "Prompt Injection Attack against LLM-integrated Applications",
    387       "authors": ["Y. Liu", "X. Wang", "P. Chen"],
    388       "year": 2023,
    389       "arxiv_id": "2306.05499",
    390       "relevance": "Early empirical study of prompt injection against LLM-integrated applications, foundational to attack taxonomy."
    391     },
    392     {
    393       "title": "PoisonPrompt: Backdoor Attack on Prompt-based Large Language Models",
    394       "authors": ["H. Yao", "Q. Zhang", "J. Li", "Z. Li"],
    395       "year": 2023,
    396       "arxiv_id": "2310.12439",
    397       "relevance": "Introduces trigger-based backdoor attacks on prompt-based LLMs, relevant to adversarial AI safety evaluation."
    398     },
    399     {
    400       "title": "Prompt Infection: LLM-to-LLM Prompt Injection within Multi-Agent Systems",
    401       "authors": ["K. Lee", "A. Tiwari"],
    402       "year": 2024,
    403       "arxiv_id": "2410.07283",
    404       "relevance": "Demonstrates prompt injection propagation in multi-agent systems, relevant to agentic AI security."
    405     },
    406     {
    407       "title": "Systematically Analyzing Prompt Injection Vulnerabilities in Diverse LLM Architectures",
    408       "authors": ["V. Benjamin", "R. Shen", "M. J. Smith"],
    409       "year": 2024,
    410       "arxiv_id": "2410.23308",
    411       "relevance": "Systematic analysis of prompt injection across LLM architectures, directly comparable methodology."
    412     },
    413     {
    414       "title": "Visual Prompt Injection Attacks in Modern Large Language Models",
    415       "authors": ["S. Lee"],
    416       "year": 2025,
    417       "relevance": "Studies visual/image-based prompt injection attacks, core related work for multimodal injection."
    418     },
    419     {
    420       "title": "System Prompt Poisoning: Persistent Attacks on Large Language Models Beyond User Injection",
    421       "authors": ["J. Guo", "H. Cai"],
    422       "year": 2025,
    423       "arxiv_id": "2505.06493",
    424       "relevance": "Analyzes system prompt poisoning as a persistent attack vector, relevant to LLM deployment security."
    425     },
    426     {
    427       "title": "Enhancing Security in Large Language Models: A Comprehensive Review of Prompt Injection Attacks and Defenses",
    428       "authors": ["M. Khan", "A. Sharma", "P. Verma"],
    429       "year": 2024,
    430       "relevance": "Review of prompt injection countermeasures and defense strategies, relevant to AI safety survey."
    431     }
    432   ]
    433 }

Impressum · Datenschutz