scan.json (22436B)
1 { 2 "paper": { 3 "title": "Attacking LLMs and AI Agents: Advertisement Embedding Attacks Against Large Language Models", 4 "authors": ["Qiming Guo", "Jinwen Tang", "Xingran Huang"], 5 "year": 2025, 6 "venue": "IEEE", 7 "arxiv_id": "2508.17674", 8 "doi": "10.48550/arXiv.2508.17674" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": false, 15 "justification": "No repository URL, code archive, or supplementary materials are mentioned anywhere in the paper." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": false, 20 "justification": "The attack data (malicious Q&A pairs) and fine-tuning datasets used are not released or made available." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": false, 25 "justification": "The only hardware mention is 'a local RTX 4070 graphics card' for LoRA fine-tuning. No software versions, library versions, requirements.txt, or environment specifications are provided." 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": false, 30 "justification": "No step-by-step reproduction instructions are provided. The attack prompts are given inline but there are no instructions for setting up the attack pipeline or reproducing the fine-tuning experiments." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "No quantitative results with confidence intervals or error bars are reported. The paper makes qualitative claims about attack effectiveness ('almost 100%' success) without formal measurement." 38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "No statistical significance tests are used. The paper claims attacks are effective based on qualitative demonstrations without any formal comparison or testing." 
43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": false, 47 "justification": "No effect sizes are reported. The claim of 'almost 100%' reproduction of preset responses is the closest to a quantitative result, but no formal measurement methodology, sample size, or baseline context is provided." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The number of test queries is not stated, let alone justified. The paper shows a handful of example Q&A pairs in Figure 2 but does not describe a systematic evaluation with a defined sample size." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "No variance, standard deviation, or spread measures are reported across any experimental runs." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": false, 64 "justification": "No baselines are included. The paper does not compare AEA to other attack methods or compare the defense to other defense mechanisms." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": false, 69 "justification": "No baselines are compared at all, so contemporariness is moot." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": false, 74 "justification": "The attack has multiple components (prompt design, data content, model choice) but no ablation study examines which components contribute to attack success." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": false, 79 "justification": "No formal metrics are defined or reported. The paper relies on qualitative demonstrations (showing example outputs) rather than measurable metrics like attack success rate, stealthiness, or detection rate." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": false, 84 "justification": "No human evaluation is conducted. 
Claims about stealthiness and user deception are made without any user study to verify whether humans can detect the embedded content." 85 }, 86 "held_out_test_set": { 87 "applies": true, 88 "answer": false, 89 "justification": "No formal test set is used. The paper demonstrates attacks on a handful of example queries shown in Figure 2 without a systematic held-out evaluation set." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": false, 94 "justification": "No per-category breakdown is provided. The paper does not analyze attack success by question type, content category, or model." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": false, 99 "justification": "No failure cases are discussed. The paper does not describe scenarios where the attacks fail, and apart from acknowledging that the prompt-based defense does not cover parameter-level attacks, it does not examine cases where the defense is insufficient." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": false, 104 "justification": "No negative results are reported. Every experiment shown is a success case. The defense is acknowledged to not work against parameter-level attacks, but this is a design limitation rather than a negative experimental result." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": false, 111 "justification": "The abstract claims AEA operate through 'two low-cost vectors' and presents 'an initial prompt-based self-inspection defense that mitigates these injections.' The low-cost claim is supported only by a single anecdote (1 hour on RTX 4070). The defense claim is supported only by describing a prompt without systematic evaluation of its effectiveness." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": false, 116 "justification": "The paper makes causal claims that AEA 'cause models to return covert ads, propaganda, or hate speech' and that the defense 'mitigates these injections.'
These are demonstrated with a few examples rather than a controlled experimental design that would support causal inference about attack mechanisms or defense effectiveness." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": false, 121 "justification": "The paper tests on Gemini 2.5 for the SDP attack and LLaMA 3, Mistral 7B, Phi-3 for fine-tuning, but makes broad claims about LLM security generally. The title 'Attacking LLMs and AI Agents' and claims about the threat being as 'prevalent as web viruses' far exceed the narrow demonstrations provided." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": false, 126 "justification": "No alternative explanations are discussed. The paper does not consider whether the observed attack success might be specific to certain model architectures, prompt structures, or whether existing safety mechanisms might already mitigate these attacks in practice." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": false, 133 "justification": "The paper mentions 'Gemini 2.5', 'LLaMA 3', 'Mistral 7B', and 'Phi-3' but does not provide specific version identifiers, snapshot dates, or API versions for any of them. 'Gemini 2.5' is a marketing name without a precise version." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "The attack prompt is provided verbatim in Section IV-B1 and the defense prompt is provided in Section IV-C. These are the actual prompts used, not templates or natural language descriptions." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": false, 143 "justification": "No hyperparameters are reported for either the LLM API calls (temperature, top-p) or the LoRA fine-tuning (learning rate, rank, alpha, epochs). Only the GPU model (RTX 4070) and training time (1 hour) are mentioned." 
144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "The paper does not use agentic scaffolding. The attack is a prompt injection / fine-tuning attack, not an agentic workflow." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": false, 153 "justification": "The attack data (malicious Q&A pairs) and fine-tuning data are not described in detail. The paper mentions 'simulated attacker data included falsified history and hate speech' without documenting how this data was prepared, its format, or its size." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": false, 160 "justification": "There is no dedicated limitations or threats-to-validity section. The conclusion acknowledges 'more research is needed' but does not discuss specific limitations of the study." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": false, 165 "justification": "No specific threats to validity are discussed. The paper does not address internal or external validity concerns." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": false, 170 "justification": "No explicit scope boundaries are stated. The paper does not clarify what it does NOT show or which settings/models/scenarios are excluded from its claims." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": false, 177 "justification": "No raw data (attack prompts corpus, fine-tuning datasets, model outputs) is made available for independent verification." 
178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": false, 182 "justification": "The paper mentions observing 'threat actors in underground markets distributing compromised models designed to increase traffic to specific gambling websites' but provides no details about how these observations were made, when, or what evidence was collected." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants were recruited. The experiments involve running prompts against LLM APIs and fine-tuning models." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": false, 192 "justification": "No data pipeline is documented. The process from creating attack data to executing attacks to measuring results is not described systematically." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": false, 199 "justification": "No funding source is disclosed. The acknowledgment section only mentions 'ChatGPT refined the writing for grammatical accuracy' with no funding information." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "Author affiliations are clearly listed: Texas A&M University-Corpus Christi, University of Missouri, and University of California-Riverside." 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": false, 209 "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The absence of a funding disclosure statement is a gap." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests statement or financial interest declaration is present in the paper." 
215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": false, 220 "answer": false, 221 "justification": "This paper tests attacks (prompt injection and model poisoning) rather than evaluating a pre-trained model's knowledge capability on a benchmark. Contamination in the traditional sense is not applicable." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": false, 225 "answer": false, 226 "justification": "The paper does not evaluate a pre-trained model on a benchmark. It tests attack vectors, so train/test overlap is not a relevant concern." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": false, 230 "answer": false, 231 "justification": "No benchmark is used for capability evaluation. The paper demonstrates attacks using custom adversarial inputs, not standardized benchmarks." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants are involved in this study." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants are involved in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants are involved in this study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants are involved in this study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants are involved in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants are involved in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants are involved in this study." 
269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": false, 275 "justification": "The paper claims attacks are 'low-cost' and mentions using an RTX 4070 for 1 hour for LoRA fine-tuning, but does not report API costs for the SDP attack path, token counts, or any systematic cost analysis." 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": false, 280 "justification": "The paper mentions 'a local RTX 4070 graphics card and took 1 hour' for fine-tuning but does not provide a total computational budget, GPU hours across all experiments, or API costs." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "AEA can stealthily inject promotional or malicious content into LLM outputs through service distribution platform hijacking.", 287 "evidence": "Section IV-B1 demonstrates the attack on Google Gemini 2.5 with a specific attack prompt, showing modified outputs in Figure 2.", 288 "supported": "weak" 289 }, 290 { 291 "claim": "AEA via model distribution platforms achieve 'almost 100%' reproduction of preset attack responses through LoRA fine-tuning.", 292 "evidence": "Section IV-B2 states 'Parameter-tuning attacks proved highly effective, reproducing almost 100% of the preset responses from the attacker dataset' on LLaMA 3, Mistral 7B, and Phi-3, but provides no formal measurement methodology, sample size, or detailed results.", 293 "supported": "weak" 294 }, 295 { 296 "claim": "The LoRA fine-tuning attack can be implemented with only a local RTX 4070 GPU in 1 hour.", 297 "evidence": "Section IV-B2 states this directly, supporting the low-cost claim for this specific attack vector.", 298 "supported": "moderate" 299 }, 300 { 301 "claim": "A prompt-based self-inspection defense can effectively defend against API-based AEA attacks.", 302 "evidence": "Section IV-C describes the defense prompt and claims it 'can effectively defend against attacks based on API Providers' but provides no quantitative 
evaluation, success rate, or systematic testing of the defense.", 303 "supported": "unsupported" 304 }, 305 { 306 "claim": "Current service providers (including Google Gemini) remain inadequately prepared to defend against AEA.", 307 "evidence": "Section I and IV-B1 demonstrate that Gemini 2.5 was 'successfully manipulated by our attack prompts' but this is shown through a single demonstration, not a systematic evaluation across providers.", 308 "supported": "weak" 309 } 310 ], 311 "methodology_tags": ["case-study"], 312 "key_findings": "The paper introduces Advertisement Embedding Attacks (AEA), a class of attacks that inject promotional or malicious content into LLM outputs via two vectors: hijacking service distribution platforms to prepend adversarial prompts, and publishing backdoored open-source model checkpoints fine-tuned with attacker data. The authors demonstrate attacks on Gemini 2.5 and three open-source models (LLaMA 3, Mistral 7B, Phi-3), and propose a basic prompt-based defense. However, the evaluation consists entirely of qualitative demonstrations with no systematic measurement of attack success rates, stealthiness, or defense effectiveness.", 313 "red_flags": [ 314 { 315 "flag": "No quantitative evaluation", 316 "detail": "The paper provides only qualitative demonstrations (a few example outputs in Figure 2) with no formal metrics, success rates, or systematic evaluation. Claims of 'almost 100%' attack success are unsupported by rigorous measurement." 317 }, 318 { 319 "flag": "Unsupported defense effectiveness claim", 320 "detail": "The defense method is described with a single example prompt and claimed to 'effectively defend' without any evaluation. No success rate, false positive rate, or systematic testing is provided." 
321 }, 322 { 323 "flag": "Overclaimed scope", 324 "detail": "The paper tests on Gemini 2.5 for one attack vector and three models for another, but makes sweeping claims about LLM security broadly and predicts AEA 'will become as prevalent as web viruses' without evidence for such generalization." 325 }, 326 { 327 "flag": "No baselines or comparisons", 328 "detail": "The attacks are not compared to existing prompt injection or backdoor attack methods. The defense is not compared to any existing defense mechanisms." 329 }, 330 { 331 "flag": "Missing limitations section", 332 "detail": "The paper has no limitations, threats-to-validity, or ethical considerations section, despite proposing and demonstrating attacks that could be misused." 333 }, 334 { 335 "flag": "Novelty claim questionable", 336 "detail": "The paper claims to define a 'newly discovered' attack class, but prompt injection and model poisoning/backdoor attacks are well-established in the literature. The paper does not clearly differentiate AEA from these existing attack categories beyond the specific motivation of advertising." 337 } 338 ], 339 "cited_papers": [ 340 { 341 "title": "Language models are few-shot learners", 342 "authors": ["T. Brown", "B. Mann", "N. Ryder"], 343 "year": 2020, 344 "relevance": "Foundational GPT-3 paper relevant to understanding LLM capabilities that enable the attacks discussed." 345 }, 346 { 347 "title": "On the opportunities and risks of foundation models", 348 "authors": ["R. Bommasani", "D. A. Hudson", "E. Adeli"], 349 "year": 2021, 350 "arxiv_id": "2108.07258", 351 "relevance": "Comprehensive survey of foundation model risks and opportunities relevant to LLM security assessment." 352 }, 353 { 354 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 355 "authors": ["E. Hubinger", "C. Denison", "J. 
Mu"], 356 "year": 2024, 357 "arxiv_id": "2401.05566", 358 "relevance": "Directly relevant to backdoor and deceptive behavior in LLMs, closely related to the model poisoning attack vector in AEA." 359 }, 360 { 361 "title": "Jailbroken: How does LLM safety training fail?", 362 "authors": ["A. Wei", "N. Haghtalab", "J. Steinhardt"], 363 "year": 2023, 364 "relevance": "Foundational work on LLM jailbreaking attacks relevant to understanding prompt-based attack mechanisms." 365 }, 366 { 367 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 368 "authors": ["K. Greshake", "S. Abdelnabi", "S. Mishra"], 369 "year": 2023, 370 "relevance": "Directly relevant indirect prompt injection attack paper that establishes the broader threat category AEA falls under." 371 }, 372 { 373 "title": "Universal and transferable adversarial attacks on aligned language models", 374 "authors": ["A. Zou", "Z. Wang", "N. Carlini"], 375 "year": 2023, 376 "arxiv_id": "2307.15043", 377 "relevance": "Key paper on adversarial attacks against LLMs, directly relevant to understanding attack transferability." 378 }, 379 { 380 "title": "Jailbreaking ChatGPT via prompt engineering: An empirical study", 381 "authors": ["Y. Liu", "G. Deng", "Z. Xu"], 382 "year": 2023, 383 "arxiv_id": "2305.13860", 384 "relevance": "Empirical study of prompt-based attacks on ChatGPT, directly relevant to the prompt injection attack vector." 385 }, 386 { 387 "title": "Weight poisoning attacks on pre-trained models", 388 "authors": ["K. Kurita", "P. Michel", "G. Neubig"], 389 "year": 2020, 390 "arxiv_id": "2004.06660", 391 "relevance": "Foundational work on weight poisoning in pre-trained models, directly relevant to the model distribution platform attack vector." 392 }, 393 { 394 "title": "PoisonPrompt: Backdoor attack on prompt-based large language models", 395 "authors": ["H. Yao", "J. Lou", "Z. 
Qin"], 396 "year": 2024, 397 "relevance": "Directly relevant backdoor attack on prompt-based LLMs, closely related to the attack methodology in AEA." 398 }, 399 { 400 "title": "Scalable extraction of training data from (production) language models", 401 "authors": ["M. Nasr", "N. Carlini", "J. Hayase"], 402 "year": 2023, 403 "arxiv_id": "2311.17035", 404 "relevance": "Important work on LLM data extraction attacks, relevant to the broader threat landscape of LLM security." 405 }, 406 { 407 "title": "A survey on evaluation of large language models", 408 "authors": ["Y. Chang", "X. Wang", "J. Wang"], 409 "year": 2024, 410 "relevance": "Survey on LLM evaluation methodology relevant to assessing how LLM security evaluations should be conducted." 411 } 412 ] 413 }