scan.json (26262B)
1 { 2 "paper": { 3 "title": "Decoding Latent Attack Surfaces in LLMs: Prompt Injection via HTML in Web Summarization", 4 "authors": ["Ishaan Verma", "Arsheya Yadav"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2509.05831", 8 "doi": "10.48550/arXiv.2509.05831" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "HTML-based prompt injection attacks using non-visible elements (meta tags, opacity-zero divs, HTML comments) can manipulate LLM web summarization outputs. Llama 4 Scout was notably more susceptible (29.3% injection success) than Gemma 9B IT (15.7%) across 140 injected web pages and 8 injection techniques. Meta tags and opacity-zero divs were the most effective attack vectors, causing both lexical (ROUGE-L) and semantic (SBERT cosine similarity) divergence from clean summaries.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "A GitHub repository is provided in the Appendix: https://github.com/ishaanv1206/Decoding-Latent-Attack-Surfaces-in-LLMs-Prompt-Injection-via-HTML-in-Web-Summarization, containing evaluation scripts, HTML generation code, and model outputs." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The Appendix states the repository includes 'clean/ and injected/: HTML pages used for evaluation' as well as 'gemma.csv, llama.csv: Summarization outputs from each model' and 'metadata.csv'." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or dependency specifications are mentioned in the paper. The paper mentions Python scripts and Playwright but does not specify library versions or environment setup details."
30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The Appendix lists repository contents but provides no step-by-step reproduction instructions, README with commands, or explicit guidance on how to replicate the experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Only point estimates are reported (e.g., average ROUGE-L of 0.3011, SBERT cosine similarity of 0.6980). No confidence intervals or error bars are provided." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims Llama 4 Scout is 'more susceptible' and certain techniques are 'most effective' based solely on comparing raw numbers. No statistical significance tests (t-tests, chi-squared, etc.) are applied." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": false, 51 "justification": "No effect sizes (Cohen's d, odds ratios, etc.) are reported. Only raw success counts and average similarity scores are presented." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The dataset consists of 282 pages (141 clean, 141 injected) but no justification is given for this sample size, and no power analysis is discussed." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported. It is unclear whether experiments were run once or multiple times." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Clean page summaries serve as baselines against which injected page summaries are compared. This is the core experimental design (clean vs. injected pairs)." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "Both evaluated models are current: Llama 4 Scout (Meta, 2025) and Gemma 9B IT (Google, 2024). These are recent, publicly available models." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "While results are broken down by injection technique (Table 2), there is no systematic ablation study removing or modifying individual components of the injection or evaluation pipeline." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Three evaluation approaches are used: ROUGE-L (lexical overlap), SBERT cosine similarity (semantic similarity), and manual annotation of injection success." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section III states: 'the \"Injection Successful\" outcome was determined through manual inspection' of LLM summary outputs for evidence of adversarial influence." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "No train/test split is discussed. The entire dataset is used for evaluation with no hold-out set, though since no model tuning is performed, this is less critical." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 2 provides per-injection-technique breakdowns of successful injections for both models. Results by injection type are also discussed for ROUGE-L and SBERT scores." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "The paper provides one qualitative example of a successful injection (pirate persona) but does not systematically discuss failure cases — where and why injections failed to influence model outputs." 
104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table 2 shows several injection techniques with zero successes (hidden script: 0 for Gemma; base64, ARIA label, alt text not listed as successful for either model), indicating that these techniques were ineffective." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims that 'a significant proportion of injected pages led to measurable semantic and stylistic shifts,' which is supported by Table 1 showing 29.3% and 15.7% success rates and ROUGE-L/SBERT divergences." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The study uses a controlled experimental design where the only difference between clean and injected pages is the injection payload. This controlled single-variable manipulation is adequate for the causal claims made (injections cause output changes)." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title says 'LLMs' broadly and the abstract refers to 'LLM-driven web pipelines' generally, but only two open-source models are tested. The paper does not bound its claims to these specific models or acknowledge the narrow scope." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations are considered for the observed differences between models or injection techniques. For example, the paper does not consider whether model size, training data composition, or instruction-tuning approach might explain differential susceptibility." 
131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper uses ROUGE-L and SBERT similarity as proxies for 'injection impact' without discussing whether these metrics fully capture what constitutes a successful or harmful injection in real-world scenarios. The gap between summary divergence and actual harm is not addressed." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "Models are identified as 'Llama 4 Scout' and 'Gemma 9B IT' without exact version identifiers, snapshot dates, or API versions. Reference [19] mentions 'Llama-4-Scout-17B-16E' but this full identifier is not used in the paper itself." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper describes the prompt as 'a standardized prompt instructing the LLM to generate a one-paragraph summary of the web page' but does not provide the actual prompt text used." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported for either model's inference configuration." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The models are prompted directly to generate summaries." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section III describes the full pipeline: HTML page creation with CSS styling, injection technique application, hosting on GitHub Pages, Playwright-based extraction of HTML source and rendered text, and standardized prompting for summarization." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations section. 
The conclusion mentions 'future work' directions but does not discuss current study limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of how the synthetic dataset, small sample size, or two-model scope might threaten validity." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The paper does not identify what the results do NOT show or what settings are excluded from the claims." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "The Appendix provides a GitHub repository containing raw HTML files (clean/ and injected/), model outputs (gemma.csv, llama.csv), and metadata (metadata.csv)." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section III describes how 28 content categories were used, how injection techniques were implemented and randomly assigned, how pages were hosted on GitHub Pages, and how Playwright extracted HTML source and rendered text." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants are involved. The dataset is entirely synthetic/self-constructed, not drawn from a standard benchmark." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from HTML generation → injection → hosting → Playwright extraction → LLM summarization → metric computation → manual annotation is documented in Section III, though the jump from '28 static HTML pages' to 141 clean + 141 injected is not fully explained." 
202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is mentioned anywhere in the paper. There is no acknowledgments section disclosing grants or sponsors." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: both authors are from Manipal University Jaipur (Department of Computer Science and Engineering, Department of Data Science and Engineering)." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": false, 217 "answer": false, 218 "justification": "This appears to be unfunded university student work. No funding source is mentioned." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is included in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "This paper tests LLM susceptibility to prompt injection attacks, not model knowledge on a benchmark. It evaluates a security vulnerability rather than model capability, so contamination concerns do not apply." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "The study tests defense/vulnerability to prompt injection, not model knowledge. Contamination of the synthetic HTML pages in training data is not the same concern as benchmark contamination." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "The study tests prompt injection susceptibility rather than model capability on a knowledge benchmark. Traditional benchmark contamination does not apply." 
241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study. The dataset is synthetic and evaluation combines automated metrics with manual annotation by the authors." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants. The study evaluates LLM outputs on synthetic web pages." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost, API costs, tokens consumed, or wall-clock time is reported for either model's summarization runs." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No information about computational resources, GPU hours, or hardware used is provided." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds. It appears experiments were run once per page per model." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never stated.
It is unclear if each page was summarized once or multiple times." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search is mentioned. Model inference settings (temperature, etc.) are not even reported, let alone justified through search." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No configuration selection process is described. The paper does not explain how model settings or prompt design were chosen." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Multiple comparisons are made across 8 injection techniques and 2 models, but no statistical tests are performed at all, let alone corrections for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors evaluate their own attack framework and dataset without acknowledging the bias inherent in authors evaluating their own system." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No compute budget information is provided, so performance cannot be contextualized against resource usage." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether its synthetic HTML pages and injection techniques are representative of real-world web content and actual attack scenarios. Construct validity of the benchmark is not addressed." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. Models are prompted directly for summarization." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "The synthetic HTML pages were hosted on GitHub Pages and could theoretically appear in model training data. No temporal analysis or discussion of whether models may have seen similar content is provided." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup might inadvertently leak information about injection presence to the model through formatting or structural cues." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Pages are generated from 28 content templates, meaning multiple test pages share structural similarities. This non-independence is not discussed." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Llama 4 Scout has a 29.29% injection success rate while Gemma 9B IT has a 15.71% success rate across HTML-based prompt injections.", 365 "evidence": "Table 1 in Section IV reports 41/140 successful injections for Llama 4 Scout (29.29%) and 22/140 for Gemma 9B IT (15.71%), determined by manual annotation.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Meta tags, opacity-zero divs, and comment injections are the most effective HTML-based prompt injection techniques.", 370 "evidence": "Table 2 shows meta tag (17 Llama, 6 Gemma), comment injection (12, 7), and opacity div (10, 9) as the most successful techniques. 
Hidden scripts had minimal success (2, 0).", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "HTML-based prompt injections cause measurable lexical and semantic divergence in LLM summaries.", 375 "evidence": "Table 1 reports average ROUGE-L of 0.3011/0.3270 and SBERT cosine similarity of 0.6980/0.6945 between clean and injected summary pairs, indicating divergence from clean outputs.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Conventional sanitization methods are insufficient to mitigate HTML-based injection threats.", 380 "evidence": "Stated in the conclusion but supported only by the observation that non-visible HTML elements successfully influenced model outputs. No actual sanitization methods were tested or evaluated.", 381 "supported": "weak" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "Internal numerical inconsistencies", 387 "detail": "The paper states '141 clean and 141 injected pages' in Section III but Table 1 shows '140' total injected files. Success rates differ between Table 1 (29.29%, 15.71%, i.e., 41/140 and 22/140) and the discussion text (29.08%, 15.60%, which match the same counts divided by 141), so the two sets of figures use different denominators. The methodology describes '28 static HTML pages' but the total is 282, with no clear explanation of how 28 templates became 141 instances." 388 }, 389 { 390 "flag": "No statistical testing whatsoever", 391 "detail": "All comparisons between models and injection techniques are based on raw counts and averages. No significance tests, confidence intervals, or effect sizes are reported, making it impossible to determine whether differences are meaningful or due to chance." 392 }, 393 { 394 "flag": "Manual annotation without inter-rater reliability", 395 "detail": "Injection success was determined through manual inspection, but there is no mention of multiple annotators, inter-rater agreement metrics (e.g., Cohen's kappa), or annotation guidelines. This appears to be a single-annotator assessment with no reported reliability check."
396 }, 397 { 398 "flag": "Overclaiming from narrow scope", 399 "detail": "The paper tests only two open-source models but makes broad claims about 'LLM' vulnerability and 'state-of-the-art models.' No proprietary models (GPT-4, Claude) are tested, and the two models tested are not the most widely deployed in production summarization systems." 400 }, 401 { 402 "flag": "No limitations section", 403 "detail": "The paper contains no dedicated limitations section and does not acknowledge any threats to validity, which is a significant omission for a security research paper making broad claims." 404 }, 405 { 406 "flag": "Small and synthetic dataset", 407 "detail": "Only 140 injected pages from 28 content templates were tested, with injection type randomly assigned. The synthetic nature of the pages (generated with lorem-ipsum-style content) limits ecological validity for real-world web summarization scenarios." 408 } 409 ], 410 "cited_papers": [ 411 { 412 "title": "Automatic and Universal Prompt Injection Attacks against Large Language Models", 413 "authors": ["X. Liu", "Z. Yu", "Y. Zhang", "N. Zhang", "C. Xiao"], 414 "year": 2024, 415 "arxiv_id": "2403.04957", 416 "relevance": "Introduces a universal framework for prompt injection attacks demonstrating vulnerabilities in defense-equipped LLMs." 417 }, 418 { 419 "title": "Prompt Injection attack against LLM-integrated Applications", 420 "authors": ["Y. Liu", "G. Deng", "Y. Li", "K. Wang", "Z. Wang", "X. Wang", "T. Zhang", "Y. Liu", "H. Wang", "Y. Zheng", "Y. Liu"], 421 "year": 2023, 422 "arxiv_id": "2306.05499", 423 "relevance": "Proposes a goal-guided generative method for prompt injection attacks against LLM-integrated applications." 424 }, 425 { 426 "title": "Prompt Injection Attacks on Large Language Models in Realistic Settings", 427 "authors": ["J. Clusmann", "M. Bontrager", "M. 
Maldonado"], 428 "year": 2024, 429 "arxiv_id": "2407.18981", 430 "relevance": "Studies prompt injection attacks in realistic deployment settings, including system prompt leakage scenarios." 431 }, 432 { 433 "title": "Evaluating and Improving Robustness in Large Language Models: A Survey", 434 "authors": ["Y. Yang", "Z. Wang", "H. Liu"], 435 "year": 2024, 436 "arxiv_id": "2506.11111", 437 "relevance": "Survey demonstrating that LLMs vary significantly in resilience across datasets and attack strategies." 438 }, 439 { 440 "title": "Retrieval-Augmented In-Context Learning Attacks and Defenses", 441 "authors": ["Q. Yu", "T. Huang", "Z. Deng"], 442 "year": 2024, 443 "arxiv_id": "2402.10928", 444 "relevance": "Reveals weaknesses in retrieval-augmented in-context learning when exposed to embedded adversarial prompts." 445 }, 446 { 447 "title": "Raze to the Ground: Query-Efficient Adversarial HTML Attacks on Machine-Learning Phishing Webpage Detectors", 448 "authors": ["L. Tao", "M. Li", "H. Li"], 449 "year": 2023, 450 "relevance": "Empirical study showing that HTML-based manipulations can evade conventional sanitization methods in ML systems." 451 }, 452 { 453 "title": "A Real-World Case Study of Attacking ChatGPT via Prompt Injection", 454 "year": 2024, 455 "arxiv_id": "2504.16125", 456 "relevance": "Real-world case study of prompt injection attacks against ChatGPT, directly relevant to LLM security evaluation." 457 } 458 ], 459 "engagement_factors": { 460 "practical_relevance": { 461 "score": 2, 462 "justification": "Web developers integrating LLMs into summarization pipelines can use these findings to identify and mitigate HTML-based injection vectors." 463 }, 464 "surprise_contrarian": { 465 "score": 1, 466 "justification": "Confirms the known vulnerability of LLMs to prompt injection through a specific HTML channel; results are expected rather than surprising." 
467 }, 468 "fear_safety": { 469 "score": 2, 470 "justification": "Demonstrates concrete invisible injection attacks through common HTML elements that could manipulate LLM-powered web tools without user awareness." 471 }, 472 "drama_conflict": { 473 "score": 0, 474 "justification": "No controversy, no criticism of specific companies or products, purely a vulnerability demonstration study." 475 }, 476 "demo_ability": { 477 "score": 2, 478 "justification": "Code and dataset available on GitHub with HTML pages, evaluation scripts, and model outputs, allowing reproduction." 479 }, 480 "brand_recognition": { 481 "score": 1, 482 "justification": "Tests Meta's Llama 4 and Google's Gemma — recognizable but not top-tier attention magnets like GPT-4 or ChatGPT." 483 } 484 } 485 }