scan-v5.json (25591B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Hijacking Large Language Models via Adversarial In-Context Learning", 6 "authors": [ 7 "Yao Qiang", 8 "Xiangyu Zhou", 9 "Saleh Zare Zade", 10 "Prashant Khanduri", 11 "Dongxiao Zhu" 12 ], 13 "year": 2023, 14 "venue": "arXiv.org", 15 "arxiv_id": "2311.09948", 16 "doi": "10.48550/arXiv.2311.09948" 17 }, 18 "checklist": { 19 "claims_and_evidence": { 20 "abstract_claims_supported": { 21 "applies": true, 22 "answer": true, 23 "justification": "Abstract claims of novel attack, gradient-based suffix learning, clean-demo defense, and cross-model transferability are all demonstrated with experimental results in Tables 1–3 and Figures 2–3.", 24 "source": "haiku" 25 }, 26 "causal_claims_justified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Causal claims (adversarial suffixes cause hijacking; clean demos restore behavior) are supported by controlled comparisons across clean/adversarial conditions and prefix/suffix defense variants, with ablation-like comparisons across shot settings and model sizes.", 30 "source": "haiku" 31 }, 32 "generalization_bounded": { 33 "applies": true, 34 "answer": false, 35 "justification": "The attack requires full white-box gradient access (threat model: 'adversarial model publisher with full access to loss values and gradients'), yet the abstract and conclusion make broad claims about 'significant security vulnerabilities of LLMs' without bounding these to gradient-accessible open-source models.", 36 "source": "haiku" 37 }, 38 "alternative_explanations_discussed": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper offers only attention-diversion as the mechanistic explanation (Section F); no alternative explanations for why the attack succeeds or why clean demos restore behavior are discussed.", 42 "source": "haiku" 43 }, 44 "proxy_outcome_distinction": { 45 "applies": true, 46 "answer": true, 47 "justification": "Claims precisely match measurements: classification hijacking is measured as accuracy collapse on the target class; jailbreak success is measured as ASR with explicit definition of what counts as a refusal.", 48 "source": "haiku" 49 } 50 }, 51 "limitations_and_scope": { 52 "limitations_section_present": { 53 "applies": true, 54 "answer": false, 55 "justification": "A two-sentence 'Limitations' paragraph appears at the end of the Conclusion section, which does not constitute a dedicated limitations section.", 56 "source": "haiku" 57 }, 58 "threats_to_validity_specific": { 59 "applies": true, 60 "answer": false, 61 "justification": "The only limitation named is the uninvestigated impact of adversarial token placement within demos; the white-box requirement, limited model set, and use of synthetic harmful queries are not discussed as threats.", 62 "source": "haiku" 63 }, 64 "scope_boundaries_stated": { 65 "applies": true, 66 "answer": false, 67 "justification": "No explicit statement bounds the results to gradient-accessible, open-source models; the threat model defines the adversary capability but the conclusion and abstract imply broader applicability.", 68 "source": "haiku" 69 } 70 }, 71 "conflicts_of_interest": { 72 "funding_disclosed": { 73 "applies": true, 74 "answer": false, 75 "justification": "No funding source or acknowledgment section appears anywhere in the paper.", 76 "source": "haiku" 77 }, 78 "affiliations_disclosed": { 79 "applies": true, 80 "answer": true, 81 "justification": "All five authors have institutional affiliations listed (Wayne State University, Oakland University) on the title page.", 82 "source": "haiku" 83 }, 84 "funder_independent_of_outcome": { 85 "applies": false, 86 "answer": false, 87 "justification": "No funder identified; appears to be unfunded academic work.", 88 "source": "haiku" 89 }, 90 "financial_interests_declared": { 91 "applies": true, 92 "answer": false, 93 "justification": "No competing interests or financial disclosure statement is present in the paper.", 94 "source": "haiku" 95 } 96 }, 97 "scope_and_framing": { 98 "key_terms_defined": { 99 "applies": true, 100 "answer": true, 101 "justification": "ICL is formally defined (Section 2.1 with mathematical notation), adversarial attack objective is formalized (Section 2.2), and the threat model with adversarial capacity is precisely specified (Section 3.2).", 102 "source": "haiku" 103 }, 104 "intended_contribution_clear": { 105 "applies": true, 106 "answer": true, 107 "justification": "Four numbered contributions are explicitly listed at the end of Section 1, covering the attack, the GGI algorithm, the defense mechanism, and comprehensive experimental validation.", 108 "source": "haiku" 109 }, 110 "engagement_with_prior_work": { 111 "applies": true, 112 "answer": true, 113 "justification": "Section 7 provides substantive engagement with prior work across three sub-areas (ICL stability, adversarial LLM attacks, defense strategies), explicitly positioning GGI against existing methods and explaining what differentiates it from backdoor-trigger-dependent approaches.", 114 "source": "haiku" 115 } 116 } 117 }, 118 "type_checklist": { 119 "empirical": { 120 "artifacts": { 121 "code_released": { 122 "applies": true, 123 "answer": true, 124 "justification": "Code is available at https://github.com/xzhou98/Hijacking-LLMs-GGI as stated in the abstract.", 125 "source": "haiku" 126 }, 127 "data_released": { 128 "applies": true, 129 "answer": true, 130 "justification": "All datasets used (SST-2, Rotten Tomatoes, AG's News, AdvBench) are standard public benchmarks used unmodified.", 131 "source": "haiku" 132 }, 133 "environment_specified": { 134 "applies": true, 135 "answer": false, 136 "justification": "Only hardware is mentioned ('2 NVIDIA H100 GPU cards'); no requirements.txt, Dockerfile, Python version, or library versions are provided.", 137 "source": "haiku" 138 }, 139 "reproduction_instructions": { 140 "applies": true, 141 "answer": false, 142 "justification": "Algorithm 1 describes the GGI procedure and code is released, but no step-by-step reproduction instructions (how to run scripts, reproduce specific table numbers) are included in the paper.", 143 "source": "haiku" 144 } 145 }, 146 "statistical_methodology": { 147 "confidence_intervals_or_error_bars": { 148 "applies": true, 149 "answer": false, 150 "justification": "Tables 1–3 and 5–6 report point estimates only; despite 5-repetition averaging mentioned in the appendix, no variance, std dev, or confidence intervals are reported.", 151 "source": "haiku" 152 }, 153 "significance_tests": { 154 "applies": true, 155 "answer": false, 156 "justification": "No statistical significance tests are used for any comparative claims in the paper; differences are presented as raw numbers only.", 157 "source": "haiku" 158 }, 159 "effect_sizes_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "PRR (Performance Recovery Rate) quantifies the percentage of negative-class accuracy recovered relative to clean performance, providing a contextual effect size for defense claims.", 163 "source": "haiku" 164 }, 165 "sample_size_justified": { 166 "applies": true, 167 "answer": false, 168 "justification": "The 1,000 test queries per dataset are stated but not justified via power analysis or any other rationale; the 2–4 training queries are similarly unjustified.", 169 "source": "haiku" 170 }, 171 "variance_reported": { 172 "applies": true, 173 "answer": false, 174 "justification": "The appendix notes results are averaged over 5 repetitions but no standard deviation or variance is reported anywhere in the tables.", 175 "source": "haiku" 176 } 177 }, 178 "evaluation_design": { 179 "baselines_included": { 180 "applies": true, 181 "answer": true, 182 "justification": "Three attack baselines (Square, Greedy, TextAttack) and one defense baseline (Onion) are compared; ICA is included as jailbreak baseline in Table 3.", 183 "source": "haiku" 184 }, 185 "baselines_contemporary": { 186 "applies": true, 187 "answer": true, 188 "justification": "ICA (Wei et al., 2023), ICLAttack (Zhao et al., 2024), and GCG (Zou et al., 2023) are contemporary; Square (2020) and TextAttack (2020) are older but are standard adversarial NLP baselines still used in this literature.", 189 "source": "haiku" 190 }, 191 "ablation_study": { 192 "applies": true, 193 "answer": false, 194 "justification": "No formal ablation of GGI components (e.g., removing stochastic batch sampling, varying top-k, isolating gradient guidance vs. greedy search) is performed; the pre/suf defense comparison is a variant test, not a component ablation.", 195 "source": "haiku" 196 }, 197 "multiple_metrics": { 198 "applies": true, 199 "answer": true, 200 "justification": "The paper uses accuracy (positive/negative separately), PRR (Performance Recovery Rate), and ASR (Attack Success Rate) across different tasks.", 201 "source": "haiku" 202 }, 203 "human_evaluation": { 204 "applies": false, 205 "answer": false, 206 "justification": "Human evaluation is not relevant for automated adversarial attack/defense evaluation on classification and jailbreak tasks.", 207 "source": "haiku" 208 }, 209 "held_out_test_set": { 210 "applies": true, 211 "answer": true, 212 "justification": "2–4 training queries are used for adversarial suffix optimization; 1,000 test queries from held-out splits of standard benchmarks are used for evaluation.", 213 "source": "haiku" 214 }, 215 "per_category_breakdown": { 216 "applies": true, 217 "answer": true, 218 "justification": "Sentiment analysis results are broken down by positive and negative class; AG's News results are broken down by all four topic categories (world, sports, business, tech).", 219 "source": "haiku" 220 }, 221 "failure_cases_discussed": { 222 "applies": true, 223 "answer": true, 224 "justification": "The paper notes cases where the attack is less effective (LLaMA3.1-8b shows higher resistance; GGI is less effective in 2-shot settings) and where the Suf. defense fails for Vicuna-7b and LLaMA-13b.", 225 "source": "haiku" 226 }, 227 "negative_results_reported": { 228 "applies": true, 229 "answer": true, 230 "justification": "The Suf. defense method yields PRR of 0% for several models (Vicuna-7b SST-2/RT, LLaMA-13b SST-2), and GGI shows near-zero ASR on LLaMA3.1-8b-Instruct in the jailbreak task; both are honestly reported.", 231 "source": "haiku" 232 } 233 }, 234 "setup_transparency": { 235 "model_versions_specified": { 236 "applies": true, 237 "answer": true, 238 "justification": "Models are specified with names and sizes (OPT-6.7b, Vicuna-7b, LLaMA-13b, LLaMa3.1-8b, Mistral-7B-Instruct, LLaMA3-8b-Instruct, GPT2-XL), which is sufficient to identify the checkpoints.", 239 "source": "haiku" 240 }, 241 "prompts_provided": { 242 "applies": true, 243 "answer": true, 244 "justification": "Table 7 in the appendix provides the complete prompt templates with structure and filled examples for SST-2/RT, AG's News, and AdvBench; the benign instruction for jailbreak is also reproduced verbatim.", 245 "source": "haiku" 246 }, 247 "hyperparameters_reported": { 248 "applies": true, 249 "answer": false, 250 "justification": "Algorithm 1 lists T (iterations), b (batch size), and k (top-k) as parameters but their specific numerical values used in experiments are not stated in the paper.", 251 "source": "haiku" 252 }, 253 "scaffolding_described": { 254 "applies": false, 255 "answer": false, 256 "justification": "This paper does not involve agentic scaffolding; it evaluates direct prompt-level attacks on LLMs.", 257 "source": "haiku" 258 }, 259 "data_preprocessing_documented": { 260 "applies": true, 261 "answer": true, 262 "justification": "Appendix A describes the number of training queries used per dataset, the 5-repetition averaging procedure, and the random demo selection from training sets.", 263 "source": "haiku" 264 } 265 }, 266 "data_integrity": { 267 "raw_data_available": { 268 "applies": true, 269 "answer": true, 270 "justification": "All datasets (SST-2, RT, AG's News, AdvBench) are publicly available standard benchmarks; the generated adversarial examples can be reproduced via the released code.", 271 "source": "haiku" 272 }, 273 "data_collection_described": { 274 "applies": true, 275 "answer": true, 276 "justification": "Dataset sources are cited with references, dataset statistics provided in Table 4, and the sampling procedure (random selection from training splits) is described in Appendix A.", 277 "source": "haiku" 278 }, 279 "recruitment_methods_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants; standard benchmarks are used with no recruitment.", 283 "source": "haiku" 284 }, 285 "data_pipeline_documented": { 286 "applies": true, 287 "answer": true, 288 "justification": "The pipeline from training query selection → GGI optimization → test query evaluation is described in Algorithm 1 and Appendix A, with the ICL template structure in Table 7.", 289 "source": "haiku" 290 } 291 }, 292 "contamination": { 293 "training_cutoff_stated": { 294 "applies": false, 295 "answer": false, 296 "justification": "The paper evaluates adversarial robustness, not knowledge/capability on benchmarks, so training cutoff is not relevant to the claims.", 297 "source": "haiku" 298 }, 299 "train_test_overlap_discussed": { 300 "applies": false, 301 "answer": false, 302 "justification": "Not applicable for the same reason as above.", 303 "source": "haiku" 304 }, 305 "benchmark_contamination_addressed": { 306 "applies": false, 307 "answer": false, 308 "justification": "The evaluation is of adversarial attack effectiveness, not of the model's memorization of benchmark answers.", 309 "source": "haiku" 310 } 311 }, 312 "human_studies": { 313 "pre_registered": { 314 "applies": false, 315 "answer": false, 316 "justification": "No human participants.", 317 "source": "haiku" 318 }, 319 "irb_or_ethics_approval": { 320 "applies": false, 321 "answer": false, 322 "justification": "No human participants; an Ethics Statement addresses responsible disclosure, not IRB.", 323 "source": "haiku" 324 }, 325 "demographics_reported": { 326 "applies": false, 327 "answer": false, 328 "justification": "No human participants.", 329 "source": "haiku" 330 }, 331 "inclusion_exclusion_criteria": { 332 "applies": false, 333 "answer": false, 334 "justification": "No human participants.", 335 "source": "haiku" 336 }, 337 "randomization_described": { 338 "applies": false, 339 "answer": false, 340 "justification": "No human participants.", 341 "source": "haiku" 342 }, 343 "blinding_described": { 344 "applies": false, 345 "answer": false, 346 "justification": "No human participants.", 347 "source": "haiku" 348 }, 349 "attrition_reported": { 350 "applies": false, 351 "answer": false, 352 "justification": "No human participants.", 353 "source": "haiku" 354 } 355 }, 356 "cost_and_practicality": { 357 "inference_cost_reported": { 358 "applies": true, 359 "answer": false, 360 "justification": "Hardware (2 NVIDIA H100 GPUs) is mentioned, but no inference latency, GPU-hours, or per-attack cost figures are reported.", 361 "source": "haiku" 362 }, 363 "compute_budget_stated": { 364 "applies": true, 365 "answer": false, 366 "justification": "Hardware is stated but total compute time for all experiments is not reported.", 367 "source": "haiku" 368 } 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "GGI consistently achieves near-perfect hijacking (positive accuracy ~100%, negative accuracy ~0%) on sentiment analysis across all tested LLMs including SOTA models like LLaMA3.1-8b.", 375 "evidence": "Table 1 shows 8-shot GGI results: OPT-6.7b N=0.2%, Vicuna-7b N=0.0%, LLaMA-13b N=0.0%, LLaMA3.1-8b N=0.0% on SST-2.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "GGI is transferable across different in-context demo sets and across different datasets of the same task.", 380 "evidence": "Figure 2 shows high ASRs when adversarial tokens learned on one demo set are applied to three distinct demo sets from both SST-2 and RT.", 381 "supported": "strong" 382 }, 383 { 384 "claim": "GGI is stealthy, maintaining perplexity scores comparable to clean prompts, making perplexity-based defenses ineffective.", 385 "evidence": "Figure 3 shows GGI perplexity (19.36) similar to Greedy (18.71) and Square (18.13), vs. TA's 46.39, on LLaMA-13b.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "The prefix clean-demo defense (Pre.) recovers negative-class accuracy with PRR frequently exceeding 65% across datasets.", 390 "evidence": "Table 2 shows Pre. PRR values: OPT-6.7b SST-2 58.8%, RT 65.1%; Vicuna-7b SST-2 81.1%, RT 86.3%; LLaMA-13b SST-2 73.7%, RT 80.6%.", 391 "supported": "strong" 392 }, 393 { 394 "claim": "GGI achieves state-of-the-art jailbreaking performance, outperforming ICA, Square, and Greedy on aligned models.", 395 "evidence": "Table 3 shows GGI achieves 97.2% ASR on Vicuna-7b (4-shot) and 62.0% on LLaMA3.1-8b-Instruct (4-shot), compared to near-zero ASRs for Square and Greedy on the latter.", 396 "supported": "moderate" 397 }, 398 { 399 "claim": "Smaller LLMs are more vulnerable to adversarial attacks than larger ones.", 400 "evidence": "Table 6 and Figure 4 show OPT-2.7b is hijacked to 100% N=0.0% in all shot settings, while OPT-6.7b shows higher residual negative accuracy; Figure 4 shows higher ASRs for OPT-2.7b vs OPT-6.7b.", 401 "supported": "moderate" 402 } 403 ], 404 "methodology_tags": [ 405 "benchmark-eval", 406 "empirical" 407 ], 408 "key_findings": "The GGI (Greedy Gradient-guided Injection) attack appends imperceptible adversarial suffix tokens to in-context demos using gradient-guided optimization, consistently hijacking LLMs to produce target outputs (near 100% targeted sentiment, high jailbreak ASR) while evading perplexity-based detection. The attack transfers across demo sets and datasets without requiring backdoor triggers in user queries. A simple defense of prepending 2-shot clean demos before adversarial demos recovers negative-class accuracy with PRR >65% in most settings and reduces jailbreak ASR to single digits, while the suffix-positioning variant shows mixed effectiveness. Larger models exhibit stronger resistance to the attack.", 409 "red_flags": [ 410 { 411 "flag": "White-box requirement understated", 412 "detail": "The attack requires full gradient access to the target LLM (adversarial model publisher threat model), but the abstract and conclusion make broad claims about 'significant security vulnerabilities of LLMs' without prominently bounding this to the white-box scenario." 413 }, 414 { 415 "flag": "No variance reporting", 416 "detail": "Despite 5-repetition averaging, no standard deviations or confidence intervals are reported for any table, making it impossible to assess whether differences between methods are statistically meaningful." 417 }, 418 { 419 "flag": "Minimal limitations section", 420 "detail": "The limitations section is two sentences in the conclusion, discussing only adversarial token placement; the white-box constraint, limited model coverage, and use of synthetic jailbreak queries are not addressed." 421 }, 422 { 423 "flag": "No component ablation", 424 "detail": "The GGI algorithm combines greedy search, gradient guidance, and stochastic batch sampling, but no ablation tests the contribution of each component individually." 425 }, 426 { 427 "flag": "No funding disclosure", 428 "detail": "The paper contains no acknowledgment or funding section, making it impossible to assess potential conflicts of interest." 429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 434 "relevance": "GCG method — primary gradient-based attack baseline that GGI builds upon; most directly related prior work on gradient-guided adversarial suffix generation." 435 }, 436 { 437 "title": "Adversarial Demonstration Attacks on Large Language Models", 438 "relevance": "Direct prior work on adversarial attacks targeting in-context demonstrations; GGI is positioned as an improvement over this approach." 439 }, 440 { 441 "title": "Jailbreak and Guard Aligned Language Models with Only Few In-Context Demonstrations", 442 "relevance": "ICA baseline for jailbreak attacks using in-context demos; also motivates the clean-demo defense strategy." 443 }, 444 { 445 "title": "Test-Time Backdoor Mitigation for Black-Box Large Language Models with Defensive Demonstrations", 446 "relevance": "Motivates the clean-demo defense approach; directly related to the proposed Pre./Suf. defense strategies." 447 }, 448 { 449 "title": "Universal Vulnerabilities in Large Language Models: Backdoor Attacks for In-Context Learning", 450 "relevance": "ICLAttack — most closely related attack requiring backdoor triggers in user queries, which GGI is designed to overcome." 451 }, 452 { 453 "title": "Backdoor Attacks for In-Context Learning with Language Models", 454 "relevance": "Earlier backdoor ICL attack requiring fine-tuning; establishes the threat landscape that GGI's trigger-free approach addresses." 455 }, 456 { 457 "title": "Language Models are Few-Shot Learners", 458 "relevance": "Foundational paper establishing in-context learning as a paradigm; establishes the ICL setup that is the attack target." 459 }, 460 { 461 "title": "Baseline Defenses for Adversarial Attacks Against Aligned Language Models", 462 "relevance": "Perplexity-based defense that GGI is shown to evade; establishes that perplexity filtering is insufficient against low-perplexity attacks." 463 } 464 ], 465 "engagement_factors": { 466 "practical_relevance": { 467 "score": 2, 468 "justification": "The defense (inject clean demos) is immediately deployable at inference time without model modification, but the attack requires white-box gradient access limiting real-world threat level." 469 }, 470 "surprise_contrarian": { 471 "score": 1, 472 "justification": "Adversarial attacks on LLMs are expected; the main novelty is the specific trigger-free ICL attack surface, not a surprising or counterintuitive finding." 473 }, 474 "fear_safety": { 475 "score": 2, 476 "justification": "Demonstrates jailbreaking of aligned models like LLaMA3.1-8b-Instruct with 62% ASR and reduction of Mistral safeguards to near zero, raising concrete safety concerns about ICL-based deployments." 477 }, 478 "drama_conflict": { 479 "score": 1, 480 "justification": "Standard attack-defense paper in adversarial ML; no controversy angle beyond the jailbreak content." 481 }, 482 "demo_ability": { 483 "score": 2, 484 "justification": "Code is released and all models (OPT, LLaMA, Vicuna, Mistral) are publicly available open-source; replication requires H100-level GPU access for the gradient-based optimization." 485 }, 486 "brand_recognition": { 487 "score": 0, 488 "justification": "Authors are from Wayne State University and Oakland University with no high-profile lab affiliation." 489 } 490 }, 491 "hn_data": { 492 "threads": [ 493 { 494 "hn_id": "42418940", 495 "title": "SlimLM: An Efficient Small Language Model for On-Device Document Assistance", 496 "points": 3, 497 "comments": 0, 498 "url": "https://news.ycombinator.com/item?id=42418940" 499 }, 500 { 501 "hn_id": "40065412", 502 "title": "Show Your Work with Confidence: Confidence Bands for Tuning Curves", 503 "points": 1, 504 "comments": 0, 505 "url": "https://news.ycombinator.com/item?id=40065412" 506 }, 507 { 508 "hn_id": "40014091", 509 "title": "Show Your Work with Confidence: Confidence Bands for Tuning Curves", 510 "points": 1, 511 "comments": 0, 512 "url": "https://news.ycombinator.com/item?id=40014091" 513 } 514 ], 515 "top_points": 3, 516 "total_points": 5, 517 "total_comments": 0 518 } 519 }