scan.json (31486B)
1 { 2 "paper": { 3 "title": "Hijacking Large Language Models via Adversarial In-Context Learning", 4 "authors": [ 5 "Xiangyu Zhou", 6 "Yao Qiang", 7 "Saleh Zare Zade", 8 "Prashant Khanduri", 9 "Dongxiao Zhu" 10 ], 11 "year": 2023, 12 "venue": "arXiv.org", 13 "arxiv_id": "2311.09948", 14 "doi": "10.48550/arXiv.2311.09948" 15 }, 16 "scan_version": 3, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "The paper proposes a greedy gradient-guided injection (GGI) attack that appends adversarial suffixes to in-context demonstrations, hijacking LLMs to produce targeted outputs or harmful responses. GGI achieves near-perfect attack success rates on classification tasks (collapsing negative accuracy to ~0%) and high jailbreak ASR (e.g., 97.2% on Vicuna-7b with 4-shots). The attack transfers across demo sets and datasets and maintains low perplexity (stealthy). A simple defense of appending clean demonstrations at inference time substantially mitigates the attack, with the prefix strategy recovering up to 86% of lost accuracy.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": true, 25 "justification": "Code is released at https://github.com/xzhou98/Hijacking-LLMs-GGI, as stated in the abstract." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "All datasets used are publicly available: SST-2, Rotten Tomatoes, AG's News, and AdvBench. The paper references these standard benchmarks and does not use proprietary data." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The appendix states 'All experiments are conducted on 2 NVIDIA H100 GPU cards in a single node' but provides no software dependencies, library versions, requirements.txt, or environment specifications." 
36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub repository is linked but the paper itself does not include commands or a reproduction guide." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results in Tables 1-3 and 5-6 report point estimates only. Despite averaging over 5 repetitions of demo selection, no confidence intervals, error bars, or ± notation are reported." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper makes numerous comparative claims (e.g., 'GGI achieves SOTA jailbreaking performance', 'GGI remains highly effective even on the SOTA LLMs') based solely on comparing raw numbers without any statistical significance tests." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Results are presented with baseline context throughout. Tables show clean accuracy vs attack accuracy (e.g., negative accuracy drops from 93.8% to 2.0%), ASR values with and without defense, and PRR percentages quantifying recovery magnitude." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "No justification is given for why 2 training queries are used to learn adversarial suffixes for sentiment tasks, 4 for AG's News and jailbreak, or 1000 test queries. No power analysis is discussed." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "Section 5 states 'we randomly select the demos from the training set and repeat this process 5 times, reporting the average accuracy over the repetitions,' but no standard deviation, interquartile range, or any spread measure is reported in any table." 
68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Multiple baselines are included: Square Attack, Greedy Search, TextAttack (TA), and ICA for jailbreak. Defense baselines include Onion and a benign instruction approach. Details in Appendix B." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Baselines include ICA (2023), Square Attack (2020), TextAttack (2020), and Onion (2020). ICA is the most closely related concurrent work. The older baselines are justified as representative of different attack paradigms (black-box, character-level, word-level)." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": false, 84 "justification": "No ablation study is performed on the GGI attack components (e.g., removing the gradient guidance, varying top-k, removing the random subset sampling). The defense variants (Pre. vs Suf.) are compared but this is not an ablation of the attack method itself." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Multiple metrics are used: per-class accuracy (positive and negative separately), Attack Success Rate (ASR) for jailbreak, Performance Recovery Rate (PRR) for defense evaluation, and perplexity scores for stealthiness." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "No human evaluation is conducted. The paper claims the attack is 'imperceptible' and 'stealthy' but assesses stealthiness only via automated perplexity scores (Figure 3), not human judgment of whether adversarial suffixes are detectable." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Training queries (2-4 examples per Table 4) used to learn adversarial suffixes are separate from the 1000 test queries used for evaluation. The split is clearly described." 
100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down per model, per dataset, per few-shot setting (2/4/8-shots), and per sentiment class (positive vs negative). Tables 1, 2, 3, 5, and 6 all provide detailed breakdowns." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper discusses where attacks are less effective: baseline attacks fail on stronger models like LLaMA3.1-8b (Section 6.2), the Suf. defense fails on some models with PRR=0% (Table 2), and LLaMA3.1-8b-Instruct resists jailbreaking more effectively (Table 3, ASR remains low for baselines)." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Negative results include: Suf. defense showing PRR=0% on Vicuna-7b and LLaMA-13b for classification (Table 2), Onion defense underperforming across most settings, and the benign instruction defense failing to mitigate jailbreak attacks (Table 3)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims about effectiveness (near-perfect hijacking in Tables 1, 5), stealthiness (low perplexity in Figure 3), transferability (Figure 2), and defense effectiveness (Tables 2, 3) are all supported by corresponding experimental results." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "The paper makes causal claims about adversarial suffixes causing misclassification and jailbreaking. The study design uses controlled comparisons (clean vs. adversarial demos, with vs. without defense) which adequately supports these claims. The attention analysis in Figure 6 provides a mechanistic explanation." 
127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title claims 'Hijacking Large Language Models' generally, but experiments are limited to models up to 13B parameters from specific families (OPT, LLaMA, Vicuna, Mistral). The abstract says 'leveraging LLMs' and 'diverse LLMs' without bounding to the tested range. No discussion of whether results extend to larger models (70B+) or closed-source models." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper provides one attention analysis example (Figure 6) as explanation for why GGI works but does not consider alternative explanations. For instance, it does not discuss whether the attack's effectiveness could be partially attributed to the small demo set size, model-specific vulnerabilities, or dataset characteristics." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures accuracy and ASR and claims exactly those things. ASR is defined precisely (absence of refusal phrases) and accuracy is standard. The stealthiness claim uses perplexity as an explicit proxy, though without fully discussing its limitations." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Model names are specified with sizes and versions: OPT-6.7b, Vicuna-7b-v1.5, LLaMA-13b (cited to LLaMA 1 paper), LLaMA3.1-8b, Mistral-7B-Instruct, LLaMA3-8b-Instruct, LLaMA3.1-8b-Instruct. For these open-source models, the name+size+citation identifies the exact HuggingFace checkpoint." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Full prompt templates with concrete examples are provided in Table 7 for SST-2/RT, AG's News, and AdvBench. The benign instruction for jailbreak defense is provided verbatim in Appendix A." 
154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": false, 158 "justification": "Algorithm 1 defines parameters T (iterations), b (batch size), and k (top-k) but their actual values are not stated in the paper. LLM inference settings (temperature, top-p) are also not reported." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. The approach directly manipulates prompt tokens without any agent framework, tool use, or workflow orchestration." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Table 4 documents training/test splits per dataset. The ICL settings section describes demo selection (random from training set, 5 repetitions). Template structures are shown in Table 7. The AdvBench evaluation uses 200 randomly selected harmful queries from the benchmark." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "A 'Limitations' paragraph appears at the end of Section 8 (Conclusion), discussing the unexplored impact of adversarial token placement within demos." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "The limitations section identifies a specific threat: 'Our analysis has not yet examined the impact of adversarial token placement within the demos (e.g., at the prefix, middle, or suffix) on the effectiveness of the attack.' This is specific to their study design." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The paper does not explicitly state what the results do NOT show. No discussion of model size boundaries, closed-source model applicability, whether results generalize beyond English, or the limited task diversity (only classification and jailbreak)." 
186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw experimental data (per-sample predictions, learned adversarial suffixes, intermediate optimization logs) is made available. Only aggregated results in tables are reported." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Dataset sources are clearly identified (SST-2, Rotten Tomatoes, AG's News, AdvBench). Table 4 provides statistics on training and test queries. Selection procedures for demos and test queries are described." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. All data sources are standard public benchmarks (SST-2, RT, AG's News, AdvBench)." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline is documented: select training queries (Table 4) → learn adversarial suffixes via GGI (Algorithm 1) → construct adversarial demos → evaluate on 1000 test queries with random demo selection repeated 5 times. The defense pipeline (appending clean demos) is also clearly described." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding acknowledgment, grants, or sponsorship information appears anywhere in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: Wayne State University and Oakland University. No evaluated product is affiliated with these institutions." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "Cannot assess funder independence because funding is not disclosed. The absence of a funding disclosure makes this unanswerable." 
225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement appears in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": false, 235 "answer": false, 236 "justification": "This is a red-teaming/adversarial attack study testing defense robustness rather than evaluating pre-trained model knowledge on benchmarks. The core contribution is the attack mechanism, not model capability." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": false, 240 "answer": false, 241 "justification": "As an adversarial attack study, the concern is whether attacks can manipulate model behavior, not whether models have memorized benchmark answers. Contamination does not affect the validity of attack success measurements." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": false, 245 "answer": false, 246 "justification": "This is a red-teaming study testing adversarial robustness of ICL, not evaluating model capabilities on benchmarks. Benchmark contamination is not a threat to the validity of the attack evaluation." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study. All evaluations are automated using benchmark datasets." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants. The paper includes an Ethics Statement discussing responsible disclosure intent." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 
269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": false, 290 "justification": "No inference cost, latency, or tokens consumed are reported. The attack requires gradient computation over the full vocabulary for each demo position across T iterations, which is computationally expensive, but this cost is not quantified." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "The appendix mentions '2 NVIDIA H100 GPU cards' as hardware but does not state total GPU hours, wall-clock time, or any compute budget for the experiments." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "While the paper averages over 5 repetitions of random demo selection, it does not report variance across these runs or analyze seed sensitivity. Only averaged point estimates are shown." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": true, 307 "justification": "Section 5 (ICL Settings) states: 'we randomly select the demos from the training set and repeat this process 5 times, reporting the average accuracy over the repetitions.'" 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "No hyperparameter search budget is reported. The values for T (iterations), k (top-k), and b (batch size) in Algorithm 1 appear tuned but no search process is described." 
313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": false, 317 "justification": "No description of how the final attack configuration was selected. The paper does not explain how hyperparameters for GGI were chosen or whether multiple configurations were tried." 318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "The paper makes comparisons across 4+ models, 2-3 datasets, 3 few-shot settings, and 4+ attack methods without any correction for multiple comparisons." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The authors implement their own versions of baseline attacks (Greedy, Square) and compare against their proposed GGI without acknowledging potential self-comparison bias in baseline implementations." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": false, 332 "justification": "GGI requires gradient computation over the full vocabulary (gradient-based) while baselines like Square are black-box. This compute asymmetry is not discussed or controlled for in comparisons." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "The paper uses ASR (absence of refusal phrases) as the jailbreak metric without discussing whether string-matching for refusal phrases adequately captures harmful content generation. No discussion of whether SST-2 accuracy adequately measures attack effectiveness." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding is involved. The attack operates directly on prompt tokens without any agentic framework." 
343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "No discussion of whether SST-2, RT, or AG's News examples appeared in the training data of the evaluated models. These benchmarks predate all tested models." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup provides information not available in realistic deployment scenarios." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "No discussion of independence between training queries used for learning adversarial suffixes and test queries, or whether structural similarities between datasets affect results." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, or decontamination pipelines are used." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "GGI consistently hijacks LLMs to generate targeted positive sentiment, achieving near-perfect positive accuracy and collapsing negative accuracy to near 0% across most settings.", 371 "evidence": "Table 1 shows GGI achieving 100% positive accuracy and 0-2% negative accuracy across OPT-6.7b and LLaMA-13b at 8-shots. 
Even on LLaMA3.1-8b (strongest model), negative accuracy drops to 0% at 8-shots.", 372 "supported": "strong" 373 }, 374 { 375 "claim": "GGI achieves state-of-the-art jailbreaking performance across all models and settings.", 376 "evidence": "Table 3 shows GGI achieving 76.1-97.2% ASR on Vicuna-7b, 90.5-98.9% on Mistral-7B-Instruct, and 28.6-62.0% on LLaMA3.1-8b-Instruct, consistently outperforming ICA, Square, and Greedy baselines.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "The Prefix defense method substantially recovers model performance against hijacking attacks, with PRR frequently exceeding 65%.", 381 "evidence": "Table 2 shows PRR values of 58.8-81.1% for Pre. defense across OPT-6.7b, Vicuna-7b, LLaMA-13b on SST-2 and RT. Pre. consistently outperforms Onion baseline.", 382 "supported": "moderate" 383 }, 384 { 385 "claim": "The Suffix defense consistently delivers the strongest jailbreak mitigation, reducing ASR to single digits.", 386 "evidence": "Table 3 shows Suf. reducing ASR to 1.5-4.4% on Vicuna-7b and 4.9-8.6% on Mistral-7B-Instruct under GGI attack. However, results on LLaMA3.1-8b-Instruct are less dramatic (4.1-9.3%).", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "GGI exhibits transferability across different demo sets and across different datasets of the same task.", 391 "evidence": "Figure 2 shows adversarial tokens learned from one demo set achieving high ASR when applied to three different demo sets on both SST-2 and RT. Tokens learned from RT transfer to SST-2.", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "GGI maintains strong stealthiness with perplexity scores comparable to clean inputs, making it difficult to detect via perplexity-based defenses.", 396 "evidence": "Figure 3 shows average perplexity of 19.36 for GGI vs 13.8 for clean inputs on LLaMA-13b with RT, while TextAttack reaches 46.39. 
However, this is tested on only one model/dataset combination.", 397 "supported": "moderate" 398 }, 399 { 400 "claim": "Smaller LLMs are more vulnerable to adversarial hijacking attacks.", 401 "evidence": "Table 6 compares OPT-2.7b vs OPT-6.7b and LLaMA-7b vs LLaMA-13b, showing that the attack reaches 100% success more easily on the smaller models. Figure 4 confirms this with an ASR comparison on AG's News.", 402 "supported": "moderate" 403 } 404 ], 405 "red_flags": [ 406 { 407 "flag": "No uncertainty quantification despite multiple runs", 408 "detail": "The paper averages over 5 repetitions of demo selection but reports no variance, standard deviation, or confidence intervals in any table. This hides potential instability in results, especially for settings with more moderate attack effectiveness." 409 }, 410 { 411 "flag": "No statistical significance tests for comparative claims", 412 "detail": "All claims of superiority ('GGI achieves SOTA', 'outperforms all baselines') are based on comparing raw numbers without any statistical tests. Given the variance in ICL performance that the paper itself acknowledges, this is a notable omission." 413 }, 414 { 415 "flag": "Compute-unfair baseline comparison", 416 "detail": "GGI uses gradient-based optimization (requiring full model access and backpropagation over the vocabulary), while Square Attack is a black-box method. The paper does not control for or discuss the compute asymmetry, making it unclear how much of GGI's advantage comes from compute rather than algorithmic innovation." 417 }, 418 { 419 "flag": "Stealthiness claim based on limited evaluation", 420 "detail": "The 'imperceptible' and 'stealthy' claims rest entirely on perplexity scores from one model (LLaMA-13b) on one dataset (RT) in Figure 3. No human evaluation of detectability is conducted, and adversarial suffixes like 'NULL', 'Remove', 'For', and 'refresh Real result' shown in figures would be noticeable to a careful reader." 
421 }, 422 { 423 "flag": "Missing hyperparameter details", 424 "detail": "Key algorithm parameters (iterations T, top-k value, batch size b, and LLM temperature/sampling settings) are not reported, making the attack difficult to reproduce or fairly evaluate." 425 }, 426 { 427 "flag": "Jailbreak ASR metric limitations", 428 "detail": "ASR is defined as absence of refusal phrases, but the paper does not verify whether the generated content is actually harmful or coherent. A model that generates gibberish without a refusal phrase would count as a successful jailbreak under this metric." 429 } 430 ], 431 "cited_papers": [ 432 { 433 "title": "Universal and transferable adversarial attacks on aligned language models", 434 "authors": ["Andy Zou"], 435 "year": 2023, 436 "arxiv_id": "2307.15043", 437 "relevance": "Foundational gradient-based adversarial attack (GCG) on aligned LLMs; provides AdvBench benchmark used in this paper." 438 }, 439 { 440 "title": "Adversarial demonstration attacks on large language models", 441 "authors": ["Jiongxiao Wang"], 442 "year": 2023, 443 "arxiv_id": "2305.14950", 444 "relevance": "Most closely related prior work on adversarial attacks targeting ICL demonstrations; this paper's TextAttack baseline follows their approach." 445 }, 446 { 447 "title": "Backdoor attacks for in-context learning with language models", 448 "authors": ["Nikhil Kandpal"], 449 "year": 2023, 450 "arxiv_id": "2307.14692", 451 "relevance": "Proposes backdoor attacks against ICL by fine-tuning on poisoned training samples; addresses the security of in-context learning." 452 }, 453 { 454 "title": "Jailbreak and guard aligned language models with only few in-context demonstrations", 455 "authors": ["Zeming Wei"], 456 "year": 2023, 457 "arxiv_id": "2310.06387", 458 "relevance": "Shows that in-context demos alone can jailbreak aligned LLMs; serves as the ICA baseline and inspires the defense strategy in this paper." 
459 }, 460 { 461 "title": "Universal vulnerabilities in large language models: Backdoor attacks for in-context learning", 462 "authors": ["Shuai Zhao", "Meihuizi Jia", "Luu Anh Tuan", "Fengjun Pan", "Jinming Wen"], 463 "year": 2024, 464 "arxiv_id": "2401.05949", 465 "relevance": "Introduces ICLAttack inserting backdoor triggers into ICL demos and queries; directly related to adversarial ICL security." 466 }, 467 { 468 "title": "Baseline defenses for adversarial attacks against aligned language models", 469 "authors": ["Neel Jain"], 470 "year": 2023, 471 "arxiv_id": "2309.00614", 472 "relevance": "Proposes perplexity-based defenses against adversarial prompts; this paper evaluates stealthiness against such defenses." 473 }, 474 { 475 "title": "Survey of vulnerabilities in large language models revealed by adversarial attacks", 476 "authors": ["Erfan Shayegani"], 477 "year": 2023, 478 "arxiv_id": "2310.10844", 479 "relevance": "Comprehensive survey of LLM adversarial vulnerabilities; provides broader context for the attack landscape." 480 }, 481 { 482 "title": "Test-time backdoor mitigation for black-box large language models with defensive demonstrations", 483 "authors": ["Wenjie Mo"], 484 "year": 2023, 485 "arxiv_id": "2311.09763", 486 "relevance": "Proposes test-time defense using clean demonstrations against backdoor attacks; directly inspires the defense strategy in this paper." 487 }, 488 { 489 "title": "Cold-attack: Jailbreaking LLMs with stealthiness and controllability", 490 "authors": ["Xingang Guo"], 491 "year": 2024, 492 "arxiv_id": "2402.08679", 493 "relevance": "Addresses stealthy jailbreak attacks on LLMs; related to the adversarial prompt generation space." 494 }, 495 { 496 "title": "The llama 3 herd of models", 497 "authors": ["Aaron Grattafiori", "Abhimanyu Dubey"], 498 "year": 2024, 499 "arxiv_id": "2407.21783", 500 "relevance": "Describes the LLaMA 3 model family used as primary evaluation target in this paper's experiments." 
501 }, 502 { 503 "title": "Mitigating fine-tuning jailbreak attack with backdoor enhanced alignment", 504 "authors": ["Jiongxiao Wang"], 505 "year": 2024, 506 "arxiv_id": "2402.14968", 507 "relevance": "Proposes alignment-based defense against jailbreak attacks from fine-tuning; related to LLM safety mechanisms." 508 }, 509 { 510 "title": "Safety alignment should be made more than just a few tokens deep", 511 "authors": ["Xiangyu Qi", "Ashwinee Panda"], 512 "year": 2024, 513 "arxiv_id": "2406.05946", 514 "relevance": "Analyzes the depth of safety alignment in LLMs, relevant to understanding why adversarial ICL can bypass safety mechanisms." 515 } 516 ], 517 "engagement_factors": { 518 "practical_relevance": { 519 "score": 2, 520 "justification": "The defense strategy (appending clean demos) is immediately usable by practitioners deploying ICL-based systems, though the attack requires white-box model access." 521 }, 522 "surprise_contrarian": { 523 "score": 1, 524 "justification": "Adversarial vulnerability of ICL extends known concerns about LLM robustness but doesn't overturn any widely-held belief." 525 }, 526 "fear_safety": { 527 "score": 2, 528 "justification": "Demonstrates concrete jailbreaking of aligned models (97% ASR on Vicuna) via adversarial in-context demos, raising legitimate safety concerns about ICL deployment." 529 }, 530 "drama_conflict": { 531 "score": 1, 532 "justification": "No major controversy or provocative framing; positioned as constructive red-teaming with accompanying defense." 533 }, 534 "demo_ability": { 535 "score": 2, 536 "justification": "Code released on GitHub with open-source models (LLaMA, Vicuna, Mistral), allowing reproduction with sufficient compute." 537 }, 538 "brand_recognition": { 539 "score": 1, 540 "justification": "From Wayne State and Oakland University; tests on well-known open-source models but not from a famous AI lab." 541 } 542 } 543 }