scan.json (33211B)
1 { 2 "paper": { 3 "title": "Optimization-based Prompt Injection Attack to LLM-as-a-Judge", 4 "authors": [ 5 "Jiawen Shi", 6 "Zenghui Yuan", 7 "Yinuo Liu", 8 "Yue Huang", 9 "Pan Zhou", 10 "Lichao Sun", 11 "Neil Zhenqiang Gong" 12 ], 13 "year": 2024, 14 "venue": "CCS '24 (ACM SIGSAC Conference on Computer and Communications Security)", 15 "arxiv_id": "2403.17710", 16 "doi": "10.1145/3658644.3690291" 17 }, 18 "scan_version": 3, 19 "active_modules": ["experimental_rigor", "data_leakage"], 20 "methodology_tags": ["benchmark-eval"], 21 "key_findings": "JudgeDeceiver, an optimization-based prompt injection attack on LLM-as-a-Judge, achieves 89–99% attack success rates across four open-source LLMs and two benchmarks, vastly outperforming manual prompt injection (max 40.7% ASR) and jailbreak attacks (max 53.6% ASR). The attack transfers across models, with sequences optimized on Llama-3-8B achieving up to 99% ASR on Llama-2-13B and 79% on GPT-4. Three detection-based defenses (known-answer, PPL, PPL-W) are insufficient, with false negative rates of 40–100%.", 22 "checklist": { 23 "artifacts": { 24 "code_released": { 25 "applies": true, 26 "answer": true, 27 "justification": "The abstract states: 'Our implementation is available at this repository: https://github.com/ShiJiawenwen/JudgeDeceiver.'" 28 }, 29 "data_released": { 30 "applies": true, 31 "answer": true, 32 "justification": "The paper uses publicly available benchmarks MT-Bench and LLMBar. All 10 target question-response pairs for each benchmark are fully specified in the appendix (Figures 13–17). The code repository is provided for reconstruction of evaluation datasets." 33 }, 34 "environment_specified": { 35 "applies": true, 36 "answer": false, 37 "justification": "No requirements.txt, Dockerfile, conda environment file, or environment setup section is mentioned in the paper. Only model names and temperature settings are given." 
38 }, 39 "reproduction_instructions": { 40 "applies": true, 41 "answer": false, 42 "justification": "While Algorithm 1 describes the optimization process and a GitHub repo is provided, the paper does not include step-by-step reproduction instructions, scripts to replicate main experiments, or a 'Reproducing Results' section." 43 } 44 }, 45 "statistical_methodology": { 46 "confidence_intervals_or_error_bars": { 47 "applies": true, 48 "answer": false, 49 "justification": "All results in Tables 1–12 are reported as point estimates (e.g., '90.8% ASR') without confidence intervals, error bars, or ± notation." 50 }, 51 "significance_tests": { 52 "applies": true, 53 "answer": false, 54 "justification": "The paper claims JudgeDeceiver 'outperforms' all baselines by comparing raw ASR/PAC numbers (e.g., Table 2, Table 3) without any statistical significance tests such as p-values, t-tests, or bootstrap tests." 55 }, 56 "effect_sizes_reported": { 57 "applies": true, 58 "answer": true, 59 "justification": "The paper reports absolute improvements with baseline context, e.g., 'ASR improvements ranging from 37.2% to 78.7% on MTBench and 43.3% to 92.1% on LLMBar' (Section 4.2), and provides both ASR-B (baseline error rate) and ASR for interpreting improvement magnitude." 60 }, 61 "sample_size_justified": { 62 "applies": true, 63 "answer": false, 64 "justification": "The choice of 10 target QR pairs, 500 clean responses, 3 shadow candidate response sets, and 600 optimization iterations is not justified. No power analysis or rationale for these specific numbers is provided." 65 }, 66 "variance_reported": { 67 "applies": true, 68 "answer": false, 69 "justification": "While temperature=0 makes LLM generation deterministic, the optimization process in Algorithm 1 involves random token selection (step 9), yet no variance across optimization runs, standard deviations, or spread measures are reported." 
70 } 71 }, 72 "evaluation_design": { 73 "baselines_included": { 74 "applies": true, 75 "answer": true, 76 "justification": "The paper compares against 6 manual prompt injection attacks (naive, escape characters, context ignore, fake completion, combined, fake reasoning) in Table 2 and 4 jailbreak attacks (TAP, PAIR, AutoDAN, GCG) in Table 3." 77 }, 78 "baselines_contemporary": { 79 "applies": true, 80 "answer": true, 81 "justification": "Jailbreak baselines include GCG (2023), AutoDAN (2023), TAP (2023), and PAIR (2023), which are contemporary and represent state-of-the-art adversarial attack methods." 82 }, 83 "ablation_study": { 84 "applies": true, 85 "answer": true, 86 "justification": "Table 4 ablates the three loss terms one by one. Figure 6 ablates hyperparameters α and β. Additional ablations cover shadow response numbers (Figure 5), initialization types (Table 5), injected sequence locations (Table 6), and sequence length effects." 87 }, 88 "multiple_metrics": { 89 "applies": true, 90 "answer": true, 91 "justification": "Four evaluation metrics are defined and used: ACC (accuracy), ASR-B (baseline attack success rate), ASR (attack success rate), and PAC (positional attack consistency), as defined in Section 4.1.4." 92 }, 93 "human_evaluation": { 94 "applies": true, 95 "answer": false, 96 "justification": "All evaluation is automated through LLM-as-a-Judge outputs. No human evaluation of attack quality, detectability, or the injected sequences' naturalness is included." 97 }, 98 "held_out_test_set": { 99 "applies": true, 100 "answer": true, 101 "justification": "The injected sequence is optimized using shadow candidate responses (Section 3.2), while evaluation uses a separate set of 500 clean responses per benchmark generated by 10 different LLMs (Section 4.1.1). The evaluation responses are never seen during optimization." 
102 }, 103 "per_category_breakdown": { 104 "applies": true, 105 "answer": true, 106 "justification": "Tables 1a and 1b provide per-question-response pair breakdowns (QR-1 through QR-10) for all four metrics across all four models and both datasets." 107 }, 108 "failure_cases_discussed": { 109 "applies": true, 110 "answer": true, 111 "justification": "The paper discusses cases where the attack is less effective: QR-8 and QR-9 on Mistral-7B achieve only 71% and 77% ASR (Table 1a). The defense evaluation (Section 6) shows where defenses partially succeed. Figure 6 shows hyperparameter settings that degrade performance." 112 }, 113 "negative_results_reported": { 114 "applies": true, 115 "answer": true, 116 "justification": "Table 4 shows removing loss terms hurts performance. Figure 6 shows excessive α=10 drops ASR to 82% and β≥0.5 severely reduces ASR. Table 5 shows 'Character' initialization achieves only 70% ASR. The tradeoff between stealthiness (L_perplexity) and attack effectiveness is explicitly discussed." 117 } 118 }, 119 "claims_and_evidence": { 120 "abstract_claims_supported": { 121 "applies": true, 122 "answer": true, 123 "justification": "The abstract claims 'highly effective' (supported by 89–99% ASR in Table 1), 'much more effective than existing prompt injection attacks' (supported by Table 2 showing manual attacks max at 40.7%), and 'defenses are insufficient' (supported by Tables 11–12 showing 40–100% FNR). All claims are substantiated." 124 }, 125 "causal_claims_justified": { 126 "applies": true, 127 "answer": true, 128 "justification": "Causal claims about component contributions are supported by controlled ablation studies: Table 4 removes individual loss terms (single-variable manipulation), showing L_aligned and L_enhancement each reduce ASR when removed. This constitutes adequate controlled experimentation for the causal claims made." 
129 }, 130 "generalization_bounded": { 131 "applies": true, 132 "answer": false, 133 "justification": "The title says 'Prompt Injection Attack to LLM-as-a-Judge' without qualification. The threat model assumes open-source LLMs (Section 2.2), but the main evaluation is on four 7B–8B models. Transferability to larger/proprietary models is mixed (on GPT-4, ASR ranges from 5% for sequences optimized on Llama-2-7B to 79% for sequences optimized on Llama-3-8B, Table 7). The broad title and abstract do not bound claims to the tested model scale." 134 }, 135 "alternative_explanations_discussed": { 136 "applies": true, 137 "answer": false, 138 "justification": "The paper does not discuss alternative explanations for why the attack succeeds. No confounding factors are considered, such as whether the attack exploits specific model weaknesses vs. general LLM-as-a-Judge vulnerabilities, or whether the high ASR on some models (e.g., Llama-2-7B at 31% ASR-B baseline) reflects weak judging rather than strong attacks." 139 }, 140 "proxy_outcome_distinction": { 141 "applies": true, 142 "answer": true, 143 "justification": "The metrics (ASR, PAC) directly measure what is claimed — the attack success rate is defined as the probability of the target response being selected, and this is exactly what the paper claims to optimize. No proxy gap exists between measurement and framing." 144 } 145 }, 146 "setup_transparency": { 147 "model_versions_specified": { 148 "applies": true, 149 "answer": false, 150 "justification": "Several key models lack exact version identifiers. 'Mistral-7B-Instruct' does not specify v0.1 vs v0.2. 'gpt-3.5-turbo' lacks a snapshot date (changes over time). While some models are specific ('gpt-4-0125-preview', 'mistral-large-2407'), the main evaluation models are insufficiently specified." 151 }, 152 "prompts_provided": { 153 "applies": true, 154 "answer": true, 155 "justification": "The LLM-as-a-Judge prompt template is shown in Figure 2. Manual attack prompts are provided in Table 15. Shadow response generation prompts are in Table 14.
Target response generation prompt is in Figure 12. Detection prompts are given in Section 6." 156 }, 157 "hyperparameters_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 4.1.3 reports: temperature=0, 600 optimization iterations, suffix length=20 tokens, initial token='correct'. Default α and β values are explored in Figure 6. Top-K and batch size B parameters are described in Section 3.4." 161 }, 162 "scaffolding_described": { 163 "applies": false, 164 "answer": false, 165 "justification": "No agentic scaffolding is used. JudgeDeceiver is a direct gradient-based optimization attack, not an agentic system." 166 }, 167 "data_preprocessing_documented": { 168 "applies": true, 169 "answer": true, 170 "justification": "Section 4.1.1 documents: target question selection criteria (covering various topics), target response generation using GPT-3.5-turbo with manual selection of 'most unsuitable' response, clean response generation from 10 LLMs (10 per LLM per question, manually selecting 50 high-quality ones), and shadow response generation via rephrased prompts." 171 } 172 }, 173 "limitations_and_scope": { 174 "limitations_section_present": { 175 "applies": true, 176 "answer": false, 177 "justification": "There is no dedicated Limitations or Threats to Validity section. The conclusion (Section 8) mentions two future work directions in a single sentence but provides no substantive discussion of limitations." 178 }, 179 "threats_to_validity_specific": { 180 "applies": true, 181 "answer": false, 182 "justification": "No threats to validity are discussed. The paper does not address potential threats such as the small number of target QR pairs (10), reliance on specific prompt templates, or the gap between simulated scenarios and real deployments." 183 }, 184 "scope_boundaries_stated": { 185 "applies": true, 186 "answer": false, 187 "justification": "No explicit scope boundaries are stated. 
The paper does not explicitly state what the results do NOT show, such as limitations to specific model scales, the reliance on white-box access for optimization, or that real-world deployments may differ from the simulated settings." 188 } 189 }, 190 "data_integrity": { 191 "raw_data_available": { 192 "applies": true, 193 "answer": false, 194 "justification": "While code is released, the specific experimental outputs (all 500 clean responses per benchmark, optimization logs, raw model outputs for all experiments) are not mentioned as being available." 195 }, 196 "data_collection_described": { 197 "applies": true, 198 "answer": true, 199 "justification": "Section 4.1.1 describes data collection: target questions selected from MT-Bench and LLMBar covering diverse topics, target responses generated via GPT-3.5-turbo, clean responses generated by 10 named LLMs with manual quality selection. Case study data sources are also described (Sections 5.1–5.3)." 200 }, 201 "recruitment_methods_described": { 202 "applies": false, 203 "answer": false, 204 "justification": "No human participants. Data sources are standard benchmarks (MT-Bench, LLMBar) and LLM-generated responses." 205 }, 206 "data_pipeline_documented": { 207 "applies": true, 208 "answer": true, 209 "justification": "The pipeline is documented: benchmark selection → target question selection → target response generation (GPT-3.5-turbo + manual selection) → clean response generation (10 LLMs × 10 responses → manually select 50) → shadow response generation (5 prompts × 1 LLM). Each step and its filtering criteria are described." 210 } 211 }, 212 "conflicts_of_interest": { 213 "funding_disclosed": { 214 "applies": true, 215 "answer": false, 216 "justification": "No funding information, acknowledgments section, or grant numbers are mentioned in the paper." 
217 }, 218 "affiliations_disclosed": { 219 "applies": true, 220 "answer": true, 221 "justification": "All author affiliations are clearly listed: Huazhong University of Science and Technology, University of Notre Dame, Lehigh University, and Duke University." 222 }, 223 "funder_independent_of_outcome": { 224 "applies": true, 225 "answer": false, 226 "justification": "Since no funding source is disclosed, independence of the funder cannot be assessed." 227 }, 228 "financial_interests_declared": { 229 "applies": true, 230 "answer": false, 231 "justification": "No competing interests statement or financial interest declarations are present in the paper." 232 } 233 }, 234 "contamination": { 235 "training_cutoff_stated": { 236 "applies": true, 237 "answer": false, 238 "justification": "The paper does not state the training data cutoff dates for any of the four evaluated LLMs (Mistral-7B, Openchat-3.5, Llama-2-7B, Llama-3-8B). MT-Bench and LLMBar questions could be in their training data, affecting judging behavior." 239 }, 240 "train_test_overlap_discussed": { 241 "applies": true, 242 "answer": false, 243 "justification": "No discussion of whether the LLMs used as judges have seen MT-Bench or LLMBar data during pre-training. The models' familiarity with benchmark questions could confound both baseline accuracy (ACC) and attack success measurements." 244 }, 245 "benchmark_contamination_addressed": { 246 "applies": true, 247 "answer": false, 248 "justification": "MT-Bench (first released in 2023; proceedings version dated 2024) and LLMBar (2023) were published around the same time as, or before, the training of the evaluated models. The paper does not discuss whether exposure to these benchmarks affects the judging behavior being attacked." 249 } 250 }, 251 "human_studies": { 252 "pre_registered": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants in this study. All experiments involve automated LLM evaluation."
256 }, 257 "irb_or_ethics_approval": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants. The study involves only automated attacks on LLM systems." 261 }, 262 "demographics_reported": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants." 266 }, 267 "inclusion_exclusion_criteria": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants." 271 }, 272 "randomization_described": { 273 "applies": false, 274 "answer": false, 275 "justification": "No human participants." 276 }, 277 "blinding_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "No human participants." 281 }, 282 "attrition_reported": { 283 "applies": false, 284 "answer": false, 285 "justification": "No human participants." 286 } 287 }, 288 "cost_and_practicality": { 289 "inference_cost_reported": { 290 "applies": true, 291 "answer": false, 292 "justification": "No API costs, tokens consumed, wall-clock time, or cost per attack are reported. The optimization requires 600 iterations of gradient computation on LLMs, but no cost quantification is provided." 293 }, 294 "compute_budget_stated": { 295 "applies": true, 296 "answer": false, 297 "justification": "No GPU hours, hardware specifications, or total compute budget are stated. The paper mentions 'larger computational resource consumption and GPU memory requirements' for larger m (Section 4.3) but does not quantify these." 298 } 299 }, 300 "experimental_rigor": { 301 "seed_sensitivity_reported": { 302 "applies": true, 303 "answer": false, 304 "justification": "While temperature=0 makes LLM generation deterministic, the optimization in Algorithm 1 involves random token selection (step 9) and random subset sampling (step 8). No seed sensitivity analysis or results across multiple random seeds are reported." 
305 }, 306 "number_of_runs_stated": { 307 "applies": true, 308 "answer": false, 309 "justification": "The number of optimization runs per experiment is not explicitly stated. It is unclear whether each reported ASR comes from a single optimization run or is averaged over multiple runs." 310 }, 311 "hyperparameter_search_budget": { 312 "applies": true, 313 "answer": false, 314 "justification": "While Figure 6 explores α and β values and several ablation dimensions are covered, the total search budget (number of configurations tried, compute spent on search) is not reported." 315 }, 316 "best_config_selection_justified": { 317 "applies": true, 318 "answer": false, 319 "justification": "Default settings (α=1, β=0.1, suffix length=20) are used but the selection rationale is not clearly stated. Figure 6 shows performance landscapes but does not explain whether selection was based on a validation set or the test data itself." 320 }, 321 "multiple_comparison_correction": { 322 "applies": true, 323 "answer": false, 324 "justification": "The paper makes many comparative claims across 4 models, 2 datasets, 10 QR pairs, and 10+ baseline methods without any correction for multiple comparisons or family-wise error rate." 325 }, 326 "self_comparison_bias_addressed": { 327 "applies": true, 328 "answer": false, 329 "justification": "The authors implement all 6 manual prompt injection baselines themselves (Table 15) and extend 4 jailbreak attacks to their setting. No acknowledgment of potential author-evaluation bias in implementing and evaluating these baselines." 330 }, 331 "compute_budget_vs_performance": { 332 "applies": true, 333 "answer": false, 334 "justification": "JudgeDeceiver requires 600 iterations of gradient-based optimization per QR pair while manual attacks require no optimization. This compute asymmetry is not discussed. Table 3 shows injected sequence lengths but not the optimization cost to produce them." 
335 }, 336 "benchmark_construct_validity": { 337 "applies": true, 338 "answer": false, 339 "justification": "MT-Bench and LLMBar are used without questioning whether they adequately represent real-world LLM-as-a-Judge deployment scenarios. The constructed evaluation datasets (10 QR pairs, 500 clean responses) are assumed to be representative without validation." 340 }, 341 "scaffold_confound_addressed": { 342 "applies": false, 343 "answer": false, 344 "justification": "No agentic scaffolding is involved. The attack directly optimizes token sequences via gradient descent." 345 } 346 }, 347 "data_leakage": { 348 "temporal_leakage_addressed": { 349 "applies": true, 350 "answer": false, 351 "justification": "Not discussed. MT-Bench and LLMBar were published before or around the time the evaluated models were trained, creating potential temporal leakage in the models' judging behavior." 352 }, 353 "feature_leakage_addressed": { 354 "applies": true, 355 "answer": false, 356 "justification": "Not discussed. The evaluation setup provides the LLM with the full question and both responses, which mirrors the intended use case, but no analysis of whether this setup inadvertently leaks information is provided." 357 }, 358 "non_independence_addressed": { 359 "applies": true, 360 "answer": false, 361 "justification": "Not discussed. The shadow responses used for optimization and the clean responses used for evaluation are generated by potentially overlapping LLMs, which could introduce dependencies." 362 }, 363 "leakage_detection_method": { 364 "applies": true, 365 "answer": false, 366 "justification": "No leakage detection or prevention method is used. No canary strings, membership inference tests, or decontamination pipelines are applied." 
367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "JudgeDeceiver achieves average attack success rates of 89–99% across four LLMs on two benchmarks.", 373 "evidence": "Table 1 shows average ASRs: Mistral-7B 90.8%/93.2%, Openchat-3.5 89.2%/88%, Llama-2-7B 98.9%/98.1%, Llama-3-8B 97.6%/97% on MT-Bench/LLMBar respectively (Section 4.2).", 374 "supported": "strong" 375 }, 376 { 377 "claim": "JudgeDeceiver significantly outperforms manual prompt injection attacks, which achieve a maximum ASR of no more than 40.7%.", 378 "evidence": "Table 2 compares JudgeDeceiver against 6 manual attacks across 4 models and 2 datasets. The highest manual ASR is 40.7% (Fake Reasoning on Llama-2-7B LLMBar) vs JudgeDeceiver's 98.1% (Section 4.2).", 379 "supported": "moderate" 380 }, 381 { 382 "claim": "JudgeDeceiver outperforms jailbreak attacks when adapted to the LLM-as-a-Judge problem.", 383 "evidence": "Table 3 compares against TAP, PAIR, AutoDAN, and GCG on Mistral-7B. JudgeDeceiver achieves 90.8% ASR vs AutoDAN's 53.6% (best jailbreak) on MT-Bench (Section 4.2).", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Known-answer detection cannot detect target responses with JudgeDeceiver's injected sequences (90–100% FNR).", 388 "evidence": "Table 11 on Mistral-7B shows FNR of 90% on MT-Bench and 100% on LLMBar for known-answer detection, with 0% FPR (Section 6.4).", 389 "supported": "strong" 390 }, 391 { 392 "claim": "Perplexity-based defenses (PPL and PPL-W) are insufficient, missing 40–90% of attacked responses.", 393 "evidence": "Tables 11–12 show FNRs: PPL detection 50–80%, PPL-W detection 40–90% across models and datasets, while maintaining near-zero FPR (Section 6.4).", 394 "supported": "strong" 395 }, 396 { 397 "claim": "The attack transfers across LLMs, with sequences optimized on Llama-3-8B achieving high ASR on other models.", 398 "evidence": "Table 7 shows Llama-3-8B optimized sequences achieve 99% ASR on Llama-2-13B, 91% on Llama-3-70B, 88% on Claude3-Haiku, 79% on 
GPT-4. Llama-2-7B sequences transfer less well (e.g., 5% on GPT-4) (Section 4.3).", 399 "supported": "moderate" 400 }, 401 { 402 "claim": "All three loss terms (L_aligned, L_enhancement, L_perplexity) contribute to attack effectiveness.", 403 "evidence": "Table 4 shows removing L_aligned drops ASR from 97% to 87%, removing L_enhancement drops ASR to 84%, and removing L_perplexity increases ASR slightly to 98% but at the cost of stealthiness (Section 4.3).", 404 "supported": "strong" 405 }, 406 { 407 "claim": "JudgeDeceiver is effective in three real-world scenarios: LLM-powered search (80–100% ASR), RLAIF (95–100% ASR), and tool selection (80–100% ASR).", 408 "evidence": "Tables 8, 9, and 10 show results across scenario-specific evaluation sets on Mistral-7B (Section 5).", 409 "supported": "moderate" 410 } 411 ], 412 "red_flags": [ 413 { 414 "flag": "No statistical tests for superiority claims", 415 "detail": "All claims that JudgeDeceiver 'outperforms' baselines are based on comparing raw ASR/PAC numbers across Tables 2–3 without any statistical significance tests. Given the stochastic optimization process, results could vary across runs." 416 }, 417 { 418 "flag": "No error bars or variance across optimization runs", 419 "detail": "Despite the optimization process involving random token selection (Algorithm 1, step 9), all results are reported as single point estimates. The stability of the attack across different random seeds is unknown." 420 }, 421 { 422 "flag": "Self-implemented baselines", 423 "detail": "The authors implement all 6 manual prompt injection baselines and adapt all 4 jailbreak attacks to their setting. The implementations may not represent the best configurations of these methods. No acknowledgment of potential author-evaluation bias." 424 }, 425 { 426 "flag": "No limitations section", 427 "detail": "The paper lacks any dedicated discussion of limitations, threats to validity, or scope boundaries. 
Key limitations such as reliance on white-box model access, computational cost of optimization, small number of target QR pairs (10), and gap between simulated and real deployments are unaddressed." 428 }, 429 { 430 "flag": "Compute cost asymmetry ignored", 431 "detail": "JudgeDeceiver requires 600 iterations of gradient-based optimization per target QR pair on open-source LLMs, while manual baselines require no computation. This asymmetry is never quantified or discussed, making the comparison unfair without cost context." 432 }, 433 { 434 "flag": "Case studies simulate rather than test real deployments", 435 "detail": "The three 'real-world' case studies (LLM-powered search, RLAIF, tool selection) use simulated setups with Mistral-7B, not actual deployed systems. Bing Chat and Bard use proprietary models with unknown defenses, making generalization to real systems uncertain." 436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "GPT-4 Technical Report", 441 "authors": ["Josh Achiam", "Steven Adler", "Sandhini Agarwal"], 442 "year": 2023, 443 "arxiv_id": "2303.08774", 444 "relevance": "Foundational LLM used as clean response generator and transferability target in the evaluation." 445 }, 446 { 447 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 448 "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"], 449 "year": 2023, 450 "arxiv_id": "2307.15043", 451 "relevance": "GCG attack method used as both a baseline and foundation for gradient-based token optimization in JudgeDeceiver." 452 }, 453 { 454 "title": "Formalizing and Benchmarking Prompt Injection Attacks and Defenses", 455 "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 456 "year": 2024, 457 "relevance": "Standardized framework for prompt injection attacks; provides combined attack baseline and detection-based defense methods evaluated in this paper." 
458 }, 459 { 460 "title": "Not What You've Signed Up For: Compromising Real-World LLM-Integrated Applications with Indirect Prompt Injection", 461 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"], 462 "year": 2023, 463 "arxiv_id": "2302.12173", 464 "relevance": "Foundational work on indirect prompt injection attacks against LLM-integrated applications, directly relevant to the threat model." 465 }, 466 { 467 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 468 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 469 "year": 2024, 470 "relevance": "Introduces the LLM-as-a-Judge concept and MT-Bench dataset used as the primary evaluation benchmark in this paper." 471 }, 472 { 473 "title": "RLAIF: Scaling Reinforcement Learning from Human Feedback with AI Feedback", 474 "authors": ["Harrison Lee", "Samrat Phatale", "Hassan Mansoor"], 475 "year": 2023, 476 "arxiv_id": "2309.00267", 477 "relevance": "Introduces RLAIF paradigm that uses LLM-as-a-Judge for automated preference annotation, one of three case studies demonstrating attack impact." 478 }, 479 { 480 "title": "AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models", 481 "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"], 482 "year": 2023, 483 "arxiv_id": "2310.04451", 484 "relevance": "Jailbreak attack baseline using hierarchical genetic algorithm; most competitive jailbreak method in the comparison (53.6% ASR)." 485 }, 486 { 487 "title": "MetaGPT: Meta Programming for Multi-Agent Collaborative Framework", 488 "authors": ["Sirui Hong", "Xiawu Zheng", "Jonathan Chen"], 489 "year": 2023, 490 "arxiv_id": "2308.00352", 491 "relevance": "Multi-agent framework using tool selection via LLM-as-a-Judge, motivating the tool selection attack scenario." 
492 }, 493 { 494 "title": "PLeak: Prompt Leaking Attacks against Large Language Model Applications", 495 "authors": ["Bo Hui", "Haolin Yuan", "Neil Gong", "Philippe Burlina", "Yinzhi Cao"], 496 "year": 2024, 497 "relevance": "Related prompt-level attack on LLM applications, demonstrating the broader vulnerability surface of LLM-integrated systems." 498 }, 499 { 500 "title": "StruQ: Defending against Prompt Injection with Structured Queries", 501 "authors": ["Sizhe Chen", "Julien Piet", "Chawin Sitawarin", "David Wagner"], 502 "year": 2024, 503 "arxiv_id": "2402.06363", 504 "relevance": "Defense approach using structured queries to prevent prompt injection, relevant to developing mitigations against attacks like JudgeDeceiver." 505 }, 506 { 507 "title": "Llama 2: Open Foundation and Fine-tuned Chat Models", 508 "authors": ["Hugo Touvron", "Louis Martin", "Kevin Stone"], 509 "year": 2023, 510 "arxiv_id": "2307.09288", 511 "relevance": "Open-source LLM family used as both evaluation target and transferability source in the attack evaluation." 512 }, 513 { 514 "title": "MetaTool Benchmark: Deciding Whether to Use Tools and Which to Use", 515 "authors": ["Yue Huang", "Jiawen Shi", "Yuan Li"], 516 "year": 2023, 517 "relevance": "Benchmark for LLM tool usage awareness, used as the evaluation dataset for the tool selection attack case study." 518 }, 519 { 520 "title": "Tree of Attacks: Jailbreaking Black-Box LLMs Automatically", 521 "authors": ["Anay Mehrotra", "Manolis Zampetakis", "Paul Kassianik"], 522 "year": 2023, 523 "arxiv_id": "2312.02119", 524 "relevance": "Tree-based jailbreak attack method used as a baseline, demonstrating limitations of black-box approaches for this attack setting." 525 } 526 ], 527 "engagement_factors": { 528 "practical_relevance": { 529 "score": 2, 530 "justification": "Directly relevant to anyone deploying LLM-as-a-Judge for search, RLAIF, or tool selection, but requires white-box access to open-source models for optimization." 
531 }, 532 "surprise_contrarian": { 533 "score": 1, 534 "justification": "Prompt injection attacks are well-known; the contribution is demonstrating optimization-based attacks are far more effective than manual ones in the LLM-as-a-Judge setting, which extends rather than overturns expectations." 535 }, 536 "fear_safety": { 537 "score": 3, 538 "justification": "Demonstrates 90%+ attack success on LLM evaluation systems with three defenses proven insufficient, raising serious concerns about the integrity of LLM leaderboards, RLAIF training, and agent tool selection." 539 }, 540 "drama_conflict": { 541 "score": 1, 542 "justification": "Implies LLM leaderboard rankings could be manipulated, but does not directly accuse any specific entity or call out existing benchmark results as fraudulent." 543 }, 544 "demo_ability": { 545 "score": 2, 546 "justification": "Code released on GitHub (https://github.com/ShiJiawenwen/JudgeDeceiver), but running it requires open-source LLMs with GPU access for gradient computation." 547 }, 548 "brand_recognition": { 549 "score": 1, 550 "justification": "Authors from HUST, Notre Dame, Lehigh, and Duke — well-regarded universities but not major AI labs. Published at CCS, a top security venue." 551 } 552 } 553 }