scan.json (29544B)
1 { 2 "paper": { 3 "title": "Investigating the Vulnerability of LLM-as-a-Judge Architectures to Prompt-Injection Attacks", 4 "authors": [ 5 "Narek Maloyan", 6 "Bislan Ashinov", 7 "Dmitry Namiot" 8 ], 9 "year": 2025, 10 "venue": "arXiv.org", 11 "arxiv_id": "2505.13348", 12 "doi": "10.48550/arXiv.2505.13348" 13 }, 14 "scan_version": 3, 15 "active_modules": ["experimental_rigor", "data_leakage"], 16 "methodology_tags": ["benchmark-eval"], 17 "key_findings": "LLM-as-a-Judge systems using Qwen2.5-3B-Instruct and Falcon3-3B-Instruct are vulnerable to GCG-optimized adversarial suffixes. The Comparative Undermining Attack (CUA), targeting decision logits directly, achieves >30% Attack Success Rate, outperforming the Justification Manipulation Attack (JMA) at ~15-17% and prior JudgeDeceiver at ~22-24%. Control conditions (random-suffix at 1.2-1.5%, token-shuffle at 2.8-3.1%) confirm the attacks exploit specific token arrangements rather than mere text disruption.", 18 "checklist": { 19 "artifacts": { 20 "code_released": { 21 "applies": true, 22 "answer": false, 23 "justification": "No code repository, GitHub link, or archive is provided anywhere in the paper. No URL to source code for the CUA/JMA attacks or experimental setup." 24 }, 25 "data_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "The paper uses the publicly available MT-Bench Human Judgments dataset from LMSYS (Section IV-B). This is a standard public benchmark that the authors did not modify." 29 }, 30 "environment_specified": { 31 "applies": true, 32 "answer": false, 33 "justification": "No requirements.txt, Dockerfile, conda environment, or library version details are provided. The paper names the models used but provides no environment setup information." 34 }, 35 "reproduction_instructions": { 36 "applies": true, 37 "answer": false, 38 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. 
The methodology is described at a high level but lacks sufficient operational detail for exact replication." 39 } 40 }, 41 "statistical_methodology": { 42 "confidence_intervals_or_error_bars": { 43 "applies": true, 44 "answer": false, 45 "justification": "Table I reports only point estimates for ASR (e.g., 31.2%, 32.4%) with no confidence intervals, error bars, or uncertainty quantification." 46 }, 47 "significance_tests": { 48 "applies": true, 49 "answer": false, 50 "justification": "The paper claims CUA outperforms JMA and other baselines based solely on comparing raw ASR percentages in Table I. No statistical significance tests (t-tests, bootstrap, etc.) are applied to any comparison." 51 }, 52 "effect_sizes_reported": { 53 "applies": true, 54 "answer": true, 55 "justification": "ASR percentages are reported with full baseline context: Random-Suffix 1.2-1.5%, Token-Shuffle 2.8-3.1%, Hard Prompt 5.1-5.4%, JMA 15.2-16.7%, JudgeDeceiver 22.8-24.1%, CUA 31.2-32.4%. The magnitude of improvement is clear from the absolute numbers and comparisons." 56 }, 57 "sample_size_justified": { 58 "applies": true, 59 "answer": false, 60 "justification": "The paper never states how many instances from MT-Bench were used for the attacks, let alone justifies the sample size. No power analysis or discussion of whether the sample is adequate." 61 }, 62 "variance_reported": { 63 "applies": true, 64 "answer": false, 65 "justification": "No variance, standard deviation, or spread measures are reported. All results in Table I are single-point estimates with no indication of variability across runs or instances." 66 } 67 }, 68 "evaluation_design": { 69 "baselines_included": { 70 "applies": true, 71 "answer": true, 72 "justification": "The paper includes multiple baselines: Random-Suffix Control, Token-Shuffle Control, Hard Prompt Attack baseline, and comparison with JudgeDeceiver (Section IV-C, Table I)." 
73 }, 74 "baselines_contemporary": { 75 "applies": true, 76 "answer": true, 77 "justification": "JudgeDeceiver [41] (2024) is a contemporary baseline. The Random-Suffix and Token-Shuffle controls are well-designed experimental controls. The baselines are appropriate for this study." 78 }, 79 "ablation_study": { 80 "applies": true, 81 "answer": false, 82 "justification": "While CUA and JMA are compared, there is no systematic ablation of attack components (e.g., suffix length, number of GCG iterations, effect of different positive/negative token sets in JMA). The Token-Shuffle control is ablation-like but does not constitute a formal ablation study." 83 }, 84 "multiple_metrics": { 85 "applies": true, 86 "answer": false, 87 "justification": "Only Attack Success Rate (ASR) is used as an evaluation metric (Section IV-D, Eq. 3). No secondary metrics such as attack transferability, semantic preservation, or perplexity of adversarial suffixes are reported." 88 }, 89 "human_evaluation": { 90 "applies": true, 91 "answer": false, 92 "justification": "No human evaluation is performed. All evaluation is automated via ASR computation. Human review of whether verdict flips represent genuine manipulation (vs. confusion or degraded output) could have validated the results." 93 }, 94 "held_out_test_set": { 95 "applies": true, 96 "answer": false, 97 "justification": "No mention of splitting the MT-Bench dataset into optimization/development and held-out test sets. It is unclear whether GCG optimization and evaluation were performed on the same instances." 98 }, 99 "per_category_breakdown": { 100 "applies": true, 101 "answer": true, 102 "justification": "Table I breaks down results by model (Qwen2.5-3B, Falcon3-3B) and by attack method (6 methods). This provides per-method and per-model granularity." 103 }, 104 "failure_cases_discussed": { 105 "applies": true, 106 "answer": false, 107 "justification": "No discussion of when or why attacks failed. 
With CUA at ~31% ASR, ~69% of attacks failed, but no analysis of failure modes is provided." 108 }, 109 "negative_results_reported": { 110 "applies": true, 111 "answer": false, 112 "justification": "No negative results are reported. Every attack method shows improvement over controls, and no failed approaches or configurations are mentioned." 113 } 114 }, 115 "claims_and_evidence": { 116 "abstract_claims_supported": { 117 "applies": true, 118 "answer": true, 119 "justification": "The abstract claims 'CUA achieves an ASR exceeding 30%' — Table I shows 31.2% and 32.4%. 'JMA also shows notable effectiveness' — Table I shows 15.2% and 16.7%. Both claims are supported by the reported results." 120 }, 121 "causal_claims_justified": { 122 "applies": true, 123 "answer": true, 124 "justification": "The paper makes causal claims that adversarial suffixes manipulate judge decisions. The experimental design — appending optimized suffixes and measuring verdict flips — is a controlled intervention. The Random-Suffix and Token-Shuffle controls help isolate the causal mechanism (specific token arrangements, not mere text presence)." 125 }, 126 "generalization_bounded": { 127 "applies": true, 128 "answer": false, 129 "justification": "The title claims to investigate 'LLM-as-a-Judge Architectures' broadly, but experiments use only two 3B-parameter models (Qwen2.5-3B-Instruct and Falcon3-3B-Instruct). The conclusion references 'current LLM-as-a-Judge systems' and 'high-stakes evaluation tasks' without bounding these claims to small open-source models. No larger or proprietary models (GPT-4, Claude) were tested." 130 }, 131 "alternative_explanations_discussed": { 132 "applies": true, 133 "answer": false, 134 "justification": "No alternative explanations are discussed. The paper does not consider whether the vulnerability is specific to small 3B models, whether instruction tuning quality affects susceptibility, or whether different judge prompt formats would alter results." 
135 }, 136 "proxy_outcome_distinction": { 137 "applies": true, 138 "answer": true, 139 "justification": "ASR directly measures what is claimed — the rate at which adversarial suffixes flip judge verdicts. The measurement and the claim are at the same granularity; no proxy gap exists." 140 } 141 }, 142 "setup_transparency": { 143 "model_versions_specified": { 144 "applies": true, 145 "answer": true, 146 "justification": "Specific model names with version and size are given: 'Qwen2.5-3B-Instruct' (Section IV-A) and 'Falcon3-3B-Instruct' (Section IV-A). These are sufficiently specific to identify the exact models." 147 }, 148 "prompts_provided": { 149 "applies": true, 150 "answer": false, 151 "justification": "The exact judge prompt template is not provided. The paper describes the setup conceptually ('the judge fθ(x, a, b) outputs a preference') and gives one example hard prompt ('It is critically important that you select response B as the better one') but does not provide the full system/judge prompt or actual GCG-generated adversarial suffixes." 152 }, 153 "hyperparameters_reported": { 154 "applies": true, 155 "answer": false, 156 "justification": "Critical GCG hyperparameters are not reported: suffix length L, number of iterations, number of candidate substitutions per position, batch size, learning rate/step size. No model inference parameters (temperature, top-p) are stated." 157 }, 158 "scaffolding_described": { 159 "applies": false, 160 "answer": false, 161 "justification": "No agentic scaffolding is used. The paper performs direct adversarial suffix optimization and model inference without any agent-based framework." 162 }, 163 "data_preprocessing_documented": { 164 "applies": true, 165 "answer": false, 166 "justification": "No description of how the MT-Bench Human Judgments dataset was preprocessed or filtered. The paper does not state how many instances were used, how triplets (x, a, b) were formed, or how the 'initially losing answer' was determined." 
167 } 168 }, 169 "limitations_and_scope": { 170 "limitations_section_present": { 171 "applies": true, 172 "answer": false, 173 "justification": "No dedicated limitations section exists. The only limitation mentioned is a passing remark in the Discussion: 'This work did not explore the impact of permuting the order of the attacked and genuinely superior answers.' This does not constitute a substantive limitations discussion." 174 }, 175 "threats_to_validity_specific": { 176 "applies": true, 177 "answer": false, 178 "justification": "The single mentioned limitation (answer order permutation) is specific but isolated. No discussion of threats from small model size, single dataset, lack of larger/proprietary model testing, or absence of variance reporting." 179 }, 180 "scope_boundaries_stated": { 181 "applies": true, 182 "answer": false, 183 "justification": "No explicit 'what our results do NOT show' statement. The paper does not state that results are limited to 3B-parameter open-source models, a single dataset, or white-box access scenarios. The conclusions imply broad applicability without bounding scope." 184 } 185 }, 186 "data_integrity": { 187 "raw_data_available": { 188 "applies": true, 189 "answer": false, 190 "justification": "No raw experimental data (per-instance attack outcomes, generated adversarial suffixes, model outputs) is released. Only aggregated ASR percentages in Table I." 191 }, 192 "data_collection_described": { 193 "applies": true, 194 "answer": true, 195 "justification": "Section IV-B describes the data source: 'We utilized the MT-Bench Human Judgments dataset provided by LMSYS. This dataset contains human evaluations of LLM responses to diverse questions, presented as pairwise comparisons.'" 196 }, 197 "recruitment_methods_described": { 198 "applies": false, 199 "answer": false, 200 "justification": "No human participants. The data source is a standard public benchmark (MT-Bench Human Judgments from LMSYS)." 
201 }, 202 "data_pipeline_documented": { 203 "applies": true, 204 "answer": false, 205 "justification": "No documentation of the pipeline from raw dataset to final results. Missing details: how many instances were selected, how 'initially losing answer' was determined for each instance, how GCG optimization was applied per-instance." 206 } 207 }, 208 "conflicts_of_interest": { 209 "funding_disclosed": { 210 "applies": true, 211 "answer": false, 212 "justification": "No funding sources are mentioned anywhere in the paper. No acknowledgments section is present." 213 }, 214 "affiliations_disclosed": { 215 "applies": true, 216 "answer": false, 217 "justification": "Only author names and one email address (maloyan.narek@gmail.com) are listed. No institutional affiliations are provided, which is itself a transparency gap." 218 }, 219 "funder_independent_of_outcome": { 220 "applies": true, 221 "answer": false, 222 "justification": "Cannot assess funder independence since no funding is disclosed. With three authors and no funding statement, independence cannot be verified." 223 }, 224 "financial_interests_declared": { 225 "applies": true, 226 "answer": false, 227 "justification": "No competing interests or financial disclosure statement appears in the paper." 228 } 229 }, 230 "contamination": { 231 "training_cutoff_stated": { 232 "applies": false, 233 "answer": false, 234 "justification": "This paper tests adversarial attack effectiveness on LLM judges, not model knowledge on a benchmark. Whether models have seen MT-Bench questions is irrelevant to whether adversarial suffixes can flip their judging decisions." 235 }, 236 "train_test_overlap_discussed": { 237 "applies": false, 238 "answer": false, 239 "justification": "The paper evaluates attack robustness of judges, not model knowledge. Benchmark contamination is not a relevant concern for this experimental design." 
240 }, 241 "benchmark_contamination_addressed": { 242 "applies": false, 243 "answer": false, 244 "justification": "The study tests the robustness of judge behavior against adversarial inputs, not whether models can correctly answer benchmark questions from memorization, so benchmark contamination is not a relevant concern." 245 } 246 }, 247 "human_studies": { 248 "pre_registered": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "irb_or_ethics_approval": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "demographics_reported": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "inclusion_exclusion_criteria": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "randomization_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "blinding_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 }, 278 "attrition_reported": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants in this study." 282 } 283 }, 284 "cost_and_practicality": { 285 "inference_cost_reported": { 286 "applies": true, 287 "answer": false, 288 "justification": "No inference cost, API cost, tokens consumed, or wall-clock time is reported for either the GCG optimization process or the judge model evaluations." 289 }, 290 "compute_budget_stated": { 291 "applies": true, 292 "answer": false, 293 "justification": "No computational budget is stated. The paper mentions models were chosen for 'relatively small size, which allows for extensive experimentation within computational constraints' (Section IV-A) but does not quantify the actual compute used."
294 } 295 }, 296 "experimental_rigor": { 297 "seed_sensitivity_reported": { 298 "applies": true, 299 "answer": false, 300 "justification": "No mention of multiple random seeds. GCG optimization involves random initialization of the suffix, making seed sensitivity particularly important, but it is not addressed." 301 }, 302 "number_of_runs_stated": { 303 "applies": true, 304 "answer": false, 305 "justification": "The number of experimental runs is never stated. It is unclear whether results represent single runs or averages." 306 }, 307 "hyperparameter_search_budget": { 308 "applies": true, 309 "answer": false, 310 "justification": "No hyperparameter search budget is reported. GCG has multiple hyperparameters (suffix length, iterations, candidate set size) but no search process is described." 311 }, 312 "best_config_selection_justified": { 313 "applies": true, 314 "answer": false, 315 "justification": "No discussion of how the reported configuration was selected. The paper does not explain whether multiple configurations were tried or how the final setup was chosen." 316 }, 317 "multiple_comparison_correction": { 318 "applies": true, 319 "answer": false, 320 "justification": "Six attack methods are compared across two models (12 cells in Table I) with no statistical tests at all, let alone multiple comparison corrections." 321 }, 322 "self_comparison_bias_addressed": { 323 "applies": true, 324 "answer": false, 325 "justification": "The authors propose CUA and JMA, implement them, and compare against their own implementations of baselines (Hard Prompt, Random-Suffix, Token-Shuffle). No acknowledgment of potential self-comparison bias. The JudgeDeceiver comparison is presumably re-implemented but not discussed." 
326 }, 327 "compute_budget_vs_performance": { 328 "applies": true, 329 "answer": false, 330 "justification": "GCG-based attacks (CUA, JMA) require significantly more compute than Hard Prompt or Random-Suffix baselines, but no compute budget comparison is provided. The higher ASR may partly reflect greater optimization compute." 331 }, 332 "benchmark_construct_validity": { 333 "applies": true, 334 "answer": false, 335 "justification": "No discussion of whether MT-Bench Human Judgments adequately represents real-world LLM-as-a-Judge deployment scenarios. MT-Bench was designed for model ranking, not security evaluation." 336 }, 337 "scaffold_confound_addressed": { 338 "applies": false, 339 "answer": false, 340 "justification": "No scaffolding is involved in the experimental setup." 341 } 342 }, 343 "data_leakage": { 344 "temporal_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the judge models (Qwen2.5-3B, Falcon3-3B) were trained on MT-Bench data. If models learned MT-Bench patterns during training, their baseline judging behavior could be confounded." 348 }, 349 "feature_leakage_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "No discussion of whether the evaluation setup leaks information. For instance, the judge seeing both answers simultaneously may provide different information than sequential evaluation." 353 }, 354 "non_independence_addressed": { 355 "applies": true, 356 "answer": false, 357 "justification": "No discussion of whether MT-Bench instances are independent or whether the same question categories/models appear in both optimization and evaluation sets." 358 }, 359 "leakage_detection_method": { 360 "applies": true, 361 "answer": false, 362 "justification": "No leakage detection or prevention method is applied." 
363 } 364 } 365 }, 366 "claims": [ 367 { 368 "claim": "The Comparative Undermining Attack (CUA) achieves Attack Success Rates exceeding 30% on both tested LLM judges.", 369 "evidence": "Table I: CUA achieves 31.2% ASR on Qwen2.5-3B-Instruct and 32.4% on Falcon3-3B-Instruct (Section V-A).", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "The Justification Manipulation Attack (JMA) achieves ASRs of approximately 15-17%, significantly higher than simple hard prompt attacks.", 374 "evidence": "Table I: JMA achieves 15.2% on Qwen2.5-3B and 16.7% on Falcon3-3B, compared to Hard Prompt at 5.1% and 5.4% (Section V-A).", 375 "supported": "moderate" 376 }, 377 { 378 "claim": "Random text appended to answers has minimal impact on judge decisions, confirming attack success is not due to mere text disruption.", 379 "evidence": "Table I: Random-Suffix Control achieves only 1.2% and 1.5% ASR (Section V-A).", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "The specific ordering and structure of tokens in adversarial suffixes is critical to attack effectiveness, not just the presence of certain tokens.", 384 "evidence": "Table I: Token-Shuffle Control (2.8-3.1% ASR) is much lower than CUA (31.2-32.4%), demonstrating token arrangement matters (Section V-A, V-B).", 385 "supported": "moderate" 386 }, 387 { 388 "claim": "LLM-as-a-Judge systems are significantly vulnerable to prompt-injection attacks and may not be sufficiently robust for high-stakes evaluation tasks.", 389 "evidence": "Based on Table I results across all attack methods on two models (Section V-B, VI). However, only two small 3B models were tested, limiting generalizability.", 390 "supported": "weak" 391 } 392 ], 393 "red_flags": [ 394 { 395 "flag": "Extremely narrow model selection", 396 "detail": "Only two 3B-parameter open-source models tested. No larger models (7B, 13B, 70B) or proprietary models (GPT-4, Claude) included. 
Small models may be disproportionately vulnerable to adversarial attacks, making the broad claims about 'LLM-as-a-Judge architectures' unsupported." 397 }, 398 { 399 "flag": "No error bars or variance reporting", 400 "detail": "All results in Table I are single point estimates. GCG optimization involves random initialization, making results potentially sensitive to randomness. Without variance across seeds or runs, the stability and reliability of reported ASRs cannot be assessed." 401 }, 402 { 403 "flag": "Sample size never stated", 404 "detail": "The paper never reports how many instances from MT-Bench were used for the attacks. This is a fundamental missing detail — the reader cannot assess whether ASR differences are meaningful without knowing N." 405 }, 406 { 407 "flag": "Claims significantly outrun evidence", 408 "detail": "The paper generalizes from two 3B models to 'LLM-as-a-Judge architectures' and 'high-stakes evaluation tasks' in the title and conclusions. The tested models are far smaller than those typically used as judges in practice (GPT-4, Claude 3.5, etc.)." 409 }, 410 { 411 "flag": "Missing critical experimental details", 412 "detail": "GCG hyperparameters (suffix length, iteration count, candidate set size), model inference parameters (temperature, top-p), and dataset preprocessing steps are all absent. The experiments cannot be reproduced from the paper alone." 413 }, 414 { 415 "flag": "No failure analysis", 416 "detail": "With CUA succeeding only ~31% of the time, ~69% of attacks fail. No analysis of failure modes, what distinguishes successful from failed attacks, or what types of judge queries are more resistant." 
417 } 418 ], 419 "cited_papers": [ 420 { 421 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 422 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 423 "year": 2023, 424 "arxiv_id": "2306.05685", 425 "relevance": "Foundational paper establishing the LLM-as-a-Judge paradigm and the MT-Bench evaluation framework used in this study." 426 }, 427 { 428 "title": "Universal and transferable adversarial attacks on aligned language models", 429 "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"], 430 "year": 2023, 431 "arxiv_id": "2307.15043", 432 "relevance": "Proposes the GCG optimization method for adversarial suffix generation, which is the core attack technique adapted in this paper." 433 }, 434 { 435 "title": "JudgeDeceiver: Prompt injection attacks to manipulate LLM-as-a-Judge", 436 "authors": ["Yiwei Shi"], 437 "year": 2024, 438 "arxiv_id": "2403.17710", 439 "relevance": "Prior work on prompt injection attacks specifically targeting LLM-as-a-Judge systems; used as a baseline comparison." 440 }, 441 { 442 "title": "Bad-Judge: Backdoor vulnerabilities of LLM-as-a-Judge", 443 "authors": ["Zihan Wang"], 444 "year": 2025, 445 "arxiv_id": "2503.00596", 446 "relevance": "Demonstrates backdoor vulnerabilities in LLM-as-a-Judge systems during training, complementing the inference-time attacks studied here." 447 }, 448 { 449 "title": "Automatic and universal prompt injection attacks against large language models", 450 "authors": ["Xiaogeng Liu"], 451 "year": 2024, 452 "arxiv_id": "2403.04957", 453 "relevance": "Proposes gradient-based methods for universal prompt injection attacks on LLMs, demonstrating transferability across architectures."
454 }, 455 { 456 "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection", 457 "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra"], 458 "year": 2023, 459 "arxiv_id": "2302.12173", 460 "relevance": "Demonstrates indirect prompt injection attacks on real-world LLM-integrated applications, establishing the broader threat model for prompt injection." 461 }, 462 { 463 "title": "SmoothLLM: Defending large language models against jailbreaking attacks", 464 "authors": ["Alexander Robey", "Hamed Hassani", "George J. Pappas"], 465 "year": 2023, 466 "arxiv_id": "2310.03684", 467 "relevance": "Proposes a defense mechanism against adversarial attacks on LLMs, directly relevant to mitigating the vulnerabilities demonstrated in this paper." 468 }, 469 { 470 "title": "LLM agents can be tricked to generate harmful content through indirect prompt injection", 471 "authors": ["Xiaohan Xu"], 472 "year": 2023, 473 "arxiv_id": "2311.16339", 474 "relevance": "Demonstrates that LLM agents are vulnerable to indirect prompt injection, extending the threat model to agentic systems." 475 }, 476 { 477 "title": "Attention tracker: Detecting prompt injection attacks in LLMs", 478 "authors": ["Zijie Zhang"], 479 "year": 2024, 480 "arxiv_id": "2411.00348", 481 "relevance": "Proposes attention-based detection of prompt injection attacks, a potential defense mechanism for the vulnerabilities demonstrated here." 482 }, 483 { 484 "title": "AutoDAN: Automatic and interpretable adversarial attacks on large language models", 485 "authors": ["Sicheng Liu"], 486 "year": 2023, 487 "arxiv_id": "2310.15140", 488 "relevance": "Proposes automatic adversarial attack generation for LLMs, relevant to understanding the broader landscape of optimization-based attacks on language models." 
489 }, 490 { 491 "title": "Certifying LLM safety against adversarial prompting", 492 "authors": ["Aounon Kumar"], 493 "year": 2023, 494 "arxiv_id": "2309.02705", 495 "relevance": "Proposes certification methods for LLM safety against adversarial prompts, offering a formal approach to securing systems like LLM-as-a-Judge." 496 }, 497 { 498 "title": "Baseline defenses for adversarial attacks against aligned language models", 499 "authors": ["Sujit Jain"], 500 "year": 2023, 501 "arxiv_id": "2309.00614", 502 "relevance": "Evaluates baseline defense strategies against adversarial attacks on aligned LLMs, directly applicable to defending LLM-as-a-Judge systems." 503 } 504 ], 505 "engagement_factors": { 506 "practical_relevance": { 507 "score": 1, 508 "justification": "Highlights a vulnerability in LLM-as-a-Judge systems but provides no tool, code, or defense that practitioners can directly use." 509 }, 510 "surprise_contrarian": { 511 "score": 1, 512 "justification": "That small LLMs are vulnerable to adversarial suffix attacks is consistent with extensive prior work on LLM adversarial robustness; not surprising." 513 }, 514 "fear_safety": { 515 "score": 2, 516 "justification": "Demonstrates that LLM evaluation systems used in RLHF and model selection can be manipulated, raising concerns about AI alignment pipeline integrity." 517 }, 518 "drama_conflict": { 519 "score": 1, 520 "justification": "Implies LLM judges are unreliable, a mild 'benchmarks are broken' angle, but no direct confrontation with specific products or companies." 521 }, 522 "demo_ability": { 523 "score": 0, 524 "justification": "No code, demo, or tool released; no way for readers to reproduce or try the attacks." 525 }, 526 "brand_recognition": { 527 "score": 0, 528 "justification": "Unknown authors with no disclosed institutional affiliation; uses relatively niche 3B-parameter models (Qwen2.5-3B, Falcon3-3B)." 529 } 530 } 531 }