scan-v5.json (25489B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Investigating the Vulnerability of LLM-as-a-Judge Architectures to Prompt-Injection Attacks", 6 "authors": [ 7 "Narek Maloyan", 8 "Bislan Ashinov", 9 "Dmitry Namiot" 10 ], 11 "year": 2025, 12 "venue": "arXiv.org", 13 "arxiv_id": "2505.13348", 14 "doi": "10.48550/arXiv.2505.13348" 15 }, 16 "checklist": { 17 "claims_and_evidence": { 18 "abstract_claims_supported": { 19 "applies": true, 20 "answer": true, 21 "justification": "Abstract claims CUA achieves >30% ASR and JMA shows notable effectiveness. Table I reports 31.2-32.4% for CUA and 15.2-16.7% for JMA, directly supporting abstract claims.", 22 "source": "haiku" 23 }, 24 "causal_claims_justified": { 25 "applies": true, 26 "answer": true, 27 "justification": "Paper claims adversarial suffixes cause verdict flips and tests this with controlled experiments comparing attacked vs baseline conditions with multiple control conditions (random-suffix, token-shuffle).", 28 "source": "haiku" 29 }, 30 "generalization_bounded": { 31 "applies": true, 32 "answer": false, 33 "justification": "Abstract and conclusion claim vulnerabilities in 'current LLM-as-a-Judge systems' broadly, but experiments only test 2 open-source 3B models. 
No testing of the actual large models (GPT-4, Claude) that are used as judges in practice.", 34 "source": "haiku" 35 }, 36 "alternative_explanations_discussed": { 37 "applies": true, 38 "answer": false, 39 "justification": "Paper mechanistically explains why attacks work (token ordering, direct decision optimization) but does not discuss alternative hypotheses for why judges are vulnerable (training, architecture, attention patterns).", 40 "source": "haiku" 41 }, 42 "proxy_outcome_distinction": { 43 "applies": true, 44 "answer": true, 45 "justification": "ASR (Attack Success Rate = verdict flip percentage) is the measured outcome; claims are about 'vulnerability' and 'susceptibility' which are reasonable interpretations of this metric.", 46 "source": "haiku" 47 } 48 }, 49 "limitations_and_scope": { 50 "limitations_section_present": { 51 "applies": true, 52 "answer": false, 53 "justification": "No dedicated limitations section. Conclusion mentions one limitation: 'This work did not explore the impact of permuting the order of the attacked and genuinely superior answers,' but lacks comprehensive threats-to-validity discussion.", 54 "source": "haiku" 55 }, 56 "threats_to_validity_specific": { 57 "applies": true, 58 "answer": false, 59 "justification": "Limitations mentioned are vague (permutation order). Missing discussions of: generalization to large closed-source models, sample size justification, whether findings hold with defenses applied, dataset representativeness.", 60 "source": "haiku" 61 }, 62 "scope_boundaries_stated": { 63 "applies": true, 64 "answer": false, 65 "justification": "Paper does not explicitly state what findings do NOT show. 
Implicit scope limitations (small models, no defense evaluation, MT-Bench only) are never articulated as boundaries.", 66 "source": "haiku" 67 } 68 }, 69 "conflicts_of_interest": { 70 "funding_disclosed": { 71 "applies": true, 72 "answer": false, 73 "justification": "No funding statement or acknowledgments section provided in the paper.", 74 "source": "haiku" 75 }, 76 "affiliations_disclosed": { 77 "applies": true, 78 "answer": false, 79 "justification": "Email (maloyan.narek@gmail.com) is provided but no institutional affiliations are stated in the paper.", 80 "source": "haiku" 81 }, 82 "funder_independent_of_outcome": { 83 "applies": false, 84 "answer": false, 85 "justification": "No funding source disclosed, so cannot assess independence.", 86 "source": "haiku" 87 }, 88 "financial_interests_declared": { 89 "applies": true, 90 "answer": false, 91 "justification": "No competing interests statement or declaration of financial interests (patents, equity, consulting) provided.", 92 "source": "haiku" 93 } 94 }, 95 "scope_and_framing": { 96 "key_terms_defined": { 97 "applies": true, 98 "answer": true, 99 "justification": "Key terms are defined: LLM-as-a-Judge, prompt injection, Comparative Undermining Attack, Justification Manipulation Attack, Attack Success Rate, adversarial suffix.", 100 "source": "haiku" 101 }, 102 "intended_contribution_clear": { 103 "applies": true, 104 "answer": true, 105 "justification": "Introduction clearly states the work investigates vulnerabilities of LLM-as-a-Judge systems to prompt-injection attacks and develops optimization-based attack methods.", 106 "source": "haiku" 107 }, 108 "engagement_with_prior_work": { 109 "applies": true, 110 "answer": true, 111 "justification": "Section II engages with prior work on LLM security, LLM-as-a-Judge paradigm, and prior attacks (JudgeDeceiver, GCG), positioning this work as extending GCG methods to judge architectures.", 112 "source": "haiku" 113 } 114 } 115 }, 116 "type_checklist": { 117 
"empirical": { 118 "artifacts": { 119 "code_released": { 120 "applies": true, 121 "answer": false, 122 "justification": "Paper does not mention releasing code, attack generation scripts, or evaluation code.", 123 "source": "haiku" 124 }, 125 "data_released": { 126 "applies": true, 127 "answer": false, 128 "justification": "MT-Bench dataset is public, but the paper does not release adversarial suffixes generated, attack results, or judge model outputs.", 129 "source": "haiku" 130 }, 131 "environment_specified": { 132 "applies": true, 133 "answer": false, 134 "justification": "No specification of Python version, PyTorch version, CUDA version, hardware, or other computational environment details.", 135 "source": "haiku" 136 }, 137 "reproduction_instructions": { 138 "applies": true, 139 "answer": false, 140 "justification": "GCG algorithm described mathematically but without implementable details (exact suffix length L, number of iterations, candidate set size, evaluation function specifics).", 141 "source": "haiku" 142 } 143 }, 144 "statistical_methodology": { 145 "confidence_intervals_or_error_bars": { 146 "applies": true, 147 "answer": false, 148 "justification": "Table I reports single ASR percentages per method per model with no confidence intervals, standard deviations, or error bars.", 149 "source": "haiku" 150 }, 151 "significance_tests": { 152 "applies": true, 153 "answer": false, 154 "justification": "No statistical significance tests, p-values, or hypothesis tests comparing methods (e.g., CUA vs JMA).", 155 "source": "haiku" 156 }, 157 "effect_sizes_reported": { 158 "applies": true, 159 "answer": true, 160 "justification": "ASR percentages (e.g., 31.2% CUA vs 5.1% Hard Prompt) represent effect sizes comparing attack methods.", 161 "source": "haiku" 162 }, 163 "sample_size_justified": { 164 "applies": true, 165 "answer": false, 166 "justification": "Paper uses MT-Bench but exact number of examples tested per method is not reported. 
No justification for sample size or power analysis provided.", 167 "source": "haiku" 168 }, 169 "variance_reported": { 170 "applies": true, 171 "answer": false, 172 "justification": "Single ASR value per method per model in Table I. No variance across runs, no standard deviation, no evidence of multiple evaluation passes.", 173 "source": "haiku" 174 } 175 }, 176 "evaluation_design": { 177 "baselines_included": { 178 "applies": true, 179 "answer": true, 180 "justification": "Paper includes Hard Prompt Attack baseline and two control conditions (Random-Suffix Control, Token-Shuffle Control).", 181 "source": "haiku" 182 }, 183 "baselines_contemporary": { 184 "applies": true, 185 "answer": true, 186 "justification": "Hard Prompt Attack is a reasonable simple baseline; JudgeDeceiver [41] (2024) is cited and compared as contemporary prior work.", 187 "source": "haiku" 188 }, 189 "ablation_study": { 190 "applies": true, 191 "answer": false, 192 "justification": "Paper compares different attack methods (CUA, JMA, Hard Prompt) and controls but does not ablate components within a single method.", 193 "source": "haiku" 194 }, 195 "multiple_metrics": { 196 "applies": true, 197 "answer": false, 198 "justification": "Only Attack Success Rate (ASR) is used as an evaluation metric. No measurement of attack transferability, robustness, or other dimensions.", 199 "source": "haiku" 200 }, 201 "human_evaluation": { 202 "applies": true, 203 "answer": false, 204 "justification": "Paper uses MT-Bench which contains human-judged ground truth, but does not conduct human evaluation of attack success or realism.", 205 "source": "haiku" 206 }, 207 "held_out_test_set": { 208 "applies": true, 209 "answer": false, 210 "justification": "Paper uses MT-Bench but does not specify train/validation/test split. 
Unclear if held-out test set was used or if all MT-Bench data was used for evaluation.", 211 "source": "haiku" 212 }, 213 "per_category_breakdown": { 214 "applies": true, 215 "answer": false, 216 "justification": "Results in Table I show per-model breakdown but not per-question-type, per-answer-quality, or per-difficulty-level breakdowns.", 217 "source": "haiku" 218 }, 219 "failure_cases_discussed": { 220 "applies": true, 221 "answer": false, 222 "justification": "Paper does not discuss when attacks failed, what types of examples resisted attacks, or failure case analysis.", 223 "source": "haiku" 224 }, 225 "negative_results_reported": { 226 "applies": true, 227 "answer": true, 228 "justification": "Random-Suffix Control (1.2-1.5% ASR) and Token-Shuffle Control (2.8-3.1% ASR) demonstrate that simple perturbations are ineffective.", 229 "source": "haiku" 230 } 231 }, 232 "setup_transparency": { 233 "model_versions_specified": { 234 "applies": true, 235 "answer": true, 236 "justification": "Model versions explicitly stated: 'Qwen2.5-3B-Instruct' and 'Falcon3-3B-Instruct' with references to technical reports [52, 53].", 237 "source": "haiku" 238 }, 239 "prompts_provided": { 240 "applies": true, 241 "answer": false, 242 "justification": "Paper describes pairwise comparison task but does not provide the actual prompt template used to query the judge models.", 243 "source": "haiku" 244 }, 245 "hyperparameters_reported": { 246 "applies": true, 247 "answer": false, 248 "justification": "GCG algorithm is described at high level but key hyperparameters are not specified numerically: suffix length L is not stated as a concrete number, number of GCG iterations not specified.", 249 "source": "haiku" 250 }, 251 "scaffolding_described": { 252 "applies": true, 253 "answer": true, 254 "justification": "Pairwise comparison scaffolding is described: judge model receives (query x, answer a, answer b) and outputs preference. 
Adversarial suffix δ is appended to b.", 255 "source": "haiku" 256 }, 257 "data_preprocessing_documented": { 258 "applies": true, 259 "answer": false, 260 "justification": "Paper states MT-Bench examples are 'formed into triplets (x, a, b)' but does not document preprocessing steps, filtering criteria, or data cleaning.", 261 "source": "haiku" 262 } 263 }, 264 "data_integrity": { 265 "raw_data_available": { 266 "applies": true, 267 "answer": false, 268 "justification": "MT-Bench is publicly available from LMSYS, but attack-generated outputs, adversarial suffixes, and evaluation results are not released.", 269 "source": "haiku" 270 }, 271 "data_collection_described": { 272 "applies": true, 273 "answer": false, 274 "justification": "MT-Bench collection is referenced [35] but not described in this paper. This paper's selection criteria for which MT-Bench examples to attack is not documented.", 275 "source": "haiku" 276 }, 277 "recruitment_methods_described": { 278 "applies": false, 279 "answer": false, 280 "justification": "N/A — MT-Bench is a standard public dataset not newly collected by these authors.", 281 "source": "haiku" 282 }, 283 "data_pipeline_documented": { 284 "applies": true, 285 "answer": false, 286 "justification": "High-level pipeline is described (MT-Bench → GCG optimization → judge evaluation) but detailed pipeline from raw data to final results is not fully documented.", 287 "source": "haiku" 288 } 289 }, 290 "contamination": { 291 "training_cutoff_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "Models tested are Qwen2.5 and Falcon3 but training data cutoff dates are not stated in the paper.", 295 "source": "haiku" 296 }, 297 "train_test_overlap_discussed": { 298 "applies": true, 299 "answer": false, 300 "justification": "MT-Bench examples could potentially be in the training data of the judge models, but paper does not discuss this potential overlap.", 301 "source": "haiku" 302 }, 303 "benchmark_contamination_addressed": 
{ 304 "applies": true, 305 "answer": false, 306 "justification": "MT-Bench was created in 2023 [35] and the judge models (Qwen2.5, Falcon3) are from 2024-2025, so the benchmark predates the models and contamination of their training data is possible, but paper does not explicitly address this.", 307 "source": "haiku" 308 } 309 }, 310 "human_studies": { 311 "pre_registered": { 312 "applies": false, 313 "answer": false, 314 "justification": "N/A — no human participants in the study.", 315 "source": "haiku" 316 }, 317 "irb_or_ethics_approval": { 318 "applies": false, 319 "answer": false, 320 "justification": "N/A — no human participants in the study.", 321 "source": "haiku" 322 }, 323 "demographics_reported": { 324 "applies": false, 325 "answer": false, 326 "justification": "N/A — no human participants in the study.", 327 "source": "haiku" 328 }, 329 "inclusion_exclusion_criteria": { 330 "applies": false, 331 "answer": false, 332 "justification": "N/A — no human participants in the study.", 333 "source": "haiku" 334 }, 335 "randomization_described": { 336 "applies": false, 337 "answer": false, 338 "justification": "N/A — no human participants in the study.", 339 "source": "haiku" 340 }, 341 "blinding_described": { 342 "applies": false, 343 "answer": false, 344 "justification": "N/A — no human participants in the study.", 345 "source": "haiku" 346 }, 347 "attrition_reported": { 348 "applies": false, 349 "answer": false, 350 "justification": "N/A — no human participants in the study.", 351 "source": "haiku" 352 } 353 }, 354 "cost_and_practicality": { 355 "inference_cost_reported": { 356 "applies": true, 357 "answer": false, 358 "justification": "No inference cost, latency, or computational time reported. 
Paper uses 3B models but no details on inference speed or cost per attack.", 359 "source": "haiku" 360 }, 361 "compute_budget_stated": { 362 "applies": true, 363 "answer": false, 364 "justification": "No total computational budget, GPU hours, training time, or inference time budgets are reported.", 365 "source": "haiku" 366 } 367 } 368 } 369 }, 370 "claims": [ 371 { 372 "claim": "Comparative Undermining Attack (CUA) achieves Attack Success Rate exceeding 30%", 373 "evidence": "Table I reports CUA ASR of 31.2% on Qwen2.5-3B and 32.4% on Falcon3-3B", 374 "supported": "strong" 375 }, 376 { 377 "claim": "Justification Manipulation Attack (JMA) shows notable effectiveness around 15-17%", 378 "evidence": "Table I reports JMA ASR of 15.2% on Qwen2.5-3B and 16.7% on Falcon3-3B", 379 "supported": "strong" 380 }, 381 { 382 "claim": "Token ordering in adversarial suffixes is more important than token presence", 383 "evidence": "Token-Shuffle Control achieved only 2.8-3.1% ASR, far below the optimized suffixes (CUA 31.2-32.4%) and close to Random-Suffix Control at 1.2-1.5%, showing structure matters", 384 "supported": "moderate" 385 }, 386 { 387 "claim": "Simple heuristic prompt injections have limited effectiveness (5% ASR) compared to optimization-based attacks", 388 "evidence": "Hard Prompt Attack baseline achieved ~5% ASR vs CUA 31%+ and JMA 15%+", 389 "supported": "strong" 390 }, 391 { 392 "claim": "Current LLM-as-a-Judge systems are significantly vulnerable to prompt injection attacks", 393 "evidence": "30%+ success rates on tested models, but only demonstrated on 2 small open-source models, not on large commercial judges actually deployed", 394 "supported": "weak" 395 }, 396 { 397 "claim": "Direct decision-token optimization (CUA) is more effective than reasoning manipulation (JMA)", 398 "evidence": "CUA achieved 31-32% vs JMA 15-17% ASR; paper attributes this to more direct optimization objective", 399 "supported": "moderate" 400 } 401 ], 402 "methodology_tags": [ 403 "benchmark-eval", 404 "empirical" 405 ], 406 "key_findings": 
"LLM-as-a-Judge systems are vulnerable to prompt injection attacks via adversarial suffix optimization. Comparative Undermining Attack achieves 30%+ success rate in flipping judge verdicts on two open-source 3B models (Qwen2.5-3B-Instruct, Falcon3-3B-Instruct). Justification Manipulation Attack achieves 15-17% success, and control conditions (random text, token shuffling) show that attack effectiveness depends on specific token content and ordering, not just the presence of additional text.", 407 "red_flags": [ 408 { 409 "flag": "Limited model scope", 410 "detail": "Only tested on 2 open-source 3B models; results do not demonstrate whether large/closed-source models (GPT-4, Claude) that are actually used as judges in production are similarly vulnerable" 411 }, 412 { 413 "flag": "No statistical significance testing", 414 "detail": "Single ASR percentage reported per method per model with no confidence intervals, error bars, or hypothesis tests" 415 }, 416 { 417 "flag": "No defense evaluation", 418 "detail": "Paper identifies vulnerabilities but does not evaluate or test any mitigation strategies, defense mechanisms, or robustness improvements" 419 }, 420 { 421 "flag": "Incomplete methodological reporting", 422 "detail": "Critical hyperparameters missing: suffix length L never specified numerically, number of GCG iterations not stated, candidate set size not reported" 423 }, 424 { 425 "flag": "Overgeneralization in claims", 426 "detail": "Abstract and conclusions claim vulnerabilities in 'current LLM-as-a-Judge systems' broadly, but experiments only cover 2 small models" 427 }, 428 { 429 "flag": "No variance or uncertainty reporting", 430 "detail": "Table I shows only point estimates; unclear if results are from single run or averaged across multiple attacks" 431 }, 432 { 433 "flag": "Missing details on MT-Bench subset", 434 "detail": "Exact number of MT-Bench examples used for attacks not reported; unclear what fraction of the full dataset was evaluated" 435 }, 436 
{ 437 "flag": "No actual prompts provided", 438 "detail": "Pairwise comparison prompt template not included; evaluation setup cannot be fully reproduced" 439 } 440 ], 441 "cited_papers": [ 442 { 443 "title": "Quantifying and understanding adversarial prompting", 444 "authors": "Carlini et al.", 445 "year": 2023, 446 "relevance": "Foundational work on adversarial attacks against LLMs; categorizes attack classes relevant to this work's methodology" 447 }, 448 { 449 "title": "Universal and transferable adversarial attacks on aligned language models", 450 "authors": "Zou et al.", 451 "year": 2023, 452 "relevance": "Introduces GCG (Greedy Coordinate Gradient) method used in this paper's attack implementation" 453 }, 454 { 455 "title": "JudgeDeceiver: Prompt injection attacks to manipulate LLM-as-a-judge", 456 "authors": "Shi et al.", 457 "year": 2024, 458 "relevance": "Directly prior work on attacking judge models; demonstrates universal templates achieve 22-24% ASR" 459 }, 460 { 461 "title": "Judging llm-as-a-judge with mt-bench and chatbot arena", 462 "authors": "Zheng et al.", 463 "year": 2023, 464 "relevance": "Seminal work establishing LLM-as-a-Judge paradigm and MT-Bench dataset used in this evaluation" 465 }, 466 { 467 "title": "Bad-Judge: Backdoor vulnerabilities of LLM-as-a-judge", 468 "authors": "Wang et al.", 469 "year": 2024, 470 "relevance": "Complementary work on backdoor attacks against judge models during training" 471 }, 472 { 473 "title": "SmoothLLM: Defending large language models against jailbreaking attacks", 474 "authors": "Robey et al.", 475 "year": 2023, 476 "relevance": "Proposes defense mechanisms against adversarial attacks; cited as future direction for judge robustness" 477 }, 478 { 479 "title": "How helpful is ChatGPT as a judge?", 480 "authors": "Gu et al.", 481 "year": 2024, 482 "relevance": "Empirical study of reliability and limitations of LLM judges; contextualizes why vulnerabilities matter" 483 }, 484 { 485 "title": "Attention 
tracker: Detecting prompt injection attacks in LLMs", 486 "authors": "Zhang et al.", 487 "year": 2024, 488 "relevance": "Defense mechanism for detecting prompt injection; cited as approach to mitigate vulnerabilities demonstrated here" 489 } 490 ], 491 "engagement_factors": { 492 "practical_relevance": { 493 "score": 2, 494 "justification": "Identifies real vulnerability in deployed evaluation systems (LLM judges used in RLHF, model evaluation), but provides no defenses or mitigation strategies." 495 }, 496 "surprise_contrarian": { 497 "score": 1, 498 "justification": "LLM vulnerability to adversarial attacks is well-established; applying known techniques to judge models is incremental rather than surprising." 499 }, 500 "fear_safety": { 501 "score": 2, 502 "justification": "Raises concerns about evaluation system integrity that could compromise model safety pipelines (RLHF relies on judge correctness), but doesn't frame as existential risk." 503 }, 504 "demo_ability": { 505 "score": 3, 506 "justification": "GCG-based attacks build on a published method and the paper uses publicly available models (Qwen2.5-3B, Falcon3-3B), so a demonstration is feasible, although the paper releases no code and omits the hyperparameters and judge prompt needed to reproduce its results exactly." 507 }, 508 "brand_recognition": { 509 "score": 1, 510 "justification": "No institutional affiliations are stated in the paper (only a personal email address is given); no indicated affiliation with well-known AI labs. Limited institutional prestige." 511 }, 512 "drama_conflict": { 513 "score": 2, 514 "justification": "Has controversy potential (security vulnerability in widely-used evaluation systems) but presented as dry technical research without sensationalism." 
515 } 516 }, 517 "hn_data": { 518 "threads": [ 519 { 520 "hn_id": "36038868", 521 "title": "RWKV: Reinventing RNNs for the Transformer Era", 522 "points": 358, 523 "comments": 171, 524 "url": "https://news.ycombinator.com/item?id=36038868" 525 }, 526 { 527 "hn_id": "45341511", 528 "title": "Learn Your Way: Towards an AI-Augmented Textbook, Google Research", 529 "points": 3, 530 "comments": 0, 531 "url": "https://news.ycombinator.com/item?id=45341511" 532 }, 533 { 534 "hn_id": "44619169", 535 "title": "Palatable Conceptions of Disembodied Being", 536 "points": 3, 537 "comments": 0, 538 "url": "https://news.ycombinator.com/item?id=44619169" 539 }, 540 { 541 "hn_id": "43411379", 542 "title": "New Computer with intergrated Brain Computer interface", 543 "points": 3, 544 "comments": 0, 545 "url": "https://news.ycombinator.com/item?id=43411379" 546 }, 547 { 548 "hn_id": "43417925", 549 "title": "Bioscience Lab in home for your Brain and Body, control laptop via mind", 550 "points": 1, 551 "comments": 0, 552 "url": "https://news.ycombinator.com/item?id=43417925" 553 }, 554 { 555 "hn_id": "42898154", 556 "title": "Building a Verifiable Logical Clock for P2P Networks", 557 "points": 1, 558 "comments": 0, 559 "url": "https://news.ycombinator.com/item?id=42898154" 560 }, 561 { 562 "hn_id": "27932480", 563 "title": "Shining Light on Quantum Transport in Fractal Networks", 564 "points": 1, 565 "comments": 0, 566 "url": "https://news.ycombinator.com/item?id=27932480" 567 } 568 ], 569 "top_points": 358, 570 "total_points": 370, 571 "total_comments": 171 572 } 573 }