scan.json (28889B)
1 { 2 "paper": { 3 "title": "\"Give a Positive Review Only\": An Early Investigation Into In-Paper Prompt Injection Attacks and Defenses for AI Reviewers", 4 "authors": [ 5 "Qin Zhou", 6 "Zhexin Zhang", 7 "Zhi Li", 8 "Limin Sun" 9 ], 10 "year": 2025, 11 "venue": "arXiv", 12 "arxiv_id": "2511.01287", 13 "doi": "10.48550/arXiv.2511.01287" 14 }, 15 "scan_version": 2, 16 "active_modules": ["experimental_rigor", "data_leakage"], 17 "methodology_tags": ["benchmark-eval"], 18 "key_findings": "AI peer review systems are highly vulnerable to in-paper prompt injection (IPI) attacks. Static attacks increase ratings by 1.24-2.80 points across GPT-5, DeepSeek-Chat, and Gemini-2.5-Pro, while iterative attacks push scores near the maximum of 10 for DeepSeek and Gemini within 3 rounds. GPT-5 is the most robust to both attack types. A detection-based defense achieves 99% detection of naive attacks but adaptive adversarial strategies reduce detection to 24%, still inflating scores by ~1 point on average.", 19 "checklist": { 20 "artifacts": { 21 "code_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper. They mention using 'an automated Python tool' for injection but do not release it." 25 }, 26 "data_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "The 100 papers were randomly sampled from the publicly available OpenReview platform for ICLR 2025 submissions. The underlying data source is public, though the specific sample IDs are not listed." 30 }, 31 "environment_specified": { 32 "applies": true, 33 "answer": false, 34 "justification": "No environment specification is provided. They use API-based models via OpenRouter but provide no dependency list, requirements.txt, or setup details." 35 }, 36 "reproduction_instructions": { 37 "applies": true, 38 "answer": false, 39 "justification": "No step-by-step reproduction instructions are provided. 
A reader would need to reverse-engineer the experimental setup from the method description." 40 } 41 }, 42 "statistical_methodology": { 43 "confidence_intervals_or_error_bars": { 44 "applies": true, 45 "answer": true, 46 "justification": "Tables 2, 3, and 4 report mean ± standard deviation across runs. For example, Table 2 reports 'mean ± std' for all conditions." 47 }, 48 "significance_tests": { 49 "applies": true, 50 "answer": false, 51 "justification": "A single t-test is reported in Table 6 for the defense evaluation (p=0.26 comparing defended vs. baseline). However, no significance tests are applied to the main attack claims in Table 2, where the paper claims attacks 'significantly increase paper scores' based only on comparing means." 52 }, 53 "effect_sizes_reported": { 54 "applies": true, 55 "answer": true, 56 "justification": "Effect sizes are reported throughout: '+1.91 for Gemini, +2.80 for DeepSeek, and +1.24 for GPT-5' (Section 1). Table 5 reports improvements in parentheses (e.g., '+2.97', '+2.78'). Baseline context is provided." 57 }, 58 "sample_size_justified": { 59 "applies": true, 60 "answer": false, 61 "justification": "No justification for the 100-paper sample size. The Limitations section acknowledges the constraint ('we limit our evaluation to 100 papers') but attributes it to cost rather than providing a power analysis or sample size rationale." 62 }, 63 "variance_reported": { 64 "applies": true, 65 "answer": true, 66 "justification": "Standard deviation is reported across three independent review runs for all main results. Tables 2, 3, and 4 consistently show mean ± std format." 67 } 68 }, 69 "evaluation_design": { 70 "baselines_included": { 71 "applies": true, 72 "answer": true, 73 "justification": "All attack conditions are compared against 'Original Paper (No Attack)' baseline in Table 2. Human reviewer scores are also included as a reference baseline." 
74 }, 75 "baselines_contemporary": { 76 "applies": true, 77 "answer": true, 78 "justification": "The baseline is the natural no-attack condition, which is the appropriate comparison for an attack study. They also compare against human review scores (avg 5.12). Prior attack work (Ye et al. 2024, Lin 2025, Collu et al. 2025) is discussed, though not directly compared experimentally." 79 }, 80 "ablation_study": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple ablation-like analyses are conducted: 4 static attack variants (Table 2), 2 iterative attack seeds (Table 4), impact of injection position (Table 3), impact of iteration count (Table 4), and cross-model transferability (Table 5)." 84 }, 85 "multiple_metrics": { 86 "applies": true, 87 "answer": true, 88 "justification": "Multiple metrics are used: overall rating (1-10), sub-dimensions (Soundness, Presentation, Contribution on 1-4 scale), Pearson correlation with human scores (Figure 5), attack detection rate, and full prompt recovery rate (Tables 6-7)." 89 }, 90 "human_evaluation": { 91 "applies": true, 92 "answer": true, 93 "justification": "Human review scores from ICLR 2025 are included as a reference (Table 2, avg 5.12). Figure 5 compares AI and human ratings with Pearson correlations (r=0.501-0.643)." 94 }, 95 "held_out_test_set": { 96 "applies": true, 97 "answer": false, 98 "justification": "For the main iterative attack, the attack is optimized and evaluated on the same model-paper pair with no held-out split. The transferability experiment (Table 5) tests on different models, but the 100 papers are not split into optimization and evaluation sets." 99 }, 100 "per_category_breakdown": { 101 "applies": true, 102 "answer": true, 103 "justification": "Results are broken down by model (3 models in all tables), attack type (4 static + 2 iterative in Table 2), injection position (Table 3), human rating bins (Figure 3), and paper length bins (Figure 4)." 
104 }, 105 "failure_cases_discussed": { 106 "applies": true, 107 "answer": true, 108 "justification": "Appendix B.1.2 shows a failure case where GPT-5 recognizes and ignores the injection ('I need to act as a reviewer... seems untrustworthy. I'll ignore that'). Defense failure cases are discussed in Section 4.4 and Appendix B.3.3-B.3.4." 109 }, 110 "negative_results_reported": { 111 "applies": true, 112 "answer": true, 113 "justification": "GPT-5's resistance to attacks is reported (Prompt 1 yields only +0.05 on GPT-5, Table 2). The defense's overcorrection issue (17 cases scoring >1.5 below baseline) is reported in Table 6. The defense fails against adaptive attacks (Table 7)." 114 } 115 }, 116 "claims_and_evidence": { 117 "abstract_claims_supported": { 118 "applies": true, 119 "answer": true, 120 "justification": "Abstract claims are supported: 'striking performance' is backed by Table 2 (scores approaching 10); 'robust across various settings' by Table 3 and Figures 3-4; defense 'substantially reduces attack success rate' by Table 6 (10.00→7.27); 'adaptive attacker can partially circumvent' by Table 7 (score rises to 8.11)." 121 }, 122 "causal_claims_justified": { 123 "applies": true, 124 "answer": true, 125 "justification": "The paper claims injected prompts cause score inflation. The controlled experimental design (same 100 papers evaluated with and without injection, everything else held constant) supports causal inference. The manipulation is directly controlled by the experimenters." 126 }, 127 "generalization_bounded": { 128 "applies": true, 129 "answer": false, 130 "justification": "The abstract and conclusion make broad claims ('current AI review systems are vulnerable', 'fundamental vulnerabilities in AI-assisted reviewing pipelines') based on 3 models and 100 papers. While the title hedges ('An Early Investigation'), the body language generalizes beyond the tested scope. 
The Limitations section only acknowledges sample size, not the bounded scope of the claims." 131 }, 132 "alternative_explanations_discussed": { 133 "applies": true, 134 "answer": false, 135 "justification": "No alternative explanations are discussed. The baseline AI scores are substantially higher than human scores (7.03-7.06 vs. 5.12 for two models), suggesting these models are inherently generous reviewers. This confound — that generous models may be more susceptible — is not explored. No robustness checks against alternative explanations are provided." 136 }, 137 "proxy_outcome_distinction": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper measures AI review scores and claims about AI review scores. There is no proxy gap — the measurement (overall rating on 1-10 scale) matches what is claimed (vulnerability of AI review scoring to manipulation)." 141 } 142 }, 143 "setup_transparency": { 144 "model_versions_specified": { 145 "applies": true, 146 "answer": false, 147 "justification": "Models are identified as 'GPT-5', 'DeepSeek-Chat (DeepSeek-V3)', and 'Gemini-2.5-Pro' — marketing names without API versions or snapshot dates. No specific model version identifiers (e.g., dates, API version strings) are provided." 148 }, 149 "prompts_provided": { 150 "applies": true, 151 "answer": true, 152 "justification": "Full attack prompts are provided in Table 1. The iterative attack optimization instruction, defense instruction, and adaptive attack instruction are all provided in Appendix A. The review criteria prompt is in Figure 6." 153 }, 154 "hyperparameters_reported": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 4.1 states 'we fix the decoding temperature at 0.9 across all models.' Temperature is the key sampling parameter. Other parameters (top-p, max_tokens) are not mentioned." 158 }, 159 "scaffolding_described": { 160 "applies": false, 161 "answer": false, 162 "justification": "No agentic scaffolding is used. 
The approach calls model APIs directly with paper text and review instructions." 163 }, 164 "data_preprocessing_documented": { 165 "applies": true, 166 "answer": true, 167 "justification": "Section 4.1 describes: papers sampled from OpenReview, PDFs processed using 'OpenRouter's pdf-text engine to parse PDF papers before providing the extracted text to each reviewer model.' Attack prompts injected via 'automated Python tool' in 'white and at microscopic font sizes.'" 168 } 169 }, 170 "limitations_and_scope": { 171 "limitations_section_present": { 172 "applies": true, 173 "answer": true, 174 "justification": "A dedicated 'Limitations' section is present after the Conclusion, discussing the cost constraint and limited scope of 100 papers and three AI reviewers." 175 }, 176 "threats_to_validity_specific": { 177 "applies": true, 178 "answer": true, 179 "justification": "The Limitations section identifies a specific constraint for this study: 'Due to the substantial cost associated with generating AI-based reviews for scientific papers—where a single paper can consume tens of thousands of tokens—we limit our evaluation to 100 papers and three AI reviewers.' This is specific to the study's cost-sample tradeoff." 180 }, 181 "scope_boundaries_stated": { 182 "applies": true, 183 "answer": false, 184 "justification": "While the Limitations section mentions the constraint on paper count and model count, it does not explicitly state what the results do NOT show or what claims the authors are NOT making. There are no statements bounding generalization to other review settings, languages, or model types." 185 } 186 }, 187 "data_integrity": { 188 "raw_data_available": { 189 "applies": true, 190 "answer": false, 191 "justification": "No raw data is released. The specific paper IDs sampled, individual per-paper scores, injected PDF files, and full review outputs are not available for independent verification." 
192 }, 193 "data_collection_described": { 194 "applies": true, 195 "answer": true, 196 "justification": "Section 4.1 describes: 'We randomly sampled 100 papers (in PDF format) together with their full peer reviews from the pool of 20,000+ submissions on OpenReview for ICLR 2025.' Figure 2 shows the distribution of the sample." 197 }, 198 "recruitment_methods_described": { 199 "applies": false, 200 "answer": false, 201 "justification": "No human participants. The data source is ICLR 2025 submissions on OpenReview, a standard public platform." 202 }, 203 "data_pipeline_documented": { 204 "applies": true, 205 "answer": true, 206 "justification": "The pipeline is documented: random sampling from OpenReview → PDF text extraction via OpenRouter's pdf-text engine → prompt injection into PDFs → evaluation by each model 3 times → score aggregation. Section 4.1 covers this flow." 207 } 208 }, 209 "conflicts_of_interest": { 210 "funding_disclosed": { 211 "applies": true, 212 "answer": false, 213 "justification": "No funding source is disclosed. No acknowledgments section mentioning grants or sponsors is present in the paper." 214 }, 215 "affiliations_disclosed": { 216 "applies": true, 217 "answer": true, 218 "justification": "Author affiliations are clearly listed: Institute of Information Engineering (CAS), University of Chinese Academy of Sciences, and Tsinghua University. These are academic institutions, not affiliated with the AI companies whose models are tested." 219 }, 220 "funder_independent_of_outcome": { 221 "applies": true, 222 "answer": false, 223 "justification": "No funding is disclosed, so independence cannot be verified. The absence of a funding statement means this criterion cannot be confirmed." 224 }, 225 "financial_interests_declared": { 226 "applies": true, 227 "answer": false, 228 "justification": "No competing interests or financial interest declaration is present in the paper." 
229 } 230 }, 231 "contamination": { 232 "training_cutoff_stated": { 233 "applies": false, 234 "answer": false, 235 "justification": "This is a red-teaming study testing prompt injection attacks against AI reviewers, not evaluating model knowledge on a benchmark. Contamination is not the relevant concern." 236 }, 237 "train_test_overlap_discussed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Red-teaming study testing attack vulnerability, not model capability on benchmark tasks." 241 }, 242 "benchmark_contamination_addressed": { 243 "applies": false, 244 "answer": false, 245 "justification": "Red-teaming study testing attack vulnerability, not model capability on benchmark tasks." 246 } 247 }, 248 "human_studies": { 249 "pre_registered": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study. All evaluation is automated using AI models." 253 }, 254 "irb_or_ethics_approval": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "demographics_reported": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "inclusion_exclusion_criteria": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 }, 269 "randomization_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants." 273 }, 274 "blinding_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants." 278 }, 279 "attrition_reported": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants." 
283 } 284 }, 285 "cost_and_practicality": { 286 "inference_cost_reported": { 287 "applies": true, 288 "answer": false, 289 "justification": "The Limitations section mentions 'substantial cost' and 'a single paper can consume tens of thousands of tokens' but provides no specific cost figures, API spend, or cost-per-paper estimates." 290 }, 291 "compute_budget_stated": { 292 "applies": true, 293 "answer": false, 294 "justification": "No total computational budget is stated. The number of API calls can be inferred (100 papers × 3 models × 3 runs × multiple conditions) but total cost or compute time is not reported." 295 } 296 }, 297 "experimental_rigor": { 298 "seed_sensitivity_reported": { 299 "applies": true, 300 "answer": true, 301 "justification": "Each paper is independently evaluated 3 times per condition, and variance across runs is reported as standard deviation. For API-based models with temperature=0.9, this captures stochastic variation. Tables 2-4 all show mean ± std." 302 }, 303 "number_of_runs_stated": { 304 "applies": true, 305 "answer": true, 306 "justification": "Section 4.1 states 'each paper is independently evaluated three times by the reviewer model.' For iterative attacks, 'each paper undergoes three independent trials' (Table 2 caption)." 307 }, 308 "hyperparameter_search_budget": { 309 "applies": true, 310 "answer": false, 311 "justification": "No hyperparameter search budget is reported. Temperature is fixed at 0.9 without justification for this choice. No search over other parameters is described." 312 }, 313 "best_config_selection_justified": { 314 "applies": true, 315 "answer": true, 316 "justification": "For the transferability experiment (Table 5), selection is clearly justified: 'the attack prompt is the highest-scoring one selected from Iterative Attack (Prompt 3-based), based on three independent trials, each with three refinement iterations (9 candidates in total).' All attack variants are reported in Table 2, not just the best." 
317 }, 318 "multiple_comparison_correction": { 319 "applies": false, 320 "answer": false, 321 "justification": "Only one formal statistical test is conducted (t-test in Table 6). Multiple comparison correction is not applicable for a single test." 322 }, 323 "self_comparison_bias_addressed": { 324 "applies": true, 325 "answer": false, 326 "justification": "The authors propose and evaluate their own attack methods without acknowledging potential self-evaluation bias. No independent evaluation or discussion of this bias is present." 327 }, 328 "compute_budget_vs_performance": { 329 "applies": true, 330 "answer": false, 331 "justification": "While Table 4 implicitly shows that more iterations yield higher scores, the compute cost per iteration is not quantified. No explicit performance-vs-compute analysis is provided." 332 }, 333 "benchmark_construct_validity": { 334 "applies": true, 335 "answer": false, 336 "justification": "The paper uses AI review scores as the measure of attack success without questioning whether overall score is the right metric. No discussion of whether score inflation translates to actual acceptance decisions or whether the review criteria faithfully capture review quality." 337 }, 338 "scaffold_confound_addressed": { 339 "applies": false, 340 "answer": false, 341 "justification": "No scaffolding is involved. Models are called directly via API." 342 } 343 }, 344 "data_leakage": { 345 "temporal_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "The models (GPT-5, DeepSeek-V3, Gemini-2.5-Pro) may have been trained on ICLR papers or OpenReview content, which could affect baseline review scores. This temporal relationship is not discussed." 
349 }, 350 "feature_leakage_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether models may have seen the specific ICLR 2025 papers or their reviews during training, which would constitute feature leakage affecting baseline scores." 354 }, 355 "non_independence_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the 100 sampled papers share structural similarities (same subfields, overlapping authors) that could affect results." 359 }, 360 "leakage_detection_method": { 361 "applies": true, 362 "answer": false, 363 "justification": "No leakage detection or prevention method is applied. No analysis of whether models have prior familiarity with the test papers." 364 } 365 } 366 }, 367 "claims": [ 368 { 369 "claim": "Static attacks increase average AI reviewer ratings by 1.24-2.80 points across three frontier models.", 370 "evidence": "Table 2: Prompt 3 yields +1.91 (Gemini), +2.80 (DeepSeek), and +0.20 (GPT-5). Prompt 4 yields +1.24 (GPT-5). Results averaged over 100 papers × 3 runs.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Iterative attacks can push scores to near-maximum (10) for Gemini and DeepSeek within three optimization rounds.", 375 "evidence": "Table 2: Iterative attack (Prompt 3-based) achieves 9.84±0.03 for Gemini and 10.00±0.00 for DeepSeek. Table 4 shows progression across iterations.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "GPT-5 is the most robust reviewer model to both static and iterative attacks.", 380 "evidence": "Table 2: GPT-5 maximum iterative score is 6.99±0.05 vs. 9.84/10.00 for Gemini/DeepSeek. Table 5: GPT-5 has smallest cross-model vulnerability (+0.15 to +1.04).", 381 "supported": "strong" 382 }, 383 { 384 "claim": "Attacks are robust across injection positions, human ratings, and paper lengths.", 385 "evidence": "Table 3: All three positions yield similar scores (8.90-8.97). 
Figures 3-4: Attack effectiveness is consistent across binned human ratings and paper lengths.", 386 "supported": "strong" 387 }, 388 { 389 "claim": "Attack prompts optimized on GPT-5 show the strongest cross-model transferability.", 390 "evidence": "Table 5: GPT-5 prompts yield +2.19 on Gemini and +2.72 on DeepSeek, the highest cross-model gains. However, this is from a single experiment with no variance reported for transfer results.", 391 "supported": "moderate" 392 }, 393 { 394 "claim": "Detection-based defense identifies 99% of naive attacks and reduces average score from 10.00 to 7.27.", 395 "evidence": "Table 6: 99/100 detection rate, full prompt recovery in 91/100 cases. Average score drops from 10.00 to 7.27 (p=0.26 vs. no-attack baseline of 7.06).", 396 "supported": "strong" 397 }, 398 { 399 "claim": "Adaptive attacks circumvent the detection defense, with only 24% of attacks detected.", 400 "evidence": "Table 7: Detection rate drops to 24/100. Average score rises to 8.11 under adaptive attack. 25 undetected cases score >1.5 above baseline.", 401 "supported": "moderate" 402 } 403 ], 404 "red_flags": [ 405 { 406 "flag": "No significance tests for main claims", 407 "detail": "The core attack effectiveness claims (Table 2) rely on comparing means without any statistical significance tests. The word 'significantly' is used in the paper to describe results ('significantly increase paper scores') without statistical backing. Only one t-test is reported (Table 6, for the defense experiment)." 408 }, 409 { 410 "flag": "Baseline AI generosity not addressed as confound", 411 "detail": "DeepSeek (7.06) and Gemini (7.03) rate papers substantially higher than human reviewers (5.12) even without attacks. This inherent generosity may correlate with susceptibility to prompt injection, but this confound is not discussed." 
412 }, 413 { 414 "flag": "No code or raw data released", 415 "detail": "Despite being a security study where reproducibility is important for the community, no code, specific paper IDs, or raw results are released." 416 }, 417 { 418 "flag": "Transfer experiment lacks variance", 419 "detail": "Table 5 (cross-model transferability) reports results from a single evaluation per paper with no variance or significance tests, despite being a key claim." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "Jailbreaking black box large language models in twenty queries", 425 "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"], 426 "year": 2025, 427 "relevance": "Jailbreaking techniques that bypass LLM safety constraints, directly related to adversarial attacks on LLM-based systems." 428 }, 429 { 430 "title": "Universal adversarial triggers for attacking and analyzing NLP", 431 "authors": ["Eric Wallace", "Shi Feng", "Nikhil Kandpal", "Matt Gardner", "Sameer Singh"], 432 "year": 2019, 433 "relevance": "Foundational work on universal adversarial triggers that induce harmful model responses across diverse inputs." 434 }, 435 { 436 "title": "Universal and transferable adversarial attacks on aligned language models", 437 "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"], 438 "year": 2023, 439 "arxiv_id": "2307.15043", 440 "relevance": "Demonstrates universal and transferable attacks on aligned LLMs, directly relevant to attack transferability findings in this paper." 441 }, 442 { 443 "title": "Prompt injection attack against LLM-integrated applications", 444 "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li"], 445 "year": 2023, 446 "arxiv_id": "2306.05499", 447 "relevance": "Core work on prompt injection attacks against LLM applications, the attack paradigm this paper applies to peer review." 
448 }, 449 { 450 "title": "Formalizing and benchmarking prompt injection attacks and defenses", 451 "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"], 452 "year": 2024, 453 "relevance": "Systematic benchmarking of prompt injection attack and defense methods across LLM applications." 454 }, 455 { 456 "title": "Are we there yet? revealing the risks of utilizing large language models in scholarly peer review", 457 "authors": ["Rui Ye", "Xianghe Pang", "Jingyi Chai"], 458 "year": 2024, 459 "arxiv_id": "2412.01708", 460 "relevance": "Directly related work showing explicit manipulations can influence AI reviewers to produce positive reviews." 461 }, 462 { 463 "title": "Hidden prompts in manuscripts exploit AI-assisted peer review", 464 "authors": ["Zhicheng Lin"], 465 "year": 2025, 466 "arxiv_id": "2507.06185", 467 "relevance": "Systematically examined arXiv preprints for hidden prompts, identifying four categories in 18 papers. Direct precursor to this work." 468 }, 469 { 470 "title": "Publish to perish: Prompt injection attacks on LLM-assisted peer review", 471 "authors": ["Matteo Gioele Collu", "Umberto Salviati", "Roberto Confalonieri", "Mauro Conti", "Giovanni Apruzzese"], 472 "year": 2025, 473 "arxiv_id": "2508.20863", 474 "relevance": "Concurrent work investigating hidden adversarial prompt injections in LLM-assisted peer review using 26 rejected ICLR papers." 475 }, 476 { 477 "title": "AgentReview: Exploring peer review dynamics with LLM agents", 478 "authors": ["Yiqiao Jin", "Qinlin Zhao", "Yiyang Wang"], 479 "year": 2024, 480 "arxiv_id": "2406.12708", 481 "relevance": "Explores using LLM agents for peer review, the target application under attack in this paper." 482 }, 483 { 484 "title": "Can large language models provide useful feedback on research papers? 
A large-scale empirical analysis", 485 "authors": ["Weixin Liang", "Yuhui Zhang", "Hancheng Cao"], 486 "year": 2024, 487 "relevance": "Large-scale evaluation of LLM review feedback quality, relevant to understanding the baseline capability of AI reviewers being attacked." 488 }, 489 { 490 "title": "Automatic and universal prompt injection attacks against large language models", 491 "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"], 492 "year": 2024, 493 "arxiv_id": "2403.04957", 494 "relevance": "Automatic prompt injection attack generation for LLMs, directly related to the iterative attack paradigm in this paper." 495 }, 496 { 497 "title": "Judging the judges: Evaluating alignment and vulnerabilities in LLMs-as-judges", 498 "authors": ["Aman Singh Thakur", "Kartik Choudhary", "Venkat Srinik Ramayapally"], 499 "year": 2024, 500 "relevance": "Evaluates vulnerabilities in LLM-based evaluation systems, relevant to the broader concern of AI reviewer manipulation." 501 } 502 ] 503 }