ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (34727B)


      1 {
      2   "paper": {
      3     "title": "When Reject Turns into Accept: Quantifying the Vulnerability of LLM-Based Scientific Reviewers to Indirect Prompt Injection",
      4     "authors": [
      5       "Devanshu Sahoo",
      6       "Manish Prasad",
      7       "Vasudev Majhi",
      8       "Jahnvi Singh",
      9       "Vinay Chamola",
     10       "Yash Sinha",
     11       "Murari Mandal",
     12       "Dhruv Kumar"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv.org",
     16     "arxiv_id": "2512.10449",
     17     "doi": "10.48550/arXiv.2512.10449"
     18   },
     19   "scan_version": 3,
     20   "active_modules": ["experimental_rigor", "data_leakage"],
     21   "methodology_tags": ["benchmark-eval"],
     22   "key_findings": "Adversarial invisible-text injections in PDFs can flip LLM reviewer decisions from Reject to Accept at rates up to 86.26% for open-source models, with token-level obfuscation (Class I) strategies proving most effective. Proprietary models like GPT-5 show near-perfect robustness, but distilled variants (GPT-5-Mini) and advanced reasoning models (Gemini-2.5-Pro) exhibit specific vulnerability patterns — a 'safety tax' from distillation and a 'reasoning trap' where instruction-following fidelity is weaponized. Social engineering attacks (Class III) frequently backfire, actually reducing scores in both open and closed-source models.",
     23   "checklist": {
     24     "artifacts": {
     25       "code_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The abstract provides a repository URL (https://anonymous.4open.science/r/llm-jailbreak-FC9E/) and Section 1 Contribution 4 states 'we will release our complete experimental framework including code and dataset.' The anonymous link is functional at time of review."
     29       },
     30       "data_released": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper states the dataset of 200 papers and injection framework will be released via the anonymous repository. The ICLR 2025 OpenReview papers used are publicly available. Section 1 Contribution 4 explicitly commits to releasing the dataset."
     34       },
     35       "environment_specified": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No requirements.txt, Dockerfile, conda environment, or detailed environment setup section is provided. The paper mentions Ollama for local models and MinerU for PDF conversion but provides no version pinning or dependency specifications."
     39       },
     40       "reproduction_instructions": {
     41         "applies": true,
     42         "answer": false,
     43         "justification": "No step-by-step reproduction instructions are included in the paper. The pipeline is described at a high level (Section 3.3, Figure 1) but there are no README-style commands or scripts to replicate the experiments."
     44       }
     45     },
     46     "statistical_methodology": {
     47       "confidence_intervals_or_error_bars": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "All results are reported as point estimates — average score increases (Figures 2, 3), percentage changes in acceptance rates (Table 3), and WAVS scores (Figures 4, 5). No confidence intervals, error bars, or uncertainty measures are provided anywhere."
     51       },
     52       "significance_tests": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The paper makes numerous comparative claims (e.g., 'Cls1MSM achieved near-perfect score inflation on mistral-small:22b (+13.95)' vs. qwen3:30b resilience) without any statistical significance tests. All comparisons are based on raw number differences."
     56       },
     57       "effect_sizes_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Effect sizes are reported as Average Score Increase on a 0-35 scale (e.g., +13.95 for Cls1MSM on mistral-small), percentage increase in acceptance rates (e.g., 86.26%), and the WAVS composite metric. Baselines are provided for context."
     61       },
     62       "sample_size_justified": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The dataset comprises 200 papers (30 templates, 125 rejected, 30 poster, 15 spotlight) with a subset of 50 for closed-source models 'to accommodate cost and rate limits' (Section 4.1). No power analysis or statistical justification for these sizes is provided."
     66       },
     67       "variance_reported": {
     68         "applies": true,
     69         "answer": false,
     70         "justification": "Only aggregate averages are reported. No standard deviations, interquartile ranges, or variances across papers within each (model, strategy) condition are given. The strip plots in Appendix G show individual scores, but no summary spread measures are computed."
     71       }
     72     },
     73     "evaluation_design": {
     74       "baselines_included": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "All metrics are computed relative to the un-injected baseline. Section 3.3 describes how the original paper is scored without adversarial injection, and all score increases (Figures 2, 3) and decision flip rates (Table 3) are measured against this baseline."
     78       },
     79       "baselines_contemporary": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "The models evaluated are contemporary (GPT-5, Claude Haiku 4.5, Gemini 2.5 Pro/Flash, DeepSeek-R1-32B, Qwen3-30B, etc.) and represent the state of the art as of late 2025. The baseline is the same models without attack, which is appropriate."
     83       },
     84       "ablation_study": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "While 15 strategies across 3 classes are tested individually, there is no systematic ablation of attack components (e.g., testing misspellings alone vs. variable injection alone within Cls1MSM, or testing invisible text vs. visible injection). The strategies are tested as fixed units."
     88       },
     89       "multiple_metrics": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Three primary metrics are used: Average Score Increase (Section 4.3), Percentage Increase in Acceptance Rates (Table 3), and the novel WAVS metric (Appendix A) with its three sub-components (Score Sensitivity, Semantic Flip Severity, Risk Alignment)."
     93       },
     94       "human_evaluation": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "No human evaluation is included. There is no human assessment of whether the LLM reviews are actually valid, no human judges evaluating the quality of the adversarial injections, and no comparison of LLM review outputs against human expert reviews."
     98       },
     99       "held_out_test_set": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "All 200 papers are used for evaluation with no held-out set. The subset of 50 used for closed-source models was selected for cost reasons, not as a held-out validation split. No dev/test separation is described."
    103       },
    104       "per_category_breakdown": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Results are broken down per-model and per-strategy in heatmaps (Figures 2, 3), per attack class in the taxonomy (Classes I-III), and decomposed by WAVS components (Figure 7). Table 3 provides per-model per-strategy acceptance rate changes. Appendices D–G provide additional breakdowns."
    108       },
    109       "failure_cases_discussed": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The paper discusses the 'Backfire Effect' where Class III strategies trigger score decreases (Section 5.1), shows specific failure examples (falcon3 penalizing Cls3SP by -4.07), and Appendix C provides a case study of a vulnerable model (mistral-small) succumbing to the attack and a robust model (gemini-2.5-flash) refusing it."
    113       },
    114       "negative_results_reported": {
    115         "applies": true,
    116         "answer": true,
    117         "justification": "Several negative results are reported: Social engineering attacks frequently backfire (Section 5.1, 5.2); GPT-5 shows near-zero vulnerability across all strategies; qwen3:30b demonstrates resilience despite being a large model; many strategy-model combinations show negative score changes in Table 3."
    118       }
    119     },
    120     "claims_and_evidence": {
    121       "abstract_claims_supported": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The abstract claims 'decision flip rates of up to 86.26% in open-source models' — supported by Table 3 (mistral-small Cls1DRA: 86.26%). Claims about 'reasoning traps in proprietary systems' are supported by Section 5.2's analysis of Gemini-2.5-Pro. The 15 strategies and 13 models are documented."
    125       },
    126       "causal_claims_justified": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The paper's causal claims ('adversarial injections manipulate scores') are justified by the controlled experimental design: the same papers are scored with and without injection, isolating the causal effect of the injection. This is a valid single-variable manipulation design."
    130       },
    131       "generalization_bounded": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Section 8 (Limitations) explicitly bounds generalization: CS-specific dataset may not generalize to humanities/clinical sciences, assumes 'Lazy Reviewer' worst case, proprietary model results are a 'snapshot in time,' and only textual/layout injections tested (not multimodal)."
    135       },
    136       "alternative_explanations_discussed": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper discusses alternative factors: model size does not linearly predict vulnerability (Section 5.1), distillation effects explain GPT-5-Mini vulnerability (Section 5.2), tokenization robustness explains proprietary model resilience (Section 5.2 point 3), and tulu3:8b's 'mode collapse' failure mode is identified."
    140       },
    141       "proxy_outcome_distinction": {
    142         "applies": true,
    143         "answer": true,
    144         "justification": "The WAVS metric (Appendix A) explicitly distinguishes between the proxy (score inflation) and the outcome (decision flips). The three-component decomposition separates Score Sensitivity, Semantic Flip Severity, and Risk Alignment, acknowledging that not all score increases equally represent vulnerability."
    145       }
    146     },
    147     "setup_transparency": {
    148       "model_versions_specified": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Most model versions are specified only by marketing name: 'GPT-5', 'GPT-5-Mini', 'Gemini 2.5 Flash', 'Gemini 2.5 Pro' without API version or snapshot date. The exception is Claude Haiku 4.5 which includes a date (claude-haiku-4-5-20251001). Open-source models list sizes but not commit hashes or specific checkpoints."
    152       },
    153       "prompts_provided": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The system prompt (Listing 2) and user prompt (Listing 3) are provided in full in Appendix C. The 15 adversarial strategy prompts are documented in Table 4 (Appendix B) with mechanism descriptions and truncated but substantial prompt snippets. Listing 1 shows a full injection payload."
    157       },
    158       "hyperparameters_reported": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No temperature, top-p, max tokens, or other inference hyperparameters are reported for any of the 13 models. These settings significantly affect LLM output and vulnerability to adversarial manipulation."
    162       },
    163       "scaffolding_described": {
    164         "applies": false,
    165         "answer": false,
    166         "justification": "No agentic scaffolding is used. The pipeline is a linear processing chain (PDF → injection → MinerU → prompt construction → API call → regex parsing) without agentic loops, tool use, or feedback mechanisms."
    167       },
    168       "data_preprocessing_documented": {
    169         "applies": true,
    170         "answer": true,
    171         "justification": "Section 3.3 documents the preprocessing pipeline: MinerU converts PDFs to layout-preserving Markdown, adversarial prompts are appended in white 1pt font, and the pipeline stages are detailed in Figure 1 and Appendix C. Section 4.1 describes the dataset composition and sourcing."
    172       }
    173     },
    174     "limitations_and_scope": {
    175       "limitations_section_present": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Section 8 is a dedicated 'Limitations' section with substantive discussion spanning four specific limitations: dataset domain specificity, Lazy Reviewer assumption, API snapshot volatility, and exclusion of multimodal attacks."
    179       },
    180       "threats_to_validity_specific": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Section 8 lists specific threats: 'our evaluation is bounded by the size and domain specificity of our dataset; we utilized 200 manuscripts primarily sourced from Computer Science venues'; proprietary models 'are accessed via black-box APIs subject to continuous, unannounced updates'; the Lazy Reviewer assumption 'represents a worst-case security posture.'"
    184       },
    185       "scope_boundaries_stated": {
    186         "applies": true,
    187         "answer": true,
    188         "justification": "Section 8 explicitly states what was NOT tested: other disciplines ('humanities or clinical sciences'), human-in-the-loop scenarios, and 'multi-modal adversarial attacks embedded within scientific figures or charts, which remains an open avenue for future investigation.'"
    189       }
    190     },
    191     "data_integrity": {
    192       "raw_data_available": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "The paper commits to releasing 'our complete experimental framework including code and dataset' (Section 1 Contribution 4) and provides an anonymous repository link. The strip plots in Appendix G show per-paper raw scores, and the full raw data is stated to be part of the release."
    196       },
    197       "data_collection_described": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "Section 4.1 describes data collection: 200 papers from two sources — official conference templates (IEEE, ACL) and ICLR 2025 OpenReview real-world submissions, balanced across accepted (Spotlight/Poster) and rejected manuscripts with specific counts (30 templates, 125 rejected, 30 poster, 15 spotlight)."
    201       },
    202       "recruitment_methods_described": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "Section 4.1 describes paper sourcing: official conference templates serve as vacuous baselines, and real-world papers come from the ICLR 2025 OpenReview track. The rationale for each source is explained (templates test catastrophic failure, real papers test real-world effectiveness)."
    206       },
    207       "data_pipeline_documented": {
    208         "applies": true,
    209         "answer": true,
    210         "justification": "The five-stage pipeline is documented in Section 3.3 and Figure 1: (1) Attack Injection with invisible white-font text, (2) MinerU PDF-to-Markdown conversion, (3) System/User prompt construction, (4) Multi-provider inference loop, (5) Regex JSON parsing with failure logging. Appendix C provides a detailed case study walkthrough."
    211       }
    212     },
    213     "conflicts_of_interest": {
    214       "funding_disclosed": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source is disclosed. Section 9 (Acknowledgement) only mentions 'the use of ChatGPT in improving the presentation and grammar.' No grants, sponsors, or funding agencies are listed."
    218       },
    219       "affiliations_disclosed": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "Author affiliations are listed: BITS Pilani and KIIT University. The authors are academic researchers not affiliated with any of the model providers being evaluated."
    223       },
    224       "funder_independent_of_outcome": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No funding is disclosed, making independence impossible to assess. Since the absence of funding disclosure is itself a transparency issue, this cannot be marked as satisfied."
    228       },
    229       "financial_interests_declared": {
    230         "applies": true,
    231         "answer": false,
    232         "justification": "No competing interests or financial interests statement appears anywhere in the paper."
    233       }
    234     },
    235     "contamination": {
    236       "training_cutoff_stated": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "This paper tests adversarial robustness of LLM reviewers to prompt injection attacks, not model knowledge or capability on a benchmark. The evaluation measures whether injected prompts can manipulate review scores, which is independent of training data contamination."
    240       },
    241       "train_test_overlap_discussed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Same rationale: this is a red-teaming study measuring adversarial manipulation, not model capability on a knowledge benchmark. Train/test overlap is not the relevant threat to validity here."
    245       },
    246       "benchmark_contamination_addressed": {
    247         "applies": false,
    248         "answer": false,
    249         "justification": "Same rationale: the paper evaluates defense robustness against adversarial attacks, not pre-trained model knowledge. Contamination of the review papers in training data would affect baseline scores but not the adversarial manipulation effect being measured."
    250       }
    251     },
    252     "human_studies": {
    253       "pre_registered": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants involved. The study evaluates LLM models on adversarial paper variants."
    257       },
    258       "irb_or_ethics_approval": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants involved."
    262       },
    263       "demographics_reported": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants involved."
    267       },
    268       "inclusion_exclusion_criteria": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants involved."
    272       },
    273       "randomization_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants involved."
    277       },
    278       "blinding_described": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants involved."
    282       },
    283       "attrition_reported": {
    284         "applies": false,
    285         "answer": false,
    286         "justification": "No human participants involved."
    287       }
    288     },
    289     "cost_and_practicality": {
    290       "inference_cost_reported": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No inference costs, API costs, tokens consumed, or latency figures are reported despite running 13 models × (200 or 50) papers × 16 conditions (original + 15 strategies) = tens of thousands of inference calls."
    294       },
    295       "compute_budget_stated": {
    296         "applies": true,
    297         "answer": false,
    298         "justification": "No GPU hours for local model inference (8 models via Ollama), API spending for proprietary models (5 models), or total wall-clock time are reported."
    299       }
    300     },
    301     "experimental_rigor": {
    302       "seed_sensitivity_reported": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No mention of multiple random seeds or stochastic variation analysis. Each (model, paper, strategy) triplet appears to be evaluated once, with no assessment of run-to-run variability."
    306       },
    307       "number_of_runs_stated": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper does not state how many times each inference triplet was executed. Section 3.3 describes iterating through triplets but does not specify whether each is run once or multiple times."
    311       },
    312       "hyperparameter_search_budget": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No hyperparameter search is described. The adversarial prompts are fixed, and no tuning of injection placement, font size, or prompt phrasing variations is reported."
    316       },
    317       "best_config_selection_justified": {
    318         "applies": true,
    319         "answer": true,
    320         "justification": "All 15 strategies are reported across all 13 models — there is no selective reporting of best configurations. The full heatmaps (Figures 2, 3), Table 3, and Appendix G present complete results."
    321       },
    322       "multiple_comparison_correction": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The paper compares 13 models × 15 strategies = 195 conditions with many implicit comparisons, but no statistical tests are performed at all, let alone multiple comparison corrections."
    326       },
    327       "self_comparison_bias_addressed": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The authors designed the 15 attack strategies and evaluate their effectiveness. No acknowledgment of author-evaluation bias or independent validation by third parties is provided."
    331       },
    332       "compute_budget_vs_performance": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "No comparison of vulnerability profiles as a function of compute budget. Open-source models (8B-32B) are compared with proprietary models of unknown size, but no compute-normalized analysis is performed."
    336       },
    337       "benchmark_construct_validity": {
    338         "applies": true,
    339         "answer": false,
    340         "justification": "The paper does not discuss whether its evaluation setup (LLM scoring papers on a 0-35 rubric) actually measures real-world vulnerability of the review process. The Lazy Reviewer assumption is stated but not validated against actual reviewer behavior."
    341       },
    342       "scaffold_confound_addressed": {
    343         "applies": true,
    344         "answer": true,
    345         "justification": "The same evaluation pipeline (MinerU conversion → system prompt → user prompt with injection) is used across all models, controlling for scaffold effects. The only variation is the inference provider (Ollama vs. API), which is inherent to the model access method."
    346       }
    347     },
    348     "data_leakage": {
    349       "temporal_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Not discussed. The ICLR 2025 papers could have appeared in the training data of models like GPT-5 or Gemini 2.5 Pro. If a model has seen a paper's reviews or acceptance status during training, baseline scores would be confounded."
    353       },
    354       "feature_leakage_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Not discussed. The evaluation provides full paper text to the model, which is realistic, but the paper does not consider whether template papers are trivially identifiable to models that have seen conference templates in training."
    358       },
    359       "non_independence_addressed": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "Not discussed. Multiple papers from the same venue (ICLR 2025) may share structural similarities. The 30 official conference templates likely share formatting that could bias model responses in correlated ways."
    363       },
    364       "leakage_detection_method": {
    365         "applies": true,
    366         "answer": false,
    367         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, temporal splits, or decontamination measures are mentioned."
    368       }
    369     }
    370   },
    371   "claims": [
    372     {
    373       "claim": "Token-level obfuscation strategies (Class I) are the most universally potent attack vectors, with Maximum Mark Magyk (Cls1MSM) achieving +13.95 score inflation on mistral-small:22b and Disguise and Reconstruction (Cls1DRA) achieving +13.87 on the same model.",
    374       "evidence": "Figure 2 heatmap shows these values. Section 5.1 discusses the 'Dominance of Token-Level Obfuscation' with specific score increases per model.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Adversarial injections can achieve decision flip rates of up to 86.26% in open-source models (mistral-small with Cls1DRA strategy).",
    379       "evidence": "Table 3 reports percentage increase in acceptance rates. Mistral-small Cls1DRA shows 86.26% increase. Multiple open-source models show >50% flip rates.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Model vulnerability does not correlate linearly with parameter count — qwen3:30b is resilient while gemma3:27b is catastrophically vulnerable.",
    384       "evidence": "Section 5.1 discusses 'Scale-Independent Vulnerability'. Figure 2 shows qwen3:30b scores between -0.82 and +2.254 while gemma3:27b reaches +13.75. Figure 5 shows their relative vulnerability rates.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "GPT-5 exhibits near-perfect robustness with negligible score inflation across all attack vectors and zero decision flips.",
    389       "evidence": "Figure 3 shows GPT-5 as 'almost entirely dark blue'. Table 3 shows 0.00% flip rate across all 15 strategies for GPT-5. Figure 5 places GPT-5 at 0.0 relative vulnerability.",
    390       "supported": "strong"
    391     },
    392     {
    393       "claim": "A 'safety tax' exists from model distillation — GPT-5-Mini is significantly more vulnerable than GPT-5, with susceptibility to Logic Decipherer (Cls2LDA, +1.84) and Evidence-Based Persuasion (Cls3EBP, +1.60).",
    394       "evidence": "Section 5.2 compares GPT-5 vs GPT-5-Mini. Figure 3 heatmap shows the contrast. However, Table 3 shows GPT-5-Mini has 0.00% flip rate across all strategies, suggesting the score inflation doesn't translate to actual decision flips.",
    395       "supported": "weak"
    396     },
    397     {
    398       "claim": "Social engineering attacks (Class III) frequently trigger the 'Backfire Effect', causing negative score adjustments instead of inflation.",
    399       "evidence": "Section 5.1 reports falcon3 penalizing Social Proof (Cls3SP) by -4.07, gemma3 penalizing Expert Endorsement by -3.95. Section 5.2 reports GPT-5 (-0.96), GPT-5-Mini (-0.89), and Claude-Haiku (-0.66) all penalizing Cls3SP.",
    400       "supported": "strong"
    401     },
    402     {
    403       "claim": "Advanced reasoning capabilities can become a vulnerability vector — Gemini-2.5-Pro shows the highest proprietary vulnerability spike against Symbolic Masking and Context Redirection (Cls1SMCR, +2.54).",
    404       "evidence": "Section 5.2 discusses the 'Reasoning Trap' hypothesis. Figure 3 shows the +2.54 spike. However, Table 3 shows Gemini-2.5-Pro has limited actual decision flips (13.04% for Cls1SMCR).",
    405       "supported": "weak"
    406     }
    407   ],
    408   "red_flags": [
    409     {
    410       "flag": "No uncertainty quantification",
    411       "detail": "All results are single-run point estimates with no confidence intervals, error bars, standard deviations, or repeated trials. LLM outputs are stochastic — a single inference per (model, paper, strategy) triplet means reported scores could be noise rather than signal, especially for small effect sizes in proprietary models."
    412     },
    413     {
    414       "flag": "No statistical tests for comparative claims",
    415       "detail": "Claims like 'qwen3:30b demonstrated remarkable resilience' vs. 'gemma3:27b was catastrophically vulnerable' and the entire Safety Gap / Reasoning Trap narrative rest on comparing raw numbers without any significance testing. With 200 papers, formal tests would be feasible."
    416     },
    417     {
    418       "flag": "Missing hyperparameters for inference",
    419       "detail": "Temperature, top-p, and max_tokens are not reported for any model. Temperature alone can dramatically affect LLM susceptibility to adversarial manipulation. Results may not be reproducible without these settings."
    420     },
    421     {
    422       "flag": "Reduced proprietary model dataset without representativeness analysis",
    423       "detail": "Closed-source models are evaluated on only 50 of 200 papers 'to accommodate cost and rate limits' (Section 4.1). No analysis of whether this subset is representative of the full dataset, and no stratified sampling description. This makes open-source vs. closed-source comparisons asymmetric."
    424     },
    425     {
    426       "flag": "Safety tax claim not supported by decision flips",
    427       "detail": "The 'safety tax' claim about GPT-5-Mini is based on score inflation (+1.84), but Table 3 shows GPT-5-Mini has 0.00% decision flip rate across ALL strategies — identical to GPT-5. The framing overstates the practical impact of small score increases that never change decisions."
    428     },
    429     {
    430       "flag": "Adversarial strategy design not independently validated",
    431       "detail": "The authors both designed the 15 attack strategies and evaluated their effectiveness. No independent red-team, external validation, or comparison against independently designed attacks is provided. Author-evaluation bias (Lucic et al. 2018) could inflate apparent effectiveness."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "Not what you've signed up for: Compromising real-world llm-integrated applications with indirect prompt injection",
    437       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    438       "year": 2023,
    439       "relevance": "Foundational work formalizing indirect prompt injection against LLMs processing external content, directly relevant to the survey's coverage of prompt injection attacks."
    440     },
    441     {
    442       "title": "Universal and transferable adversarial attacks on aligned language models",
    443       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    444       "year": 2023,
    445       "arxiv_id": "2307.15043",
    446       "relevance": "Demonstrates universal adversarial suffixes for jailbreaking aligned LLMs, a key attack technique in the LLM safety landscape."
    447     },
    448     {
    449       "title": "JailbreakBench: an open robustness benchmark for jailbreaking large language models",
    450       "authors": ["Patrick Chao", "Edoardo Debenedetti", "Alexander Robey"],
    451       "year": 2024,
    452       "relevance": "Standardized benchmark for evaluating LLM jailbreak robustness, relevant to the survey's coverage of adversarial evaluation methodologies."
    453     },
    454     {
    455       "title": "BadJudge: Backdoor vulnerabilities of LLM-as-a-judge",
    456       "authors": ["Terry Tong", "Fei Wang", "Zhe Zhao", "Muhao Chen"],
    457       "year": 2025,
    458       "arxiv_id": "2503.00596",
    459       "relevance": "Directly relevant to surveying vulnerabilities in LLM-as-a-judge systems, examining backdoor attack vectors."
    460     },
    461     {
    462       "title": "How johnny can persuade LLMs to jailbreak them: Rethinking persuasion to challenge AI safety by humanizing LLMs",
    463       "authors": ["Yi Zeng", "Hongpeng Lin", "Jingwen Zhang", "Diyi Yang", "Ruoxi Jia", "Weiyan Shi"],
    464       "year": 2024,
    465       "relevance": "Demonstrates that persuasion techniques can jailbreak LLMs, relevant to the survey's coverage of social engineering attacks against AI systems."
    466     },
    467     {
    468       "title": "Play guessing game with LLM: Indirect jailbreak attack with implicit clues",
    469       "authors": ["Zhiyuan Chang", "Mingyang Li", "Yi Liu", "Junjie Wang", "Qing Wang", "Yang Liu"],
    470       "year": 2024,
    471       "relevance": "Token-level obfuscation jailbreak technique relevant to the survey's adversarial attack taxonomy."
    472     },
    473     {
    474       "title": "Investigating the Vulnerability of LLM-as-a-Judge Architectures to Prompt-Injection Attacks",
    475       "authors": ["Narek Maloyan", "Bislan Ashinov", "Dmitry Namiot"],
    476       "year": 2025,
    477       "relevance": "Directly investigates prompt injection against LLM-as-a-judge, closely related to the survey's scope on adversarial manipulation of AI evaluation systems."
    478     },
    479     {
    480       "title": "Prompt injection attacks on llm generated reviews of scientific publications",
    481       "authors": ["Janis Keuper"],
    482       "year": 2025,
    483       "arxiv_id": "2509.10248",
    484       "relevance": "Prior work on prompt injection against LLM-based scientific reviewers using naive direct injection, which this paper explicitly extends with more sophisticated attacks."
    485     },
    486     {
    487       "title": "A wolf in sheep's clothing: Generalized nested jailbreak prompts can fool large language models easily",
    488       "authors": ["Peng Ding", "Jun Kuang", "Dan Ma"],
    489       "year": 2024,
    490       "relevance": "Nested jailbreak prompts technique directly adapted in this paper's Scenario Nesting (Cls2SN) attack strategy."
    491     },
    492     {
    493       "title": "FlipAttack: Jailbreak LLMs via flipping",
    494       "authors": ["Yue Liu", "Xiaoxin He", "Miao Xiong"],
    495       "year": 2025,
    496       "relevance": "Flip-based jailbreak technique adapted as Cls2FA (Flip Attack) in this paper's taxonomy."
    497     },
    498     {
    499       "title": "Making them ask and answer: jailbreaking large language models in few queries via disguise and reconstruction",
    500       "authors": ["Tong Liu", "Yingjie Zhang", "Zhe Zhao"],
    501       "year": 2024,
    502       "relevance": "Disguise and reconstruction jailbreak technique adapted as Cls1DRA, the most effective open-source attack in this paper."
    503     },
    504     {
    505       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    506       "authors": ["DeepSeek-AI"],
    507       "year": 2025,
    508       "arxiv_id": "2501.12948",
    509       "relevance": "One of the evaluated reasoning models, relevant to the survey's coverage of LLM capabilities and robustness."
    510     }
    511   ],
    512   "engagement_factors": {
    513     "practical_relevance": {
    514       "score": 2,
    515       "justification": "Directly relevant to conference organizers and reviewers deploying AI review tools, but the attacks require PDF manipulation expertise and the defenses are high-level recommendations."
    516     },
    517     "surprise_contrarian": {
    518       "score": 2,
    519       "justification": "The 'reasoning trap' finding — that more capable models can be MORE vulnerable to certain attacks — challenges the assumption that greater capability improves safety."
    520     },
    521     "fear_safety": {
    522       "score": 3,
    523       "justification": "Demonstrates that invisible PDF injections can flip Reject-to-Accept decisions at rates of up to 86% for open-source models, directly threatening the integrity of scientific peer review at major conferences."
    524     },
    525     "drama_conflict": {
    526       "score": 2,
    527       "justification": "The 'Lazy Reviewer' framing, ICLR/NeurIPS as targets, and the 'meritocratic collapse' narrative create controversy around AI use in peer review."
    528     },
    529     "demo_ability": {
    530       "score": 1,
    531       "justification": "An anonymous repository link is provided, but reproducing the results requires setting up multiple LLM APIs and local model infrastructure."
    532     },
    533     "brand_recognition": {
    534       "score": 2,
    535       "justification": "Tests well-known models (GPT-5, Claude, Gemini, DeepSeek) and references major conferences (NeurIPS, ICLR, AAAI)."
    536     }
    537   }
    538 }

Impressum · Datenschutz