ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30345B)


      1 {
      2   "paper": {
      3     "title": "PR-Attack: Coordinated Prompt-RAG Attacks on Retrieval-Augmented Generation in Large Language Models via Bilevel Optimization",
      4     "authors": [
      5       "Yang Jiao",
      6       "Xiaodong Wang",
      7       "Kai Yang"
      8     ],
      9     "year": 2025,
     10     "venue": "SIGIR '25 (48th International ACM SIGIR Conference on Research and Development in Information Retrieval)",
     11     "arxiv_id": "2504.07717",
     12     "doi": "10.1145/3726302.3730058"
     13   },
     14   "scan_version": 3,
     15   "active_modules": [
     16     "experimental_rigor",
     17     "data_leakage"
     18   ],
     19   "methodology_tags": [
     20     "benchmark-eval",
     21     "theoretical"
     22   ],
     23   "key_findings": "PR-Attack jointly poisons both the prompt (via a backdoor trigger in soft prompts) and the knowledge database (via optimized poisoned texts), formulated as a bilevel optimization problem. It achieves 90–100% attack success rate across 6 LLMs and 3 QA datasets with only a single poisoned text per target question. When the trigger is inactive, the system achieves higher accuracy than naive RAG, demonstrating stealth. All tested models are small open-source LLMs (1B–7B parameters).",
     24   "checklist": {
     25     "artifacts": {
     26       "code_released": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No code repository URL is provided anywhere in the paper. No GitHub, Zenodo, or other archive link is mentioned."
     30       },
     31       "data_released": {
     32         "applies": true,
     33         "answer": true,
     34         "justification": "The paper uses three publicly available QA datasets: Natural Questions (NQ), HotpotQA, and MS-MARCO, following the same setup as PoisonedRAG [104]. These are standard public benchmarks."
     35       },
     36       "environment_specified": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "No environment specifications are provided — no requirements.txt, Dockerfile, library versions, or hardware details. Only model names and sizes are listed."
     40       },
     41       "reproduction_instructions": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "No reproduction instructions are included. Algorithm 1 describes the optimization procedure at a mathematical level, but no runnable scripts or step-by-step instructions are provided."
     45       }
     46     },
     47     "statistical_methodology": {
     48       "confidence_intervals_or_error_bars": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Figures 2 and 3 show error bars (standard deviation) for the average performance across LLMs for both ASR and ACC metrics."
     52       },
     53       "significance_tests": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No statistical significance tests are used. Claims that PR-attack 'outperforms' baselines are based solely on comparing point estimates in Tables 1 and 2 without any p-values, t-tests, or other tests."
     57       },
     58       "effect_sizes_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "Tables 1 and 2 provide absolute ASR and ACC percentages for all methods across all settings, giving sufficient baseline context to assess the magnitude of improvement (e.g., PR-attack 93% vs. PoisonedRAG 62% on NQ with Vicuna)."
     62       },
     63       "sample_size_justified": {
     64         "applies": true,
     65         "answer": false,
     66         "justification": "The number of target questions used for evaluation is not stated or justified. The paper says it follows PoisonedRAG's setup but does not specify the sample size or provide any justification."
     67       },
     68       "variance_reported": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Figures 2 and 3 report standard deviations across LLMs. The paper states PR-attack 'exhibits a low standard deviation, highlighting both its effectiveness and broad applicability.'"
     72       }
     73     },
     74     "evaluation_design": {
     75       "baselines_included": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Six baseline methods are compared: GCG Attack, Corpus Poisoning, Disinformation Attack, Prompt Poisoning, GGPP, and PoisonedRAG (Table 1). For stealth evaluation, comparison against without-RAG and naive RAG baselines is provided (Table 2)."
     79       },
     80       "baselines_contemporary": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Baselines include recent methods: PoisonedRAG (2024), GGPP (2024), GCG Attack (2023), and Corpus Poisoning (2023). These represent the state of the art in RAG attacks."
     84       },
     85       "ablation_study": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The system has two key components (poisoned texts and soft prompt/trigger) but no ablation study removes one component while keeping the other within PR-attack's own framework. The comparisons with prompt-only (Prompt Poisoning) and RAG-only (PoisonedRAG) methods are separate implementations, not controlled ablations."
     89       },
     90       "multiple_metrics": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Two metrics are used: Attack Success Rate (ASR) for attack effectiveness (Table 1) and Accuracy (ACC) for stealth evaluation when the trigger is not activated (Table 2)."
     94       },
     95       "human_evaluation": {
     96         "applies": true,
     97         "answer": false,
     98         "justification": "No human evaluation is included. Evaluation is entirely automated using substring matching for ASR, following PoisonedRAG's setup. The paper notes this 'yields ASRs comparable to those obtained through human evaluation' per [104] but does not perform its own human evaluation."
     99       },
    100       "held_out_test_set": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "The soft prompt and poisoned texts are optimized for specific target questions (Eq. 2), and ASR is evaluated on those same target questions. There is no separation between optimization targets and evaluation targets."
    104       },
    105       "per_category_breakdown": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Results are broken down per LLM (6 models) and per dataset (NQ, HotpotQA, MS-MARCO) in Tables 1 and 2, and parameter sensitivity is shown per LLM in Figures 4 and 5."
    109       },
    110       "failure_cases_discussed": {
    111         "applies": true,
    112         "answer": false,
    113         "justification": "No failure cases are discussed. The paper does not analyze when or why PR-attack fails, even though ASR is not 100% in all settings."
    114       },
    115       "negative_results_reported": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "No negative results are reported. Every experiment shows PR-attack achieving the best performance. No configurations that failed or approaches that were tried and abandoned are mentioned."
    119       }
    120     },
    121     "claims_and_evidence": {
    122       "abstract_claims_supported": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The abstract claims 'high attack success rate even with a limited number of poisoned texts and significantly improved stealth.' Table 1 confirms 90%+ ASR with a single poisoned text, and Table 2 confirms ACC exceeding naive RAG."
    126       },
    127       "causal_claims_justified": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper claims joint optimization of prompt and poisoned texts 'enables the proposed method to achieve superior performance' (Sec. 4.2), but there is no controlled ablation within PR-attack's own framework to isolate the contribution of each component. Comparisons with separate methods (Prompt Poisoning, PoisonedRAG) use different implementations and cannot establish causality."
    131       },
    132       "generalization_bounded": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The title claims attacks on 'Large Language Models' generically, but all six tested models are small open-source LLMs (1B–7B parameters). No commercial API models (GPT-4, Claude) or larger models are tested. The paper also uses only one retriever (Contriever) but claims generality about RAG systems."
    136       },
    137       "alternative_explanations_discussed": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "No alternative explanations are discussed. For example, the paper does not consider whether smaller models are inherently more vulnerable, whether Contriever-specific properties enable the attack, or why ACC exceeds naive RAG when the trigger is inactive."
    141       },
    142       "proxy_outcome_distinction": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "ASR directly measures what is claimed (whether the LLM generates the target answer when the trigger is activated). ACC directly measures correctness when the trigger is inactive. The measurements match the stated claims without proxy gaps."
    146       }
    147     },
    148     "setup_transparency": {
    149       "model_versions_specified": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Models are listed as 'Vicuna 7B', 'LLaMA-2 7B', 'LLaMA-3.2 1B', 'GPT-J 6B', 'Phi-3.5 3.8B', 'Gemma-2 2B' with paper citations, but exact versions are not specified (e.g., Vicuna 1.3 vs 1.5, LLaMA-2 base vs chat). No snapshot dates or version identifiers are given."
    153       },
    154       "prompts_provided": {
    155         "applies": true,
    156         "answer": false,
    157         "justification": "Figure 1 shows a prompt template ('Question: ... Contexts: ... Please generate a response...') and the trigger word 'cf' is stated. However, the soft prompt consists of 15 learned embedding tokens that cannot be represented as text and are not provided. The actual prompts sent to the models cannot be fully reconstructed."
    158       },
    159       "hyperparameters_reported": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Key hyperparameters are stated in Sec. 4.1: b=20 tokens per poisoned text, n=15 trainable soft prompt tokens, k=5 retrieved texts, temperature=0.5, trigger word 'cf'. Step sizes and smoothing parameters for the optimization are not specified."
    163       },
    164       "scaffolding_described": {
    165         "applies": false,
    166         "answer": false,
    167         "justification": "No agentic scaffolding is used. The approach directly optimizes prompts and poisoned texts for a standard RAG pipeline without any agentic workflow."
    168       },
    169       "data_preprocessing_documented": {
    170         "applies": true,
    171         "answer": false,
    172         "justification": "The paper says target questions and answers are 'generated according to the procedure described in [104]' without reproducing the details. The number of target questions, how they were selected, and how knowledge databases were set up are not documented in this paper."
    173       }
    174     },
    175     "limitations_and_scope": {
    176       "limitations_section_present": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "There is no dedicated limitations, threats to validity, or similar section. The paper contains Introduction, Related Work, Method, Experiment, and Conclusion with no limitations discussion."
    180       },
    181       "threats_to_validity_specific": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No threats to validity are discussed anywhere in the paper. There is no consideration of potential confounds or methodological weaknesses."
    185       },
    186       "scope_boundaries_stated": {
    187         "applies": true,
    188         "answer": false,
    189         "justification": "No explicit scope boundaries are stated. The paper does not acknowledge what was NOT tested (e.g., larger models, other retrievers, defense mechanisms, real-world deployment conditions)."
    190       }
    191     },
    192     "data_integrity": {
    193       "raw_data_available": {
    194         "applies": true,
    195         "answer": false,
    196         "justification": "No raw experimental data or outputs are released. Only aggregated results in tables and figures are provided."
    197       },
    198       "data_collection_described": {
    199         "applies": true,
    200         "answer": false,
    201         "justification": "Data collection is delegated to prior work: 'the target questions and answers are generated according to the procedure described in [104].' The paper does not describe how many target questions were used, how knowledge databases were constructed, or what the data characteristics are."
    202       },
    203       "recruitment_methods_described": {
    204         "applies": false,
    205         "answer": false,
    206         "justification": "No human participants. Data sources are standard QA benchmarks (NQ, HotpotQA, MS-MARCO)."
    207       },
    208       "data_pipeline_documented": {
    209         "applies": true,
    210         "answer": false,
    211         "justification": "The optimization algorithm is described (Algorithm 1), but the full pipeline from raw datasets to final evaluation numbers is not documented. How models were loaded, how retrieval was implemented, and how ASR was computed via substring matching are not detailed."
    212       }
    213     },
    214     "conflicts_of_interest": {
    215       "funding_disclosed": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "The Acknowledgments section lists funding: NSFC grants 12371519 and 61771013, Asiainfo Technologies, the Fundamental Research Funds for the Central Universities, Shanghai Jiading District funds, and Tongji University PhD overseas research funding."
    219       },
    220       "affiliations_disclosed": {
    221         "applies": true,
    222         "answer": true,
    223         "justification": "Author affiliations are clearly listed: Tongji University (Yang Jiao, Kai Yang) and Columbia University (Xiaodong Wang). These institutions do not produce the evaluated models."
    224       },
    225       "funder_independent_of_outcome": {
    226         "applies": true,
    227         "answer": true,
    228         "justification": "Funders include NSFC (Chinese government), Asiainfo Technologies, and university research funds. None of the funders produce the LLMs being evaluated (Vicuna, LLaMA, GPT-J, etc.), so they appear independent of the outcome."
    229       },
    230       "financial_interests_declared": {
    231         "applies": true,
    232         "answer": false,
    233         "justification": "No competing interests or financial interests statement is included in the paper."
    234       }
    235     },
    236     "contamination": {
    237       "training_cutoff_stated": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "No training data cutoff dates are stated for any of the six models used. This is relevant because the QA benchmarks (NQ 2019, HotpotQA 2018, MS-MARCO 2016) predate all models and could be in training data."
    241       },
    242       "train_test_overlap_discussed": {
    243         "applies": true,
    244         "answer": false,
    245         "justification": "No discussion of whether the QA benchmark questions and answers appeared in the models' training data. Since all benchmarks predate all models by years, significant overlap is likely."
    246       },
    247       "benchmark_contamination_addressed": {
    248         "applies": true,
    249         "answer": false,
    250         "justification": "NQ (2019), HotpotQA (2018), and MS-MARCO (2016) were all published years before any of the tested models were trained. Contamination risk is not addressed. This is especially relevant for the ACC metric — if models have memorized correct answers, ACC comparisons are confounded."
    251       }
    252     },
    253     "human_studies": {
    254       "pre_registered": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "irb_or_ethics_approval": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "demographics_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "inclusion_exclusion_criteria": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "randomization_described": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       },
    279       "blinding_described": {
    280         "applies": false,
    281         "answer": false,
    282         "justification": "No human participants in this study."
    283       },
    284       "attrition_reported": {
    285         "applies": false,
    286         "answer": false,
    287         "justification": "No human participants in this study."
    288       }
    289     },
    290     "cost_and_practicality": {
    291       "inference_cost_reported": {
    292         "applies": true,
    293         "answer": false,
    294         "justification": "No inference cost, latency, or wall-clock time is reported. The bilevel optimization requires iterative gradient estimation and LLM forward passes, but no cost figures are provided."
    295       },
    296       "compute_budget_stated": {
    297         "applies": true,
    298         "answer": false,
    299         "justification": "Theoretical complexity is analyzed (Eq. 12) but no actual compute budget is stated — no GPU hours, hardware specs, training/optimization time, or total compute cost."
    300       }
    301     },
    302     "experimental_rigor": {
    303       "seed_sensitivity_reported": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No results across multiple random seeds are reported. The standard deviations in Figures 2 and 3 are across different LLMs, not across seeds for a given configuration."
    307       },
    308       "number_of_runs_stated": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged over multiple trials."
    312       },
    313       "hyperparameter_search_budget": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No hyperparameter search budget is reported. Parameters b=20, n=15, k=5 are stated as defaults without explaining how they were selected. Sensitivity analyses (Figures 4, 5) are provided post-hoc but no search budget is given."
    317       },
    318       "best_config_selection_justified": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The default configuration (b=20, n=15) is used without justification. Figures 4 and 5 show sensitivity analysis, but how the default values were chosen is not explained."
    322       },
    323       "multiple_comparison_correction": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "The paper compares 7 methods × 6 LLMs × 3 datasets (126 comparisons in Table 1 alone) with no statistical tests and therefore no multiple comparison correction."
    327       },
    328       "self_comparison_bias_addressed": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The authors implement all baselines and their own method. No discussion of self-comparison bias is provided. No independent evaluation or re-implementation verification is mentioned."
    332       },
    333       "compute_budget_vs_performance": {
    334         "applies": true,
    335         "answer": false,
    336         "justification": "PR-attack requires iterative bilevel optimization while some baselines (e.g., Disinformation Attack) are much cheaper. No comparison at matched compute budgets is provided."
    337       },
    338       "benchmark_construct_validity": {
    339         "applies": true,
    340         "answer": false,
    341         "justification": "The paper uses NQ, HotpotQA, and MS-MARCO without discussing whether these QA benchmarks are appropriate for measuring RAG attack effectiveness in real-world settings. No construct validity analysis is provided."
    342       },
    343       "scaffold_confound_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "No scaffolding is involved. The attack targets the standard RAG pipeline directly (retriever + LLM) without agentic scaffolding."
    347       }
    348     },
    349     "data_leakage": {
    350       "temporal_leakage_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "Not addressed. NQ (2019), HotpotQA (2018), and MS-MARCO (2016) predate all models by years. Models likely encountered these QA pairs during training, which could inflate ACC scores and confound attack evaluations."
    354       },
    355       "feature_leakage_addressed": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "Not addressed. The evaluation provides top-5 retrieved texts as context. Whether the retrieval setup leaks answer information differently across methods is not considered."
    359       },
    360       "non_independence_addressed": {
    361         "applies": true,
    362         "answer": false,
    363         "justification": "Not addressed. No discussion of whether target questions share structural similarities or whether results on related questions could be non-independent."
    364       },
    365       "leakage_detection_method": {
    366         "applies": true,
    367         "answer": false,
    368         "justification": "No leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines are applied."
    369       }
    370     }
    371   },
    372   "claims": [
    373     {
    374       "claim": "PR-attack achieves at least 90% ASR across all LLMs and datasets with only a single poisoned text per target question.",
    375       "evidence": "Table 1 shows ASR ranging from 91% to 100% across 6 LLMs × 3 datasets. The lowest is 91% (LLaMA-2 on NQ).",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "PR-attack exhibits remarkable stealthiness, achieving higher accuracy than naive RAG when the trigger is not activated.",
    380       "evidence": "Table 2 shows PR-attack ACC consistently exceeds naive RAG ACC across all settings (e.g., Gemma-2 on NQ: 95% vs 67%).",
    381       "supported": "weak"
    382     },
    383     {
    384       "claim": "PR-attack outperforms all state-of-the-art RAG attack methods.",
    385       "evidence": "Table 1 shows PR-attack achieves the highest ASR in all 18 settings (6 LLMs × 3 datasets), often by 10–30 percentage points.",
    386       "supported": "moderate"
    387     },
    388     {
    389       "claim": "PR-attack demonstrates broad applicability across various LLMs.",
    390       "evidence": "Tested on 6 LLMs (Vicuna, LLaMA-2, GPT-J, Phi-3.5, Gemma-2, LLaMA-3.2). Figures 2 and 3 show low standard deviation across LLMs.",
    391       "supported": "weak"
    392     },
    393     {
    394       "claim": "PR-attack is not sensitive to hyperparameters b (poisoned text length) and n (soft prompt length).",
    395       "evidence": "Figures 4 and 5 show ASR remains comparable across b ∈ {20,30,40,50,60} and n ∈ {10,15,20,25,30}.",
    396       "supported": "moderate"
    397     },
    398     {
    399       "claim": "The bilevel optimization framework provides theoretical complexity guarantees for the attack generation process.",
    400       "evidence": "Section 3.4 derives the overall complexity as O((B₁(K log K + (c₁+1)Mbd) + B₂Mnc₂)T) in Eq. (12).",
    401       "supported": "strong"
    402     }
    403   ],
    404   "red_flags": [
    405     {
    406       "flag": "ACC exceeds clean RAG baseline",
    407       "detail": "Table 2 shows PR-attack (trigger inactive) achieving HIGHER accuracy than naive RAG across all settings — sometimes by large margins (e.g., Gemma-2 NQ: 95% vs 67%). This is unexplained and suspicious: how does a system with poisoned knowledge and an attack prompt outperform a clean system? The soft prompt may be acting as beneficial prompt tuning, but this confound is not discussed."
    408     },
    409     {
    410       "flag": "Only small open-source models tested",
    411       "detail": "All six models are small (1B–7B) open-source LLMs. No commercial API models (GPT-4, Claude, Gemini) or larger models (70B+) are tested, yet the title and claims reference 'Large Language Models' generically. Smaller models may be more vulnerable to prompt manipulation."
    412     },
    413     {
    414       "flag": "No significance tests across 126+ comparisons",
    415       "detail": "Claims of superiority are based on comparing raw percentages in Table 1 (7 methods × 6 LLMs × 3 datasets) without any statistical testing. The differences could partly reflect noise, especially since the number of target questions is not even stated."
    416     },
    417     {
    418       "flag": "No failure analysis",
    419       "detail": "Despite ASR being below 100% in many settings, no failure cases are analyzed. What types of questions resist the attack? Which poisoned texts fail to be retrieved? This analysis is absent."
    420     },
    421     {
    422       "flag": "No limitations section",
    423       "detail": "The paper contains no limitations, threats to validity, or ethics discussion. For a paper proposing a novel attack on AI systems, this omission is notable."
    424     },
    425     {
    426       "flag": "Single retriever tested",
    427       "detail": "Only Contriever is used as the retriever. RAG systems use diverse retrievers (BM25, DPR, ColBERT, etc.), and attack effectiveness may vary significantly across retrievers. Yet the paper claims generality about RAG systems."
    428     },
    429     {
    430       "flag": "Evaluation on same questions used for optimization",
    431       "detail": "The soft prompt and poisoned texts are optimized specifically for the target questions, and ASR is measured on those same questions. There is no held-out evaluation to test generalization beyond the optimized targets."
    432     }
    433   ],
    434   "cited_papers": [
    435     {
    436       "title": "PoisonedRAG: Knowledge Poisoning Attacks to Retrieval-Augmented Generation of Large Language Models",
    437       "authors": ["Wei Zou", "Runpeng Geng", "Binghui Wang", "Jinyuan Jia"],
    438       "year": 2024,
    439       "arxiv_id": "2402.07867",
    440       "relevance": "Primary baseline and experimental setup source for knowledge poisoning attacks on RAG-based LLMs."
    441     },
    442     {
    443       "title": "Prompt Perturbation in Retrieval-Augmented Generation Based Large Language Models",
    444       "authors": ["Zhibo Hu", "Chen Wang", "Yanfeng Shu", "Hye-Young Paik", "Liming Zhu"],
    445       "year": 2024,
    446       "relevance": "GGPP method — key baseline for prompt-based attacks on RAG systems."
    447     },
    448     {
    449       "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models",
    450       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J. Zico Kolter", "Matt Fredrikson"],
    451       "year": 2023,
    452       "arxiv_id": "2307.15043",
    453       "relevance": "GCG Attack — foundational adversarial attack method on aligned LLMs, used as baseline."
    454     },
    455     {
    456       "title": "PPT: Backdoor Attacks on Pre-trained Models via Poisoned Prompt Tuning",
    457       "authors": ["Wei Du", "Yichun Zhao", "Boqun Li", "Gongshen Liu", "Shilin Wang"],
    458       "year": 2022,
    459       "relevance": "Core inspiration for the backdoor trigger mechanism in soft prompt tuning used by PR-Attack."
    460     },
    461     {
    462       "title": "BadPrompt: Backdoor Attacks on Continuous Prompts",
    463       "authors": ["Xiangrui Cai", "Haidong Xu", "Sihan Xu", "Ying Zhang"],
    464       "year": 2022,
    465       "relevance": "Backdoor attacks on continuous prompts for pre-trained language models."
    466     },
    467     {
    468       "title": "PoisonPrompt: Backdoor Attack on Prompt-based Large Language Models",
    469       "authors": ["Hongwei Yao", "Jian Lou", "Zhan Qin"],
    470       "year": 2024,
    471       "relevance": "Backdoor attacks specifically targeting prompt-based LLMs, closely related attack paradigm."
    472     },
    473     {
    474       "title": "Prompt Injection Attack against LLM-integrated Applications",
    475       "authors": ["Yi Liu", "Gelei Deng", "Yuekang Li"],
    476       "year": 2023,
    477       "arxiv_id": "2306.05499",
    478       "relevance": "Prompt injection attacks on LLM-integrated applications, used as Prompt Poisoning baseline."
    479     },
    480     {
    481       "title": "A Survey on Large Language Model (LLM) Security and Privacy: The Good, the Bad, and the Ugly",
    482       "authors": ["Yifan Yao", "Jinhao Duan", "Kaidi Xu"],
    483       "year": 2024,
    484       "relevance": "Comprehensive survey on LLM security and privacy threats including prompt hacking and adversarial attacks."
    485     },
    486     {
    487       "title": "Poisoning Retrieval Corpora by Injecting Adversarial Passages",
    488       "authors": ["Zexuan Zhong", "Ziqing Huang", "Alexander Wettig", "Danqi Chen"],
    489       "year": 2023,
    490       "relevance": "Corpus poisoning attack on retrieval systems, used as baseline for adversarial passage injection."
    491     },
    492     {
    493       "title": "On the Risk of Misinformation Pollution with Large Language Models",
    494       "authors": ["Yikang Pan", "Liangming Pan", "Wenhu Chen", "Preslav Nakov", "Min-Yen Kan", "William Wang"],
    495       "year": 2023,
    496       "relevance": "Disinformation attack method using LLMs, used as baseline for misinformation generation."
    497     },
    498     {
    499       "title": "Security and Privacy Challenges of Large Language Models: A Survey",
    500       "authors": ["Badhan Chandra Das", "M. Hadi Amini", "Yanzhao Wu"],
    501       "year": 2024,
    502       "arxiv_id": "2402.00888",
    503       "relevance": "Survey covering security attacks on LLMs including prompt injection and data poisoning categories."
    504     },
    505     {
    506       "title": "Poisoning Web-Scale Training Datasets is Practical",
    507       "authors": ["Nicholas Carlini", "Matthew Jagielski"],
    508       "year": 2024,
    509       "relevance": "Demonstrates practical poisoning of web-scale datasets including Wikipedia, directly relevant to the attack vector assumed in PR-Attack."
    510     }
    511   ],
    512   "engagement_factors": {
    513     "practical_relevance": {
    514       "score": 1,
    515       "justification": "Security researchers could use the framework to test RAG system defenses, but no code is released and the technique requires bilevel optimization expertise to implement."
    516     },
    517     "surprise_contrarian": {
    518       "score": 1,
    519       "justification": "Joint prompt-RAG attack is a natural extension of existing single-vector attacks; the approach is not surprising to the security community."
    520     },
    521     "fear_safety": {
    522       "score": 3,
    523       "justification": "Demonstrates a stealthy attack on RAG systems achieving near-perfect success rates with minimal poisoning, with an explicit misinformation scenario (earthquake/nuclear testing) that could amplify panic."
    524     },
    525     "drama_conflict": {
    526       "score": 1,
    527       "justification": "No controversy or accusatory angle — standard attack-paper structure proposing a new method and comparing with baselines."
    528     },
    529     "demo_ability": {
    530       "score": 0,
    531       "justification": "No code, demo, or tool is released. The approach requires implementing bilevel optimization over LLM inference."
    532     },
    533     "brand_recognition": {
    534       "score": 1,
    535       "justification": "Tongji University and Columbia University are recognized institutions but not top AI labs. Published at SIGIR, a respected but niche venue for this topic."
    536     }
    537   }
    538 }

Impressum · Datenschutz