ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (30184B)


      1 {
      2   "paper": {
      3     "title": "System Prompt Poisoning: Persistent Attacks on Large Language Models Beyond User Injection",
      4     "authors": [
      5       "Zongze Li",
      6       "Jiawei Guo",
      7       "Haipeng Cai"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2505.06493",
     12     "doi": "10.48550/arXiv.2505.06493"
     13   },
     14   "scan_version": 3,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "System prompt poisoning is highly effective, reducing reasoning model accuracy to near-zero (<4%) on MATH in stateless API scenarios. The attack persists across 500-turn conversations without significant weakening for reasoning models. User-side augmentation techniques (ICL, CoT) and the Explicit Reminder defense fail to mitigate the attack. Brute-force poisoning is extremely cheap (~2 seconds, <1.3k tokens) while adaptive strategies cost orders of magnitude more but offer only marginally different effectiveness.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No source code repository or archive is linked anywhere in the paper. The Auto-SPP framework is described algorithmically (Algorithm 1, Appendix B) but no implementation is released."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available MATH dataset (Hendrycks et al., 2021) and HumanEval dataset (Chen et al., 2021). Both are standard public benchmarks."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No environment specifications, requirements files, or dependency details are provided. The paper does not describe software versions, libraries, or runtime environments."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. While the experimental setup is described at a high level in Section 5.2, there are no scripts, commands, or detailed instructions for replication."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "All results in Table 1 and throughout the paper are point estimates (e.g., '0.8%', '99.1%↓') with no confidence intervals or error bars reported."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No statistical significance tests are used despite making many comparative claims (e.g., 'reasoning models are significantly more vulnerable'). All comparisons are based on raw numerical differences."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Percentage decreases from baseline are reported for all results (e.g., '99.1%↓' in Table 1), providing effect size context relative to the unpoisoned baseline."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper states '500 randomly selected samples' from MATH but provides no justification for this number and no power analysis. HumanEval uses its standard 164 problems without discussion of adequacy."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be from single experimental runs with no indication of result stability."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Each configuration includes a 'No poisoning' baseline (Table 1) showing model accuracy without attack, against which all poisoning strategies are compared."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The defense baseline (Explicit Reminder, Yi et al. 2025) is contemporary. The models tested (GPT-5-mini, Gemini-2.5-flash, GPT-4o-mini) are current. For an attack paper, the 'no poisoning' baseline is the natural comparator."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "While three distinct poisoning strategies are compared, there is no ablation study examining which components of each strategy drive effectiveness (e.g., number of exemplars, reasoning chain length, or specific instruction formulations)."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Each domain uses only a single metric: solution accuracy for MATH and pass@1 for HumanEval. No secondary metrics (e.g., output quality beyond correctness, attack detectability) are reported."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation is conducted. Human inspection of poisoned outputs (e.g., to assess stealthiness or output quality beyond correctness) would be relevant but is absent."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "The attacks are generated independently of the test data. The poisoned system prompts are crafted without access to the specific MATH or HumanEval test examples, and evaluation is on the standard benchmarks."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Table 1 provides detailed breakdowns by attack scenario (4 types), poisoning strategy (3 types), model (4 models), and domain (MATH vs HumanEval). RQ2-RQ5 provide further breakdowns."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Appendix E discusses GPT-3.5-turbo as a failure case where attacks did not significantly degrade performance and in some cases paradoxically improved it (Tables 3-4). The paper identifies this as a limitation."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The GPT-3.5-turbo results (Tables 3-4 in Appendix E) show the attack failing or even improving model performance. The paper explicitly notes 'performance paradoxically improved after poisoning' in some cases."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims of high effectiveness, persistence, robustness against CoT, and stealthiness against defenses are all supported by corresponding experimental sections (RQ1-RQ4) with quantitative results."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The causal claim that poisoning causes performance degradation is supported by controlled experiments: the only variable changed between conditions is the system prompt (poisoned vs unpoisoned), with the same user inputs and models."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims 'Persistent Attacks on Large Language Models' broadly, but testing covers only 4 closed-source models and 2 domains. Appendix E acknowledges that the findings may not generalize to open-source models or to other domains, but the main text and abstract make broad generalizations (e.g., 'persistently impacts all subsequent user interactions')."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper offers interpretations for observed patterns (e.g., why reasoning models are more vulnerable) but does not consider alternative explanations. For instance, it does not discuss whether simple distractor text in the system prompt (without malicious intent) would cause similar degradation, or whether the effect is due to instruction hierarchy rather than poisoning per se."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures accuracy and pass@1 and frames these as 'task performance degradation,' which matches the measurement granularity. No proxy gap exists between what is measured and what is claimed."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "Models are identified by marketing names only: 'Gemini-2.5-flash', 'GPT-5-mini', 'GPT-4o-mini'. No snapshot dates, API versions, or specific model identifiers are provided."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "Figures 2-4 show illustrative examples of poisoning on emotion classification tasks, but the actual system prompts and poisoned prompts used for MATH and HumanEval experiments are not provided. The reader cannot reconstruct the exact prompts sent to the models."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "No hyperparameters (temperature, top-p, max tokens, etc.) are reported for any of the model API calls, despite these significantly affecting output behavior."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. The experiments involve direct API calls and simulated interactive sessions."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "The paper states '500 randomly selected samples' from MATH but does not describe the random selection procedure (seed, stratification, difficulty distribution). For interactive scenarios, conversation history summarization is mentioned but not detailed."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Appendix E contains a dedicated 'Limitations' section with three substantive subsections discussing model dependency, scope of models/domains, and scope of defenses."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "Appendix E discusses specific threats: (1) GPT-3.5-turbo anomaly showing attacks depend on model reasoning ability, (2) limited to 4 closed-source models excluding open-source LLMs, (3) only one defense mechanism tested."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": true,
    183         "justification": "Appendix E explicitly states: 'The findings may not generalize to the full spectrum of LLMs, particularly open-source models,' 'our experiments were conducted on tasks from the mathematics and coding domains,' and 'Our stealthiness evaluation (RQ4) was limited to a single, common black-box defense mechanism.'"
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw experimental data (model outputs, individual predictions, poisoned prompts used) is released. Only aggregated accuracy numbers are reported in tables."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": false,
    195         "justification": "Section 5.2 provides a high-level description of the experimental procedure but lacks detail on the random selection of MATH samples, interactive conversation simulation mechanics, and how context summarization was handled when exceeding token limits."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard benchmarks (MATH, HumanEval)."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The pipeline from benchmark selection through poisoned prompt generation to evaluation is described at a high level, but key transformation steps are missing: how interactive conversations were simulated, what context summarization threshold was used, and how pass@1 was computed for HumanEval."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding sources or acknowledgments section is present in the paper."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "All three authors are listed with their affiliation at the University at Buffalo, Department of Computer Science and Engineering. No conflict with the evaluated products (Google, OpenAI models)."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No funding is disclosed, so independence of the funder cannot be verified."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement or financial disclosure is present in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No training data cutoff dates are stated for any of the four models tested. This is relevant because baseline performance on MATH and HumanEval could be inflated by contamination."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether MATH or HumanEval problems appeared in any model's training data. Both benchmarks were published in 2021 and are widely known."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "MATH (2021) and HumanEval (2021) are both public benchmarks that were released before any of the tested models were trained, so they may appear in the models' training data. No contamination analysis or discussion is provided despite this being relevant to interpreting baseline scores."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants. The ethical statement notes all experiments were in controlled environments using public datasets."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Table 2 reports Auto-SPP framework costs (time and tokens for generating poisoned prompts), but the cost of running the full experimental evaluation (thousands of API calls across 4 models × 4 scenarios × 3 strategies × 2 datasets) is not reported."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Only the Auto-SPP generation costs are stated (Table 2). The total computational budget for the full experimental evaluation (API costs, compute hours) is not disclosed."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of multiple random seeds or seed sensitivity analysis. All results appear to be from single runs."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs producing each result is never stated. It is unclear whether results are from single runs or averaged over multiple trials."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search budget is reported. The number of exemplars in ICL/CoT strategies, prompt formulations tried, and helper LLM settings are not discussed."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": true,
    315         "justification": "The paper reports results for all three strategies across all four scenarios without selecting a single 'best' configuration. All results are shown comprehensively in Table 1."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical tests are performed, so correction for multiple comparisons is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors evaluate their own attack framework (Auto-SPP) and implement the defense baseline (Explicit Reminder) themselves, without acknowledging potential bias from author-evaluation."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "While Table 2 reports Auto-SPP costs and Table 1 reports effectiveness, performance is not explicitly analyzed as a function of compute budget. The tradeoff is discussed qualitatively but not presented as a performance-cost curve."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper does not discuss whether accuracy on MATH and pass@1 on HumanEval are valid measures of the real-world impact of system prompt poisoning. No discussion of whether these benchmarks capture the threat model's practical concerns."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. The experiments use direct API calls to models."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Not addressed. MATH (2021) and HumanEval (2021) both predate the tested models' training periods. Models may have seen benchmark solutions during training, affecting baseline scores."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Not discussed. No analysis of whether the evaluation setup leaks information beyond what would be available in real-world attack scenarios."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "Not addressed. No discussion of whether MATH or HumanEval problems may share structural similarities with training data."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention methods are used. No canary strings, membership inference, or decontamination procedures are described."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "System prompt poisoning drastically reduces model accuracy, with reasoning models collapsing to near-zero (<4%) on MATH in stateless API scenarios.",
    369       "evidence": "Table 1 shows Gemini-2.5-flash drops from 93.2% to 0.8% (99.1%↓) under brute-force in Explicit+API, and GPT-5-mini from 91.4% to 2.2% (97.6%↓) under Adaptive ICL. All three strategies achieve >96% degradation for reasoning models in API scenarios.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Reasoning models are significantly more vulnerable to SPP than non-reasoning models.",
    374       "evidence": "Table 1 consistently shows reasoning models degrading to <4% while non-reasoning models retain 25-62% accuracy across scenarios. However, no statistical test supports the 'significantly' claim.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Poisoning effects persist across 500-turn conversations without significant weakening for reasoning models.",
    379       "evidence": "Figure 5 shows accuracy remains suppressed below 15% for reasoning models across 100, 300, and 500 turns in both interactive scenarios. Lines are 'nearly flat.'",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "User-side prompting augmentation techniques (ICL, CoT) are ineffective at mitigating system prompt poisoning.",
    384       "evidence": "Figure 6 shows accuracy remains below 10% for all three augmentation methods when poisoning is active, tested on Gemini-2.5-flash in Explicit+API on MATH. However, this is tested on only one model and one dataset.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "The Explicit Reminder defense is completely ineffective against SPP.",
    389       "evidence": "Figure 7 shows near-zero accuracy persists when Explicit Reminder is applied, tested on Gemini-2.5-flash in both Explicit and Implicit API scenarios on MATH. Tested on only one model.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Auto-SPP can automate poisoning with brute-force being extremely cheap (~2 seconds, <1.3k tokens).",
    394       "evidence": "Table 2 reports brute-force requiring 1.9-2.2 seconds and 0.7-1.3k tokens, versus 268-340 seconds and 123-247k tokens for adaptive strategies.",
    395       "supported": "strong"
    396     }
    397   ],
    398   "red_flags": [
    399     {
    400       "flag": "No error bars or variance reporting",
    401       "detail": "All results in Table 1 and throughout the paper are point estimates from apparently single runs. With no variance reporting, it is impossible to assess result stability or whether observed differences are meaningful."
    402     },
    403     {
    404       "flag": "Limited defense evaluation",
    405       "detail": "Only one defense (Explicit Reminder) is tested, and it is the simplest possible defense. More sophisticated defenses (input filtering, adversarial training, prompt sandboxing) are discussed only theoretically in Section 6. The stealthiness claim is based on defeating one naive defense."
    406     },
    407     {
    408       "flag": "RQ3 and RQ4 tested on single model only",
    409       "detail": "The robustness (RQ3) and stealthiness (RQ4) experiments use only Gemini-2.5-flash on MATH. These single-model results are used to make general claims about attack robustness and defense ineffectiveness."
    410     },
    411     {
    412       "flag": "No actual experiment prompts provided",
    413       "detail": "The poisoned system prompts used for MATH and HumanEval experiments are never shown. Figures 2-4 show illustrative examples from a different task (emotion classification). Without the actual prompts, the experiments cannot be replicated."
    414     },
    415     {
    416       "flag": "Model versions unspecified",
    417       "detail": "Only marketing names are used (Gemini-2.5-flash, GPT-5-mini, GPT-4o-mini). Without snapshot dates or API versions, results may not be reproducible as model behavior changes across versions."
    418     }
    419   ],
    420   "cited_papers": [
    421     {
    422       "title": "Ignore previous prompt: Attack techniques for language models",
    423       "authors": ["Fábio Perez", "Ian Ribeiro"],
    424       "year": 2022,
    425       "arxiv_id": "2211.09527",
    426       "relevance": "Foundational work on prompt injection attacks against LLMs, which SPP extends to the system prompt layer."
    427     },
    428     {
    429       "title": "Not what you've signed up for: Compromising real-world LLM-integrated applications with indirect prompt injection",
    430       "authors": ["Kai Greshake", "Sahar Abdelnabi", "Shailesh Mishra", "Christoph Endres", "Thorsten Holz", "Mario Fritz"],
    431       "year": 2023,
    432       "relevance": "Extended prompt injection to indirect attacks on LLM-integrated applications, directly relevant to supply-chain attack vectors discussed in the threat model."
    433     },
    434     {
    435       "title": "FATH: Authentication-based test-time defense against indirect prompt injection attacks",
    436       "authors": ["Jiongxiao Wang", "Fangzhou Wu", "Wendi Li", "Jinsheng Pan", "Edward Suh", "Z Morley Mao", "Muhao Chen", "Chaowei Xiao"],
    437       "year": 2024,
    438       "arxiv_id": "2410.21492",
    439       "relevance": "Proposes a defense against prompt injection that SPP's threat model targets; relevant to understanding the defense landscape."
    440     },
    441     {
    442       "title": "Adaptive attacks break defenses against indirect prompt injection attacks on LLM agents",
    443       "authors": ["Qiusi Zhan", "Richard Fang", "Henil Shalin Panchal", "Daniel Kang"],
    444       "year": 2025,
    445       "arxiv_id": "2503.00061",
    446       "relevance": "Demonstrates that adaptive attacks bypass all existing prompt injection defenses, complementing SPP's findings on defense ineffectiveness."
    447     },
    448     {
    449       "title": "InjecAgent: Benchmarking indirect prompt injections in tool-integrated large language model agents",
    450       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    451       "year": 2024,
    452       "arxiv_id": "2403.02691",
    453       "relevance": "Benchmark framework for evaluating LLM agent vulnerabilities to indirect prompt injection."
    454     },
    455     {
    456       "title": "Universal and transferable adversarial attacks on aligned language models",
    457       "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini", "Milad Nasr", "J Zico Kolter", "Matt Fredrikson"],
    458       "year": 2023,
    459       "arxiv_id": "2307.15043",
    460       "relevance": "Foundational jailbreaking work demonstrating universal adversarial attacks on LLMs, relevant to the broader attack surface taxonomy."
    461     },
    462     {
    463       "title": "Data poisoning in LLMs: Jailbreak-tuning and scaling laws",
    464       "authors": ["Dillon Bowen", "Brendan Murphy", "Will Cai", "David Khachaturov", "Adam Gleave", "Kellin Pelrine"],
    465       "year": 2024,
    466       "arxiv_id": "2408.02946",
    467       "relevance": "Studies data poisoning in LLMs at the training level, complementing SPP's inference-time poisoning approach."
    468     },
    469     {
    470       "title": "PoisonBench: Assessing large language model vulnerability to data poisoning",
    471       "authors": ["Tingchen Fu", "Mrinank Sharma", "Philip Torr", "Shay B Cohen", "David Krueger", "Fazl Barez"],
    472       "year": 2024,
    473       "arxiv_id": "2410.08811",
    474       "relevance": "Benchmark for LLM data poisoning vulnerability assessment, relevant to understanding poisoning attack evaluation methodology."
    475     },
    476     {
    477       "title": "Baseline defenses for adversarial attacks against aligned language models",
    478       "authors": ["Neel Jain", "Avi Schwarzschild", "Yuxin Wen", "Gowthami Somepalli", "John Kirchenbauer", "Ping-yeh Chiang", "Micah Goldblum", "Aniruddha Saha", "Jonas Geiping", "Tom Goldstein"],
    479       "year": 2023,
    480       "arxiv_id": "2309.00614",
    481       "relevance": "Proposes baseline defenses including perplexity-based detection for adversarial inputs to LLMs."
    482     },
    483     {
    484       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    485       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu", "Emre Kiciman", "Guangzhong Sun", "Xing Xie", "Fangzhao Wu"],
    486       "year": 2025,
    487       "relevance": "Source of the 'Explicit Reminder' defense evaluated in RQ4, directly relevant to SPP's defense bypass claims."
    488     },
    489     {
    490       "title": "On the (in)security of LLM app stores",
    491       "authors": ["Xinyi Hou", "Yanjie Zhao", "Haoyu Wang"],
    492       "year": 2024,
    493       "arxiv_id": "2407.08422",
    494       "relevance": "Studies security of LLM application ecosystems, directly relevant to SPP's passive distribution threat model via app stores."
    495     },
    496     {
    497       "title": "PoisonedRAG: Knowledge corruption attacks to retrieval-augmented generation of large language models",
    498       "authors": ["Wei Zou", "Runpeng Geng", "Binghui Wang", "Jinyuan Jia"],
    499       "year": 2024,
    500       "arxiv_id": "2402.07867",
    501       "relevance": "Attacks RAG knowledge bases to corrupt LLM outputs, a related poisoning vector targeting a different component of the LLM pipeline."
    502     }
    503   ],
    504   "engagement_factors": {
    505     "practical_relevance": {
    506       "score": 2,
    507       "justification": "Security researchers and LLM application developers can use the threat model and attack strategies to red-team their systems, though no code is released."
    508     },
    509     "surprise_contrarian": {
    510       "score": 1,
    511       "justification": "The instruction hierarchy bias (system prompts override user prompts) is somewhat expected; the magnitude and persistence of the effect across 500 turns adds novelty but is not deeply contrarian."
    512     },
    513     "fear_safety": {
    514       "score": 3,
    515       "justification": "Demonstrates a novel persistent attack vector that bypasses existing defenses, degrades reasoning models to near-zero accuracy, and can be automated cheaply."
    516     },
    517     "drama_conflict": {
    518       "score": 1,
    519       "justification": "No major controversy or dramatic claims about specific companies, though it implies current defenses are fundamentally inadequate."
    520     },
    521     "demo_ability": {
    522       "score": 0,
    523       "justification": "No code, demo, or tool is released. The Auto-SPP framework is described algorithmically only."
    524     },
    525     "brand_recognition": {
    526       "score": 1,
    527       "justification": "University at Buffalo researchers (not a famous AI lab), though the paper tests well-known models (GPT-5, Gemini, GPT-4o)."
    528     }
    529   }
    530 }

Impressum · Datenschutz