ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (31446B)


      1 {
      2   "paper": {
      3     "title": "Token-Efficient Prompt Injection Attack: Provoking Cessation in LLM Reasoning via Adaptive Token Compression",
      4     "authors": ["Yu Cui", "Yujun Cai", "Yiwei Wang"],
      5     "year": 2025,
      6     "venue": "arXiv.org",
      7     "arxiv_id": "2504.20493",
      8     "doi": "10.48550/arXiv.2504.20493"
      9   },
     10   "scan_version": 3,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Simple standalone arithmetic tasks can trigger the 'thinking-stopped' vulnerability in DeepSeek-R1, contrary to prior belief that complex word problems were needed. An adaptive token compression framework reduces attack prompts to ~60% of original length, though effectiveness varies by operation type (addition/subtraction maintain or improve ASR while multiplication/division degrade significantly). Placing the attack prompt in both user prompt and output prefix achieves 100% ASR for addition and subtraction datasets. The authors speculate the root cause is premature prediction of the <|end_of_thinking|> special token.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "No code repository, GitHub link, or archive URL is provided anywhere in the paper."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper describes constructing an attack prompt dataset of 100 prompts but does not provide a download link or release the dataset. The GSM-Ranges dataset used for baseline is public, but their own generated attack prompts are not released."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "The paper mentions using various model APIs with 'default temperature parameters (when supported)' but provides no requirements.txt, Dockerfile, or environment specifications."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "Algorithm 1 describes the dataset construction procedure at an algorithmic level, but no runnable scripts, commands, or step-by-step reproduction instructions are provided."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "All ASR and compression rate results are reported as point estimates (e.g., '85.33%', '65.33%') with no confidence intervals or error bars, despite running only λ=3 trials per prompt."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "The paper compares ASR across datasets and methods (e.g., compressed vs. uncompressed, different operation types) without any statistical significance tests. Differences are described qualitatively ('significantly decrease', 'increases dramatically') without formal testing."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "The paper reports compression rates as ratios (e.g., 56.3% of baseline), ASR differences in percentage points across conditions, and provides baseline context for comparisons (Table 1, Figures 5-6). For example, subtraction achieves '65.33% ASR under a compression rate of 56.3% relative to the baseline.'"
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "The choice of N=25 per operation type and λ=3 tests per prompt is stated but never justified. No power analysis or rationale for why these sample sizes are adequate."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "Despite running λ=3 tests per prompt, only aggregate ASR values are reported. No standard deviation, variance, or spread measure is provided across runs or prompts."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper compares against a baseline from Cui et al. (2025) using complex mathematical word problems from the GSM-Ranges dataset with level 6 perturbation, as shown in Figures 2 and 6."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baseline (Cui et al., 2025) represents the only known prior method for triggering this vulnerability, making it the most contemporary and relevant comparison."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper systematically varies operation type (+, −, ×, ÷), compression vs. no compression, compression model (DeepSeek-V3, o3-mini, GPT-4o, moonshot-v1-32k in Table 2), and three output prefix approaches (Section 3.5, Figure 7), serving as ablation over multiple dimensions."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two metrics are defined and reported: Attack Success Rate (ASR, Equation 2) and Compression Rate (CR, Equation 1). Additionally, the special token trigger rate is reported in Figure 9."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "The attack success criterion is binary (empty response vs. non-empty), making human evaluation clearly irrelevant to the claims."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": false,
     93         "justification": "The attack prompts are both the output of the construction method and the evaluation set. There is no held-out validation to test generalization beyond the 100 constructed prompts."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by operation type (+, −, ×, ÷) and baseline in all experiments (Figures 5, 6, 7, 9, Tables 1, 2), showing substantial variation across categories."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "Section 6.1 discusses anomalous compression with GPT-4o where compressed prompts were longer than originals. The Limitations section acknowledges unexplained phenomena. The significant ASR drops for multiplication/division after compression are shown transparently."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "The paper reports that multiplication ASR drops from 61.33% to 33.33% and division from 41.33% to 18.67% after compression (Figure 6b). GPT-4o's anomalous compression behavior is reported (Section 6.1). Approach 1 is shown to effectively defend against the attack (Figure 7)."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": false,
    115         "justification": "The abstract claims compression 'significantly reduces prompt length while maintaining effective attack capabilities.' However, results show ASR drops substantially for multiplication (61.33%→33.33%) and division (41.33%→18.67%) after compression. The claim of maintained effectiveness is only true for addition and subtraction datasets."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "Section 6.2 makes causal claims about the mechanism: 'attack prompts induce the premature appearance of the special token.' The authors themselves acknowledge this is speculative ('we speculate on the underlying cause'). The claim that 'reasoning tokens containing simpler calculation logic increase the ASR' is based on correlational observation across four operation types, not a controlled causal design."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The paper title refers to 'LLM Reasoning' generally, but all experiments are conducted solely on DeepSeek-R1 (via two API platforms). No other reasoning LLMs are tested. The abstract refers to 'reasoning large language models' without bounding claims to DeepSeek-R1."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "The paper does not discuss alternative explanations for why the attack works. The speculative mechanism in Section 6.2 is presented as the primary explanation without considering alternatives. The Limitations section notes unexplained phenomena but does not propose alternative hypotheses."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures empty responses (choices[0].message.content is empty) and claims the attack causes reasoning interruption. The measurement directly corresponds to the claim — no proxy gap exists. The formal definition in Section 3.2 precisely defines what constitutes a successful attack."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "Models are referenced by marketing names only: 'DeepSeek-R1', 'DeepSeek-V3', 'o3-mini', 'GPT-4o', 'moonshot-v1-32k'. No specific version snapshots, API versions, or model checkpoint dates are provided."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Appendix A provides the full compression method prompt (system prompt) with the actual text, including the demonstration examples M and N. Example attack prompts are shown in Appendix A and B. The seed prompts are simple arithmetic expressions (e.g., '16288777 + 5921081 = ?') whose generation is fully specified."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "Section 4.1 states 'All models were accessed via their dedicated APIs using default temperature parameters (when supported)' but does not report what those defaults are. No temperature, top-p, max tokens, or other sampling parameters are specified."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The attack is a direct prompt injection method, not an agentic system."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Algorithm 1 describes the full dataset construction pipeline: random number generation, seed prompt creation, iterative search for vulnerability-triggering tokens, and dataset accumulation. Section 3.4 and Figure 3 describe the compression framework pipeline including verification steps."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "A dedicated 'Limitations' section is present after the Conclusion, discussing remaining unexplained phenomena and acknowledging time constraints."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "The Limitations section identifies a specific unexplained phenomenon unique to this study: 'why the attack prompt for the subtraction dataset, after token compression, results in an increase in attack success rate rather than a decrease, while this is not observed in the other three datasets.'"
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": false,
    179         "justification": "The paper does not explicitly state scope boundaries. All experiments are on DeepSeek-R1, but the paper does not state that results may not generalize to other reasoning LLMs, nor does it identify specific conditions under which the attack might fail beyond the tested scenarios."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data is released — neither the generated attack prompts, compressed prompts, nor API response logs are available for independent verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Algorithm 1 describes the data collection procedure in detail: random number generation within intervals p1 and p2, seed prompt construction per operation type, iterative API calls to find vulnerability-triggering tokens, and stopping criteria (N=25 per type)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Attack prompts are procedurally generated via API calls, and the baseline uses a standard public benchmark (GSM-Ranges)."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pipeline is documented from seed prompt generation (random numbers) through reasoning token extraction (Algorithm 1) to compression (Section 3.4, Figure 3), including the verification loop and fallback strategy (retaining original after 4 failed compressions)."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper. No grants or sponsors are mentioned."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are clearly stated: University of California Merced and University of Queensland. These are academic institutions with no apparent affiliation to DeepSeek or any evaluated product."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": false,
    217         "answer": false,
    218         "justification": "No funding source is disclosed, so there is no funder whose independence could be assessed. The work comes from two universities; whether it was funded is not stated."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "This paper tests an attack against a model's reasoning mechanism, not model knowledge on a benchmark. The vulnerability depends on the model's inference architecture, not whether it has seen specific training data."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": false,
    234         "answer": false,
    235         "justification": "The attack exploits a structural vulnerability in the reasoning mechanism, not model memorization. Train/test overlap is not relevant to evaluating attack effectiveness."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": false,
    239         "answer": false,
    240         "justification": "The study tests an attack method rather than evaluating model knowledge on benchmarks. Benchmark contamination does not affect whether the thinking-stopped vulnerability can be triggered."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Token consumption is reported extensively: Table 1 shows total tokens per dataset (70,243 to 173,449), Figure 5 shows average token consumption before and after compression per dataset. The compression rate metric (Equation 1) directly measures cost reduction."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Total search counts (Table 1: 31.25 average total searches) and the number of tests (375 per attack method) are reported, but no total API spend in dollars, GPU hours, or wall-clock time is stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "The attack prompts are derived from seed prompts built on randomly generated numbers, but no analysis of sensitivity to that random draw is reported. The λ=3 repetitions per prompt probe run-to-run variability for a fixed prompt, not sensitivity to the initial random number generation."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": true,
    301         "justification": "Section 4.3 explicitly states: 'For each type of attack evaluation, we conduct three tests (λ = 3) on every prompt in each of the five datasets, which amounts to a total of 375 tests per attack method.'"
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "The compression target of 70%, the random number intervals p1/p2, and N=25 per type are set without any stated search or justification. The compression prompt examples were manually crafted with no reported exploration budget."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "DeepSeek-V3 is selected as the primary compression model based on Table 2 comparisons, but the 70% compression target, the manual compression examples, and N=25 are not justified. No validation set was used for configuration selection."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "The paper makes numerous comparisons across 5 datasets, 4 compression models, and 3 output prefix approaches without any statistical tests, let alone multiple comparison corrections."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The baseline is from Cui et al. (2025), where the first author (Yu Cui) is the same as this paper's first author. The paper does not acknowledge the potential bias of evaluating against one's own prior method."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "Performance is not reported as a function of compute budget. The token compression analysis (Figure 5) shows resource usage but does not plot ASR against compute cost."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "ASR measures whether the model returns an empty response, but the paper does not discuss whether this binary metric fully captures the severity or real-world impact of the reasoning interruption attack."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved. The attack is a direct prompt injection without any agentic framework."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "The baseline uses GSM-Ranges (Shrestha et al., 2025), which DeepSeek-R1 may have encountered during training. No discussion of whether temporal ordering affects the evaluation."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "The attack prompts are generated from DeepSeek-R1's own reasoning tokens, creating a circular dependency where the model's outputs become its inputs. This circularity is not discussed as a potential leakage concern."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "The attack prompts are generated by the same model being attacked. The non-independence between the prompt generation process and the target model is not discussed."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention methods are applied. No canary strings, decontamination, or temporal splits are used."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Simple standalone arithmetic tasks can trigger the thinking-stopped vulnerability in DeepSeek-R1, contrary to prior belief that complex mathematical word problems were needed.",
    365       "evidence": "Section 3.1 presents the discovery. Figure 6a shows ASR of 54.67% (addition) and 41.33% (subtraction) on DeepSeek-R1 using arithmetic prompts, compared to 85.33% baseline with word problems. Table 1 shows successful dataset construction across all four operation types.",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "Only 1.25 API calls on average are needed to acquire each attack prompt.",
    370       "evidence": "Table 1 reports average search counts: addition 1.32, subtraction 1.48, multiplication 1.00, division 1.20, with overall average 1.25.",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "The adaptive token compression framework reduces prompts to approximately 60% of original size while maintaining high attack success rates.",
    375       "evidence": "Figure 5 shows compression results. However, Figure 6 shows ASR after compression drops significantly for multiplication (61.33%→33.33%) and division (41.33%→18.67%), while it increases for subtraction (41.33%→65.33%) and addition (54.67%→64.00%). The claim of 'maintaining' effectiveness is only partially supported.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "Including the attack prompt in both user prompt and output prefix (Approach 3) achieves 100% ASR for addition and subtraction datasets.",
    380       "evidence": "Figure 7 shows Approach 3 achieves 100% ASR for both addition and subtraction datasets, with 68% for multiplication and 66.67% for division.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "The root cause of the vulnerability is premature prediction of the special <|end_of_thinking|> token, causing the model to treat its final answer as reasoning tokens.",
    385       "evidence": "Section 6.2 proposes this mechanism based on Approach 2 output prefix experiments (Figure 9) showing the model frequently generates <|end_of_thinking|> tokens. However, the authors explicitly state 'we speculate on the underlying cause' — this is a hypothesis, not a verified mechanism.",
    386       "supported": "weak"
    387     },
    388     {
    389       "claim": "DeepSeek-V3 significantly outperforms other models for token compression in the attack framework.",
    390       "evidence": "Table 2 shows DeepSeek-V3 achieves 47.89% compression rate vs. moonshot-v1-32k at 84.03%, o3-mini at 78.97%, and GPT-4o at 63.81% (lower means a shorter compressed prompt). However, DeepSeek-V3's lower compression rate is accompanied by a lower ASR (33.33%), suggesting a trade-off that is not fully explored.",
    391       "supported": "moderate"
    392     }
    393   ],
    394   "red_flags": [
    395     {
    396       "flag": "Very small sample size without justification",
    397       "detail": "Only N=25 prompts per operation type (100 total) and λ=3 tests per prompt. These sample sizes are never justified and are small enough that individual prompt variability could dominate the results. No power analysis is provided."
    398     },
    399     {
    400       "flag": "Overclaimed compression effectiveness",
    401       "detail": "The abstract claims compression 'maintains effective attack capabilities,' but ASR drops from 61.33% to 33.33% for multiplication and from 41.33% to 18.67% for division. Only addition and subtraction maintain or improve ASR. The paper acknowledges this in the body but not in the abstract."
    402     },
    403     {
    404       "flag": "Single-model evaluation with broad claims",
    405       "detail": "All experiments target only DeepSeek-R1 (via two platforms), but the title and abstract frame the vulnerability as affecting 'LLM Reasoning' generally. No other reasoning models (e.g., o1, Gemini) are tested for the vulnerability."
    406     },
    407     {
    408       "flag": "No statistical tests for any comparison",
    409       "detail": "All claims of superiority or difference ('significantly outperforms', 'increases dramatically', 'significantly decrease') are made without any statistical significance testing, despite sample sizes small enough that stochastic variation could explain observed differences."
    410     },
    411     {
    412       "flag": "Self-citation as sole baseline",
    413       "detail": "The only baseline is Cui et al. (2025), where the first author (Yu Cui) is the same as this paper's first author. The potential bias of comparing against one's own prior work is not acknowledged."
    414     }
    415   ],
    416   "cited_papers": [
    417     {
    418       "title": "Process or result? Manipulated ending tokens can mislead reasoning LLMs to ignore the correct reasoning steps",
    419       "authors": ["Yu Cui", "Bryan Hooi", "Yujun Cai", "Yiwei Wang"],
    420       "year": 2025,
    421       "arxiv_id": "2503.19326",
    422       "relevance": "Prior work identifying the thinking-stopped vulnerability in DeepSeek-R1 that this paper builds upon; directly relevant to LLM security."
    423     },
    424     {
    425       "title": "Formalizing and benchmarking prompt injection attacks and defenses",
    426       "authors": ["Yupei Liu", "Yuqi Jia", "Runpeng Geng", "Jinyuan Jia", "Neil Zhenqiang Gong"],
    427       "year": 2024,
    428       "relevance": "Establishes the formal framework for prompt injection attacks that this paper extends; provides the benchmark used for defining attack taxonomy."
    429     },
    430     {
    431       "title": "Automatic and universal prompt injection attacks against large language models",
    432       "authors": ["Xiaogeng Liu", "Zhiyuan Yu", "Yizhe Zhang", "Ning Zhang", "Chaowei Xiao"],
    433       "year": 2024,
    434       "arxiv_id": "2403.04957",
    435       "relevance": "Describes automated prompt injection attack strategies relevant to understanding the landscape of LLM security threats."
    436     },
    437     {
    438       "title": "PromptShield: Deployable detection for prompt injection attacks",
    439       "authors": ["Dennis Jacob", "Hend Alzahrani", "Zhanhao Hu", "Basel Alomair", "David Wagner"],
    440       "year": 2025,
    441       "arxiv_id": "2501.15145",
    442       "relevance": "Defense mechanism against prompt injection attacks; relevant to understanding mitigation approaches for the vulnerability studied."
    443     },
    444     {
    445       "title": "The hidden risks of large reasoning models: A safety assessment of R1",
    446       "authors": ["Kaiwen Zhou", "Chengzhi Liu", "Xuandong Zhao"],
    447       "year": 2025,
    448       "arxiv_id": "2502.12659",
    449       "relevance": "Comprehensive safety assessment of DeepSeek-R1 identifying security risks in reasoning models."
    450     },
    451     {
    452       "title": "DeepSeek-R1: Incentivizing reasoning capability in LLMs via reinforcement learning",
    453       "authors": ["DeepSeek-AI"],
    454       "year": 2025,
    455       "arxiv_id": "2501.12948",
    456       "relevance": "The target model of the attack; describes the reasoning architecture and training process that creates the vulnerability."
    457     },
    458     {
    459       "title": "Optimization-based prompt injection attack to LLM-as-a-judge",
    460       "authors": ["Jiawen Shi", "Zenghui Yuan", "Yinuo Liu"],
    461       "year": 2024,
    462       "relevance": "Demonstrates optimization-based prompt injection against LLM evaluation systems, relevant to broader LLM security landscape."
    463     },
    464     {
    465       "title": "Benchmarking and defending against indirect prompt injection attacks on large language models",
    466       "authors": ["Jingwei Yi", "Yueqi Xie", "Bin Zhu"],
    467       "year": 2025,
    468       "relevance": "Benchmarks and defenses for indirect prompt injection attacks on LLMs; directly relevant to the survey's security coverage."
    469     },
    470     {
    471       "title": "DeepSeek-R1 Thoughtology: Let's <think> about LLM reasoning",
    472       "authors": ["Sara Vera Marjanović", "Arkil Patel", "Vaibhav Adlakha"],
    473       "year": 2025,
    474       "arxiv_id": "2504.07128",
    475       "relevance": "Analyzes DeepSeek-R1's reasoning behavior including cases where the model halts output before completing reasoning, corroborating the thinking-stopped vulnerability."
    476     },
    477     {
    478       "title": "Vulnerability of large language models to output prefix jailbreaks: Impact of positions on safety",
    479       "authors": ["Yiwei Wang", "Muhao Chen", "Nanyun Peng", "Kai-Wei Chang"],
    480       "year": 2024,
    481       "relevance": "Examines output prefix-based jailbreak attacks on LLMs, directly related to the output prefix attack approaches explored in this paper."
    482     },
    483     {
    484       "title": "Safety evaluation of DeepSeek models in Chinese contexts",
    485       "authors": ["Wenjing Zhang", "Xuejiao Lei", "Zhaoxiang Liu"],
    486       "year": 2025,
    487       "arxiv_id": "2502.11137",
    488       "relevance": "Evaluates security of DeepSeek models in specific contexts; relevant to understanding the broader security landscape of DeepSeek systems."
    489     },
    490     {
    491       "title": "Don't listen to me: Understanding and exploring jailbreak prompts of large language models",
    492       "authors": ["Zhiyuan Yu", "Xiaogeng Liu", "Shunning Liang"],
    493       "year": 2024,
    494       "relevance": "Studies jailbreak prompt patterns in LLMs, relevant to understanding prompt-based security vulnerabilities."
    495     }
    496   ],
    497   "engagement_factors": {
    498     "practical_relevance": {
    499       "score": 1,
    500       "justification": "The attack is specific to DeepSeek-R1's thinking-stopped vulnerability; not a general-purpose tool or technique practitioners would adopt."
    501     },
    502     "surprise_contrarian": {
    503       "score": 2,
    504       "justification": "Challenges the prior assumption that only complex word problems could trigger the vulnerability — simple arithmetic suffices."
    505     },
    506     "fear_safety": {
    507       "score": 2,
    508       "justification": "Demonstrates a practical denial-of-service attack against a widely-used reasoning LLM that produces empty responses in API contexts."
    509     },
    510     "drama_conflict": {
    511       "score": 1,
    512       "justification": "Exposes a vulnerability in DeepSeek-R1 but without major controversy or accusations; primarily an academic security contribution."
    513     },
    514     "demo_ability": {
    515       "score": 0,
    516       "justification": "No code, demo, or released dataset; readers cannot reproduce the attack without implementing it themselves."
    517     },
    518     "brand_recognition": {
    519       "score": 2,
    520       "justification": "DeepSeek-R1 is a widely discussed model; the paper also involves GPT-4o and o3-mini from OpenAI."
    521     }
    522   }
    523 }

Impressum · Datenschutz