ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (32854B)


      1 {
      2   "paper": {
      3     "title": "Dialogue Injection Attack: Jailbreaking LLMs through Context Manipulation",
      4     "authors": [
      5       "Wenlong Meng",
      6       "Fan Zhang",
      7       "Wendao Yao",
      8       "Zhenyuan Guo",
      9       "Yuwei Li",
     10       "Chengkun Wei",
     11       "Wenzhi Chen"
     12     ],
     13     "year": 2025,
     14     "venue": "IEEE Transactions on Information Forensics and Security",
     15     "arxiv_id": "2503.08195",
     16     "doi": "10.1109/TIFS.2026.3657898"
     17   },
     18   "scan_version": 3,
     19   "active_modules": ["experimental_rigor", "data_leakage"],
     20   "methodology_tags": ["benchmark-eval"],
     21   "key_findings": "DIA introduces a novel jailbreak paradigm that exploits LLM chat template structure to inject fabricated dialogue history, achieving high attack success rates (0.89 on Llama-3.1-8B, 0.82 on GPT-4o after 10 queries on AdvBench). The paper reveals that deferred harmful responses exhibit higher log-likelihood than immediate ones, providing a mechanistic explanation for DIA-II's effectiveness. DIA bypasses 5 defense mechanisms with high pass rates (DIA-I: 0.93 average DPR). A counter-intuitive finding shows larger LLMs within a family tend to be more susceptible to jailbreak attacks, attributed to the usefulness-safety tradeoff.",
     22   "checklist": {
     23     "artifacts": {
     24       "code_released": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Section 1 footnote states 'Code is available at https://github.com/meng-wenlong/DIA' providing a GitHub repository URL."
     28       },
     29       "data_released": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "The three evaluation benchmarks (AdvBench, HEx-PHI, MaliciousInstruct) are all publicly available standard benchmarks downloaded from HuggingFace Datasets (Section 5.1). The paper notes 'We plan to open-source our generated affirmative beginnings after the acceptance of this paper' but the core evaluation datasets are unmodified public benchmarks."
     33       },
     34       "environment_specified": {
     35         "applies": true,
     36         "answer": false,
     37         "justification": "The paper mentions hardware (Intel Xeon 8358, 1TB memory, 4x A100 80G GPUs) and Ollama as the inference engine (Section 5.1), but provides no requirements.txt, Dockerfile, or detailed software dependency specifications."
     38       },
     39       "reproduction_instructions": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "No step-by-step reproduction instructions are provided in the paper. While code is released on GitHub, the paper itself contains no 'Reproducing Results' section or commands to replicate experiments."
     43       }
     44     },
     45     "statistical_methodology": {
     46       "confidence_intervals_or_error_bars": {
     47         "applies": true,
     48         "answer": false,
     49         "justification": "Tables 2, 3, and 6 report single ASR point estimates with no confidence intervals, error bars, or ± notation. Figures 5, 7, and 10 show ASR curves without uncertainty bands."
     50       },
     51       "significance_tests": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper claims DIA achieves 'superior ASRs' and 'state-of-the-art attack success rates' but no statistical significance tests (p-values, t-tests, etc.) are used to validate these comparative claims."
     55       },
     56       "effect_sizes_reported": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "Results are presented as raw ASR numbers in tables. While one can compare baseline vs. DIA values, the paper does not explicitly compute or discuss effect sizes, percentage improvements, or relative gains in a structured way."
     60       },
     61       "sample_size_justified": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "The paper uses AdvBench (520 prompts), HEx-PHI (330), and MaliciousInstruct (100) without justifying why these sizes are sufficient for the claims made. No power analysis is discussed."
     65       },
     66       "variance_reported": {
     67         "applies": true,
     68         "answer": false,
     69         "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be from single experimental runs with no indication of multiple runs or seed variation."
     70       }
     71     },
     72     "evaluation_design": {
     73       "baselines_included": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Four state-of-the-art black-box jailbreak attacks are compared: DeepInception, ReNe, PAIR, and DRA (Section 5.1). Results are compared in Tables 2, 3, 6 and Figures 5, 7, 10."
     77       },
     78       "baselines_contemporary": {
     79         "applies": true,
     80         "answer": true,
     81         "justification": "Baselines include DRA (USENIX Security 2024), PAIR (2023), DeepInception (2023), and ReNe (2023), all recent black-box jailbreak attacks representing current state of the art for a 2025 paper."
     82       },
     83       "ablation_study": {
     84         "applies": true,
     85         "answer": true,
     86         "justification": "Section 5.5 presents ablation studies: Table 4 removes individual dialogue components (system prompt replacement, hypnosis, answer guidance) and Figure 9 tests the prompt rewrite mechanism, measuring ASR impact of each."
     87       },
     88       "multiple_metrics": {
     89         "applies": true,
     90         "answer": true,
     91         "justification": "The paper uses ASR evaluated by two different safety classifiers (LlamaGuard-2 and LlamaGuard-3), Defense Pass Rate (DPR) in Section 5.6, and semantic similarity analysis in Section 5.2 (Figure 6)."
     92       },
     93       "human_evaluation": {
     94         "applies": true,
     95         "answer": false,
     96         "justification": "No human evaluation of attack outputs is performed. All evaluation relies on automated safety classifiers (LlamaGuard-2 and LlamaGuard-3). The paper acknowledges limitations of automated evaluation (Section 5.1) but does not supplement with human judgment."
     97       },
     98       "held_out_test_set": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "The full benchmarks are used directly for evaluation with no dev/test split. Hyperparameters (e.g., rewrite probabilities [0.3, 0.5, 0.2], keyword threshold ≥3) appear chosen without systematic validation on a separate set."
    102       },
    103       "per_category_breakdown": {
    104         "applies": true,
    105         "answer": true,
    106         "justification": "Results are broken down by model (10 LLMs), benchmark (3 datasets), evaluator (2 guard models), and model size (Figure 8). Tables 2, 3, 6 show per-model results for each method."
    107       },
    108       "failure_cases_discussed": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper discusses where attacks fail: DIA-I achieves 0.000 ASR on Llama-3.1-8B (Table 2), DRA fails on GPT-4o. Section 5.4 discusses the Llama-3 family's contrary model-size trend. Section 5.5 ablation reveals when component removal increases ASR (hypnosis on DIA-II/Gemma-2)."
    112       },
    113       "negative_results_reported": {
    114         "applies": true,
    115         "answer": true,
    116         "justification": "DIA-I reports 0.000 ASR on Llama-3.1-8B and Llama-3-70B (Table 2). The ablation shows removing hypnosis slightly increases DIA-II's ASR on Gemma-2-9B (Table 4). Section 5.2 notes DIA-I is 'less effective on Llama-3.1-8B compared to Llama-3-8B.'"
    117       }
    118     },
    119     "claims_and_evidence": {
    120       "abstract_claims_supported": {
    121         "applies": true,
    122         "answer": true,
    123         "justification": "Abstract claims of '0.89 on Llama-3.1-8B and 0.82 on GPT-4o' are supported by multi-query results in Figure 5. 'State-of-the-art attack success rates' is supported by Tables 2, 3, 6 where DIA leads on most models. 'Bypass 5 different defense mechanisms' is supported by Table 5."
    124       },
    125       "causal_claims_justified": {
    126         "applies": true,
    127         "answer": true,
    128         "justification": "Causal claims are supported by controlled ablation studies (Table 4, removing individual components to measure ASR impact) and the log-likelihood experiment (Figure 4) demonstrating the mechanism behind deferred responses. The ablation design uses controlled single-variable manipulation."
    129       },
    130       "generalization_bounded": {
    131         "applies": true,
    132         "answer": true,
    133         "justification": "The paper specifies it operates in 'black-box settings requiring only access to the chat API or knowledge of the LLM's chat template' (abstract). Results sections clearly identify the 10 specific models and 3 benchmarks tested. Claims about model size are bounded by model family."
    134       },
    135       "alternative_explanations_discussed": {
    136         "applies": true,
    137         "answer": true,
    138         "justification": "Section 5.2 discusses alignment differences between models explaining varying ASRs. Section 5.4 attributes the model-size susceptibility trend to the usefulness-safety tradeoff and explains Llama-3's contrary behavior via knowledge cutoff discrepancy. Section 5.2 analyzes ReNe's ASR gains as partly from semantic corruption."
    139       },
    140       "proxy_outcome_distinction": {
    141         "applies": true,
    142         "answer": true,
    143         "justification": "The paper explicitly discusses the gap between ASR measurement and actual attack success, noting that refusal-phrase-based evaluation 'may generate a substantial number of false positives' (Section 5.1), and uses safety classification models as a more rigorous proxy. In doing so, it addresses the construct validity of the metric."
    144       }
    145     },
    146     "setup_transparency": {
    147       "model_versions_specified": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Closed-source models are specified with exact versions: 'gpt-4o-2024-08-06' and 'gpt-4o-mini-2024-07-18' (Section 5.1). Open-source models are identified by family and size (Llama-3.1-8B, Gemma-2-9B, etc.) with 'official models provided by Ollama.' Guard models are specified as LlamaGuard-2 and LlamaGuard-3."
    151       },
    152       "prompts_provided": {
    153         "applies": true,
    154         "answer": false,
    155         "justification": "The paper describes prompt structures formally (Equations 1-2, Algorithms 1-3) and provides the template inference probe example, but does not provide the actual verbatim text of system replacement prompts, hypnosis conversations, continue commands, or answer guidance prompts used in DIA-I and DIA-II."
    156       },
    157       "hyperparameters_reported": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "Some hyperparameters are reported: rewrite operation probabilities [0.3, 0.5, 0.2], keyword retention threshold >0.5. However, LLM inference parameters (temperature, top-p, max tokens) are not reported for either attack generation or victim model querying."
    161       },
    162       "scaffolding_described": {
    163         "applies": false,
    164         "answer": false,
    165         "justification": "No agentic scaffolding is used. The attack consists of crafted prompts and auxiliary LLM calls in a pipeline, not an agentic system with tools, memory, or feedback loops."
    166       },
    167       "data_preprocessing_documented": {
    168         "applies": true,
    169         "answer": true,
    170         "justification": "Section 5.1 documents the data sources (HuggingFace Datasets) and characteristics (token lengths). Algorithms 1-2 describe the ABGM and SDGM pipelines for generating adversarial content. The attack construction process from benchmark prompts to final adversarial inputs is documented."
    171       }
    172     },
    173     "limitations_and_scope": {
    174       "limitations_section_present": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "The paper has no dedicated Limitations or Threats to Validity section. Section 7 (Conclusion) and Section 8 (Ethics Consideration) exist but neither substantively discusses methodological limitations."
    178       },
    179       "threats_to_validity_specific": {
    180         "applies": true,
    181         "answer": false,
    182         "justification": "No specific threats to validity are discussed. While the paper notes DIA-I's weakness on certain models (scattered observations), there is no structured discussion of threats to internal or external validity."
    183       },
    184       "scope_boundaries_stated": {
    185         "applies": true,
    186         "answer": false,
    187         "justification": "The paper does not explicitly state what the results do NOT show. There are no statements about what populations, settings, or model families are excluded from the claims, or what the attack cannot achieve."
    188       }
    189     },
    190     "data_integrity": {
    191       "raw_data_available": {
    192         "applies": true,
    193         "answer": false,
    194         "justification": "Raw experimental outputs (model responses, attack success/failure per prompt) are not released. While code is available, the actual experimental data for independent verification is not provided."
    195       },
    196       "data_collection_described": {
    197         "applies": true,
    198         "answer": true,
    199         "justification": "Section 5.1 describes data sources (AdvBench, HEx-PHI, MaliciousInstruct from HuggingFace), model deployment (Ollama for open-source, official APIs for closed-source), and hardware specifications (Intel Xeon 8358, 4x A100 GPUs)."
    200       },
    201       "recruitment_methods_described": {
    202         "applies": false,
    203         "answer": false,
    204         "justification": "No human participants. Data sources are standard public benchmarks."
    205       },
    206       "data_pipeline_documented": {
    207         "applies": true,
    208         "answer": true,
    209         "justification": "The full pipeline is documented: benchmark prompts → ABGM generates affirmative beginnings (Algorithm 1) → SDGM generates demonstrations → adversarial dialogue construction (Section 4) → LLM querying via Ollama/API → LlamaGuard evaluation. Figure 3 illustrates the construction process."
    210       }
    211     },
    212     "conflicts_of_interest": {
    213       "funding_disclosed": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding sources or acknowledgments section is present in the paper. There is no mention of grants, corporate sponsors, or funding agencies."
    217       },
    218       "affiliations_disclosed": {
    219         "applies": true,
    220         "answer": true,
    221         "justification": "Author affiliations are clearly listed: Zhejiang University and National University of Defense Technology. The authors are not affiliated with the companies whose models are tested (Meta, Google, OpenAI, Alibaba)."
    222       },
    223       "funder_independent_of_outcome": {
    224         "applies": true,
    225         "answer": false,
    226         "justification": "Cannot assess funder independence because no funding is disclosed. The absence of funding information prevents evaluation of this criterion."
    227       },
    228       "financial_interests_declared": {
    229         "applies": true,
    230         "answer": false,
    231         "justification": "No competing interests statement or financial interests declaration is present in the paper."
    232       }
    233     },
    234     "contamination": {
    235       "training_cutoff_stated": {
    236         "applies": true,
    237         "answer": false,
    238         "justification": "Training cutoffs are mentioned for Llama-3 models only: 'Llama-3-70B has a cutoff in December 2023, while Llama-3-8B's is in March 2023' (Section 5.4). Cutoff dates for the remaining 8 models (Llama-3.1, Gemma-2, Qwen-2, GPT-4o) are not stated."
    239       },
    240       "train_test_overlap_discussed": {
    241         "applies": true,
    242         "answer": true,
    243         "justification": "Section 2.3 discusses that 'LLM developer can add these jailbreak prompts to their alignment dataset, which can help the LLM detect and thwart such attacks.' Section 5.2 provides evidence: 'DeepInception and DRA experience ASR degradation by 67% and 99% on Llama-3.1-8B' compared to Llama-2."
    244       },
    245       "benchmark_contamination_addressed": {
    246         "applies": true,
    247         "answer": true,
    248         "justification": "The paper explicitly addresses that known attack patterns may be included in alignment training data, and demonstrates this empirically by showing sharp ASR degradation of existing attacks on newer models (Section 2.3, Section 5.2). This motivates their novel attack vector."
    249       }
    250     },
    251     "human_studies": {
    252       "pre_registered": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study. All experiments involve automated LLM querying and evaluation."
    256       },
    257       "irb_or_ethics_approval": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants. Section 8 discusses ethics considerations regarding responsible disclosure but not human subjects."
    261       },
    262       "demographics_reported": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants."
    266       },
    267       "inclusion_exclusion_criteria": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants."
    271       },
    272       "randomization_described": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants."
    276       },
    277       "blinding_described": {
    278         "applies": false,
    279         "answer": false,
    280         "justification": "No human participants."
    281       },
    282       "attrition_reported": {
    283         "applies": false,
    284         "answer": false,
    285         "justification": "No human participants."
    286       }
    287     },
    288     "cost_and_practicality": {
    289       "inference_cost_reported": {
    290         "applies": true,
    291         "answer": false,
    292         "justification": "No inference costs, API spend, tokens consumed, or wall-clock time per attack are reported. The multi-query approach requires up to 10 queries per prompt, for 520+ prompts on each of 10 models, but the total cost is never quantified."
    293       },
    294       "compute_budget_stated": {
    295         "applies": true,
    296         "answer": false,
    297         "justification": "Hardware is described (4x A100 80G GPUs) but total GPU hours, training time for any components, or total API spend are not quantified."
    298       }
    299     },
    300     "experimental_rigor": {
    301       "seed_sensitivity_reported": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from single experimental runs despite LLM output randomness."
    305       },
    306       "number_of_runs_stated": {
    307         "applies": true,
    308         "answer": false,
    309         "justification": "The number of experimental runs per configuration is not stated. For single-query experiments, it is unclear whether results are from one run or averaged across multiple runs."
    310       },
    311       "hyperparameter_search_budget": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "No hyperparameter search budget is reported. Parameters like rewrite probabilities [0.3, 0.5, 0.2] and keyword retention threshold appear arbitrarily chosen without documented search."
    315       },
    316       "best_config_selection_justified": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The selection of hyperparameters and configuration choices is not justified. No validation set or selection criterion is described for choosing the reported configuration."
    320       },
    321       "multiple_comparison_correction": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The paper makes comparisons across 6 methods × 10 models × 3 benchmarks × 2 evaluators with no correction for multiple comparisons applied."
    325       },
    326       "self_comparison_bias_addressed": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The authors reimplemented baselines on Ollama 'to ensure a fair comparison' (Section 5.1) and used the same auxiliary LLM (Gemma-2-27B) across methods. However, they do not explicitly acknowledge or address the bias of authors evaluating their own system vs. their reimplementation of baselines."
    330       },
    331       "compute_budget_vs_performance": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "Compute differences between methods are not discussed. DIA requires ABGM/SDGM preprocessing plus multi-turn prompting, while baselines vary in complexity (PAIR uses iterative refinement). No matched-compute comparison is provided."
    335       },
    336       "benchmark_construct_validity": {
    337         "applies": true,
    338         "answer": true,
    339         "justification": "Section 5.1 discusses limitations of refusal-phrase-based evaluation ('may generate a substantial number of false positives') and motivates using safety classification models (LlamaGuard-2, LlamaGuard-3) as more valid measures. Section 5.2 analyzes whether ReNe's high ASR reflects genuine attacks or semantic corruption (Figure 6)."
    340       },
    341       "scaffold_confound_addressed": {
    342         "applies": false,
    343         "answer": false,
    344         "justification": "No scaffolding is involved in the attack or evaluation. Methods are compared at the prompt/dialogue construction level."
    345       }
    346     },
    347     "data_leakage": {
    348       "temporal_leakage_addressed": {
    349         "applies": true,
    350         "answer": true,
    351         "justification": "The paper discusses the possibility that newer models have been aligned against benchmark attack prompts, noting ASR degradation on newer models (Sections 2.3 and 5.2). The authors use this as motivation for novel attack methods and discuss Llama-3 knowledge cutoffs (Section 5.4)."
    352       },
    353       "feature_leakage_addressed": {
    354         "applies": true,
    355         "answer": false,
    356         "justification": "No discussion of whether the evaluation setup leaks information. The paper does not address whether LlamaGuard models might have feature dependencies with the tested models or benchmarks."
    357       },
    358       "non_independence_addressed": {
    359         "applies": true,
    360         "answer": false,
    361         "justification": "No discussion of independence between benchmark prompts. AdvBench prompts may share structural similarities, and no analysis of within-benchmark non-independence is provided."
    362       },
    363       "leakage_detection_method": {
    364         "applies": true,
    365         "answer": false,
    366         "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap) is applied. The temporal leakage discussion is conceptual and based on observed ASR trends, not formal detection."
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "DIA achieves state-of-the-art attack success rates, reaching 0.89 ASR on Llama-3.1-8B and 0.82 on GPT-4o after 10 queries on AdvBench.",
    373       "evidence": "Tables 2-3, Figures 5 and 7 show DIA-I or DIA-II achieving the highest ASR on most models across AdvBench and HEx-PHI. Multi-query results in Figure 5 show the claimed ASR values after 10 iterations.",
    374       "supported": "moderate"
    375     },
    376     {
    377       "claim": "DIA can bypass 5 different defense mechanisms with high pass rates (DIA-I: 0.93 average DPR, DIA-II: 0.82).",
    378       "evidence": "Table 5 shows defense pass rates against OpenAI Moderation, Perplexity Filter, Defensive System Prompt, Defensive Prompt Patch, and Bergeron on AdvBench targeting Gemma-2-9B.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Deferred harmful responses exhibit higher log-likelihood than immediate responses following malicious prompts.",
    383       "evidence": "Figure 4 shows log-likelihood distributions with and without prepended benign text on Llama-3.1-8B and Llama-3.2-11B, demonstrating a distributional shift toward higher likelihood when benign context is prepended.",
    384       "supported": "moderate"
    385     },
    386     {
    387       "claim": "Larger LLMs within a model family are more susceptible to jailbreak attacks due to the usefulness-safety tradeoff.",
    388       "evidence": "Figure 8 shows ASR increasing with model size for Gemma-2, Llama-3.1, Llama-3.2, and Qwen-2 families, with the exception of Llama-3 (attributed to knowledge cutoff differences).",
    389       "supported": "weak"
    390     },
    391     {
    392       "claim": "Template inference attack can determine an LLM's chat template with approximately 90%+ accuracy using at most 10 queries.",
    393       "evidence": "Figure 2 shows inference accuracy vs. max try times for three LLMs with six template pair combinations. At NT_max=5, accuracy reaches approximately 0.9 or higher.",
    394       "supported": "moderate"
    395     },
    396     {
    397       "claim": "DIA's prompt rewrite algorithm contributes meaningfully to multi-query attack performance improvement.",
    398       "evidence": "Figure 9 compares multi-query ASR with and without the rewrite mechanism on Llama-2-7B and Llama-3-8B, showing slower ASR growth without rewriting, especially for DIA-I on Llama-2-7B where ASR remains near zero.",
    399       "supported": "strong"
    400     }
    401   ],
    402   "red_flags": [
    403     {
    404       "flag": "No error bars or variance reporting",
    405       "detail": "All results across Tables 2, 3, 6 and Figures 5, 7, 10 report single-run point estimates without any measure of variance, confidence intervals, or standard deviation. Given LLM output stochasticity, ASR values could vary meaningfully across runs."
    406     },
    407     {
    408       "flag": "No statistical significance tests",
    409       "detail": "Claims of superiority ('DIA achieves superior ASRs') are based purely on comparing point estimates across 6 methods × 10 models × 3 benchmarks. No significance tests are applied despite the large number of comparisons."
    410     },
    411     {
    412       "flag": "Missing limitations section",
    413       "detail": "The paper has no dedicated Limitations or Threats to Validity section. Key limitations (chat template knowledge requirement, model-specific effectiveness variation, generalizability to non-tested models) are not systematically discussed."
    414     },
    415     {
    416       "flag": "Defense evaluation limited to one model",
    417       "detail": "Table 5 evaluates defense bypass only on Gemma-2-9B. The claim of defense penetration may not generalize to other models, especially those where DIA shows lower baseline ASR (e.g., Llama-3.1-8B where DIA-I achieves 0.000)."
    418     },
    419     {
    420       "flag": "Baseline reimplementation concern",
    421       "detail": "Baselines were reimplemented by the authors on Ollama rather than run from their original implementations. While the authors claim this ensures a fair comparison, baselines reimplemented by a paper's own authors risk systematically underperforming the originals (Lucic et al., 2018). The PAIR results differ substantially from those in the original paper (0.317 vs. 0.040 on Llama-2)."
    422     },
    423     {
    424       "flag": "Model-size claim has notable exception",
    425       "detail": "The claim that larger models are more susceptible is contradicted by the Llama-3 family, which shows the opposite trend. The post-hoc explanation (knowledge cutoff discrepancy) is plausible but not verified, weakening the generality of the finding."
    426     }
    427   ],
    428   "cited_papers": [
    429     {
    430       "title": "Universal and transferable adversarial attacks on aligned language models",
    431       "authors": ["A. Zou", "Z. Wang", "N. Carlini", "M. Nasr", "J. Z. Kolter", "M. Fredrikson"],
    432       "year": 2023,
    433       "arxiv_id": "2307.15043",
    434       "relevance": "Introduces the GCG white-box jailbreak attack and the AdvBench benchmark used extensively in this paper's evaluation."
    435     },
    436     {
    437       "title": "DeepInception: Hypnotize large language model to be jailbreaker",
    438       "authors": ["X. Li", "Z. Zhou", "J. Zhu", "J. Yao", "T. Liu", "B. Han"],
    439       "year": 2023,
    440       "arxiv_id": "2311.03191",
    441       "relevance": "Key baseline black-box jailbreak attack using hypnotic virtual scenarios; demonstrates ASR degradation on newer LLMs."
    442     },
    443     {
    444       "title": "Making them ask and answer: Jailbreaking large language models in few queries via disguise and reconstruction",
    445       "authors": ["T. Liu", "Y. Zhang", "Z. Zhao", "Y. Dong", "G. Meng", "K. Chen"],
    446       "year": 2024,
    447       "relevance": "DRA baseline that conceals harmful instructions through word splitting; demonstrates complete failure on Llama-3.1-8B and GPT-4o."
    448     },
    449     {
    450       "title": "Jailbreaking black box large language models in twenty queries",
    451       "authors": ["P. Chao", "A. Robey", "E. Dobriban", "H. Hassani", "G. J. Pappas", "E. Wong"],
    452       "year": 2023,
    453       "relevance": "PAIR baseline using attacker LLM for automated jailbreak prompt generation; compared against DIA in both single and multi-query settings."
    454     },
    455     {
    456       "title": "Safety alignment should be made more than just a few tokens deep",
    457       "authors": ["X. Qi", "A. Panda", "K. Lyu", "X. Ma", "S. Roy", "A. Beirami", "P. Mittal", "P. Henderson"],
    458       "year": 2024,
    459       "arxiv_id": "2406.05946",
    460       "relevance": "Demonstrates shallow safety alignment vulnerability exploited by prefilling attacks; proposes token-wise constrained defense relevant to DIA-I."
    461     },
    462     {
    463       "title": "ChatBug: A common vulnerability of aligned LLMs induced by chat templates",
    464       "authors": ["F. Jiang", "Z. Xu", "L. Niu", "B. Y. Lin", "R. Poovendran"],
    465       "year": 2024,
    466       "arxiv_id": "2406.12935",
    467       "relevance": "Identifies chat template vulnerabilities in aligned LLMs, directly related to DIA's dialogue injection technique."
    468     },
    469     {
    470       "title": "JailbreakBench: An open robustness benchmark for jailbreaking large language models",
    471       "authors": ["P. Chao", "E. Debenedetti", "A. Robey", "M. Andriushchenko", "F. Croce"],
    472       "year": 2024,
    473       "arxiv_id": "2404.01318",
    474       "relevance": "Provides standardized evaluation framework for jailbreak attacks with manually constructed affirmative beginnings."
    475     },
    476     {
    477       "title": "Defensive prompt patch: A robust and interpretable defense of LLMs against jailbreak attacks",
    478       "authors": ["C. Xiong", "X. Qi", "P.-Y. Chen", "T.-Y. Ho"],
    479       "year": 2024,
    480       "arxiv_id": "2405.20099",
    481       "relevance": "Defense method evaluated against DIA; uses prompt-tuning to append safety reminders to user inputs."
    482     },
    483     {
    484       "title": "Bergeron: Combating adversarial attacks through a conscience-based alignment framework",
    485       "authors": ["M. Pisano", "P. Ly", "A. Sanders", "B. Yao", "D. Wang", "T. Strzalkowski", "M. Si"],
    486       "year": 2023,
    487       "arxiv_id": "2312.00029",
    488       "relevance": "Defense using secondary LLM monitoring evaluated against DIA; demonstrates DIA's ability to bypass multi-LLM defense systems."
    489     },
    490     {
    491       "title": "AutoDAN: Automatic and interpretable adversarial attacks on large language models",
    492       "authors": ["S. Zhu", "R. Zhang", "B. An", "G. Wu", "J. Barrow", "Z. Wang"],
    493       "year": 2023,
    494       "arxiv_id": "2310.15140",
    495       "relevance": "White-box jailbreak attack using dual optimization objectives for prompt readability, representative of gradient-based attack approaches."
    496     },
    497     {
    498       "title": "LLM-Fuzzer: Scaling assessment of large language model jailbreaks",
    499       "authors": ["J. Yu", "X. Lin", "Z. Yu", "X. Xing"],
    500       "year": 2024,
    501       "relevance": "Fuzz-testing approach to automated jailbreak template generation; represents automated attack methodology in the jailbreak landscape."
    502     },
    503     {
    504       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    505       "authors": ["X. Qi", "Y. Zeng", "T. Xie", "P.-Y. Chen", "R. Jia", "P. Mittal", "P. Henderson"],
    506       "year": 2023,
    507       "arxiv_id": "2310.03693",
    508       "relevance": "Source of the HEx-PHI benchmark used for evaluation; demonstrates safety degradation through fine-tuning."
    509     },
    510     {
    511       "title": "A comprehensive study of jailbreak attack versus defense for large language models",
    512       "authors": ["Z. Xu", "Y. Liu", "G. Deng", "Y. Li", "S. Picek"],
    513       "year": 2024,
    514       "relevance": "Introduces the Defense Pass Rate metric used in this paper's defense evaluation (Table 5)."
    515     }
    516   ],
    517   "engagement_factors": {
    518     "practical_relevance": {
    519       "score": 2,
    520       "justification": "Attack technique is directly usable for red-teaming LLM chat services; code is released on GitHub and requires only API access to execute."
    521     },
    522     "surprise_contrarian": {
    523       "score": 2,
    524       "justification": "Challenges the assumption that historical dialogue is safe from manipulation; counter-intuitive finding that larger models within a family tend to be MORE vulnerable to jailbreaks (the Llama-3 family is an exception)."
    525     },
    526     "fear_safety": {
    527       "score": 3,
    528       "justification": "Demonstrates a novel attack vector achieving high success rates on production LLMs including GPT-4o while bypassing 5 existing defense mechanisms."
    529     },
    530     "drama_conflict": {
    531       "score": 1,
    532       "justification": "Shows existing defenses are inadequate, but the work is presented in standard academic framing without strong provocation or naming-and-shaming."
    533     },
    534     "demo_ability": {
    535       "score": 2,
    536       "justification": "Code released on GitHub with Ollama-based setup; reproducible with local LLMs but requires GPU hardware and setup."
    537     },
    538     "brand_recognition": {
    539       "score": 1,
    540       "justification": "From Zhejiang University, a respected institution but not a household-name AI lab; testing GPT-4o adds some brand recognition."
    541     }
    542   }
    543 }

Impressum · Datenschutz