ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27388B)


      1 {
      2   "paper": {
      3     "title": "Jailbreaking Leading Safety-Aligned LLMs with Simple Adaptive Attacks",
      4     "authors": ["Maksym Andriushchenko", "Francesco Croce", "Nicolas Flammarion"],
      5     "year": 2024,
      6     "venue": "ICLR 2025",
      7     "arxiv_id": "2404.02151"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Simple adaptive jailbreaking attacks achieve 100% attack success rate on all leading safety-aligned LLMs including Llama-2-Chat, Llama-3, Gemma, GPT-3.5/4o, Claude 3/3.5, and the adversarially trained R2D2. The key insight is that adaptivity—tailoring attack strategy to each model's specific vulnerabilities (logprobs, prefilling, in-context learning)—is far more effective than any single static attack. The authors also won 1st place in the SaTML'24 Trojan Detection Competition using restricted random search on embedding-distance-selected token pools.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "Code, logs, and jailbreak artifacts are released at https://github.com/tml-epfl/llm-adaptive-attacks, as stated in the abstract and Section 1."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "They use the publicly available AdvBench dataset (50 harmful requests curated by Chao et al. 2023) and release jailbreak artifacts in JailbreakBench format."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No mention of requirements.txt, Dockerfile, or detailed environment specifications in the paper. Hardware is mentioned only briefly (single A100 GPU in Appendix C.2)."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper refers to supplementary code but does not include step-by-step reproduction instructions in the paper itself. Algorithm 1 describes the random search but no end-to-end reproduction guide is provided."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as point-estimate attack success rates (e.g., 100%, 96%) with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims its method outperforms baselines based solely on comparing ASR numbers (e.g., 100% vs 61%) without any statistical significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "ASR improvements are reported with baseline context, e.g., 'the previous best result on Claude 2.0 is 61% while we get 100%' (Section 4.4), providing clear magnitude of improvement."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The evaluation uses 50 harmful requests from AdvBench without justifying why 50 is sufficient. No power analysis or discussion of whether this sample size supports the 100% ASR claims."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance or standard deviation across runs is reported. Figure 3 shows non-determinism in GPT logprobs but does not report variance of attack success rates across multiple experimental runs."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Extensive baseline comparisons are provided: GCG, PAIR, TAP, PAP, Persona Modulation, AutoDAN, and others across Tables 1-4 and Tables 21-22."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include contemporary methods: PAP (Zeng et al. 2024), HarmBench (Mazeika et al. 2024), TAP (Mehrotra et al. 2023), and evaluations on recent models like GPT-4o and Claude 3.5 Sonnet."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "Ablations are provided: prompt alone vs. prompt+RS vs. prompt+RS+self-transfer (Tables 2-4), effect of suffix length (Appendix C.1, Figure 4), and ablations on Claude request structure (Tables 15-17)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Four different judges are used: GPT-4, rule-based judge, Llama-3-70B, and Llama Guard 2 (Table 20 and Appendix C.6)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The authors state 'we manually inspect all generations and flag cases with a significant number of false positives' (Section 3.1), providing human verification of the automated judge."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "The paper explicitly states 'all our suffixes are optimized to be request-specific and model-specific, i.e., there is no distinction between the training and test requests or models' (Section 3.2). For the trojan task, a validation set is used (Section 5)."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per model (Tables 1-4), per attack component (prompt alone, +RS, +self-transfer), and per judge type (Table 20)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "False positives of the GPT-4 judge are discussed (Appendix C.5, Table 19). Cases where the prompt alone fails (0% on Llama-2, GPT-4o) are reported. The judge is flagged as producing false positives on Claude 2.1 outputs."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Multiple negative results: prompt alone at 0% on Llama-2-Chat and GPT-4o, self-transfer ineffective on R2D2, transfer attack 0% on Claude 2.1 and Claude 3 Opus, longer suffixes hurting performance (Appendix C.1)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims 100% ASR on listed models, which is supported by Tables 1-4. The trojan competition first place claim is supported by Table 5. All abstract claims match the results."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims about which components matter are supported by controlled ablations: each component (prompt, RS, self-transfer) is tested incrementally (Tables 2-4), constituting adequate single-variable manipulation."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper explicitly states attacks are customized per model and that 'no single method can generalize across all target models' (Section 1). Results are bounded to the specific models and the 50 AdvBench requests tested."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "Section 6 discusses that a 10/10 judge score 'does not always imply that the generated content is actually beneficial for an attacker.' They discuss judge limitations, non-determinism effects (Figure 3), and potential defenses (Section 6, A.3)."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper explicitly acknowledges the gap between judge scores and actual harm: 'Even a perfect jailbreak score (10/10) from the GPT-4 judge does not always imply that the generated content is actually beneficial for an attacker' (Section 6). They use multiple judges to address this."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific API checkpoints are stated: 'gpt-3.5-turbo-1106, gpt-4-1106-preview, and gpt-4o-2024-05-13' (Section 4.3). Model sizes are given for open models (e.g., Llama-2-Chat-7B/13B/70B)."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompt templates are provided: main template in Figure 1, in-context prompt in Table 6, GPT-4o custom prompt in Table 7, judge prompt in Table 8, and all system prompts in Tables 9-14."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Key hyperparameters are reported: 25-token suffix length, up to 10,000 iterations, up to 10 random restarts, temperature zero for GPT models, temperature one for Claude restarts. Algorithm 1 formalizes the procedure."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The attacks are direct prompt-based methods with random search optimization, not multi-step agent workflows."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The paper documents using '50 harmful requests from AdvBench curated by Chao et al. (2023) that ensures distinct and diverse harmful requests' (Section 3.1). The trojan detection section documents the token selection pipeline in detail (Section 5)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 6 'Discussion, Recommendations, and Limitations' contains substantive discussion of limitations. Appendix A also covers ethics and additional discussion points."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats are discussed: GPT-4 judge false positives (especially on Claude 2.1), non-determinism of GPT models affecting random search signal (Figure 3), and that jailbreak judge scores don't guarantee harmful content utility (Section 6)."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper explicitly states scope: attacks were not tested against test-time defenses like SmoothLLM (Appendix A.3), results are specific to the 50 AdvBench behaviors, and that 'attacking SmoothLLM would require developing new adaptive attack' (Appendix A.3)."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "All jailbreak artifacts are released in JailbreakBench format at the GitHub repository, enabling independent verification of attack outputs."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The data source is clearly described: 50 harmful requests from AdvBench curated by Chao et al. (2023). The trojan competition setup references Rando & Tramèr (2024). Evaluation procedure is fully specified."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. The study evaluates LLM models using a standard benchmark dataset."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is clear: harmful request → prompt template → optional random search → LLM generation → judge evaluation. Algorithm 1 formalizes the optimization. The trojan detection pipeline is documented in Section 5."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Acknowledgements section discloses: Google Fellowship, Open Phil AI Fellowship, unrestricted gift from Google, Swiss National Science Foundation grant 212111. OpenAI API credits and Anthropic evaluation access also disclosed."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "All three authors are affiliated with EPFL, clearly stated on page 1. They are independent of the companies whose models they evaluate."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "The paper is partially funded by Google (an unrestricted gift and a Google Fellowship), and the authors received OpenAI API credits and Anthropic evaluation access. These companies' models are being evaluated for security weaknesses, creating a potential tension — though the findings are critical of all evaluated models."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper tests attack effectiveness against safety alignment, not model knowledge on a benchmark. Contamination of the AdvBench requests in training data would not meaningfully affect attack success rates — the models are supposed to refuse these requests."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not applicable — the paper tests jailbreaking defenses rather than model knowledge capabilities. The AdvBench requests are intended to be recognized as harmful and refused."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not applicable — this is a security/red-teaming study testing defense robustness, not a capability benchmark where contamination would inflate scores."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Appendix C.2 reports wall-clock time: '4000 iterations of random search on Llama-3-8B take 20.9 minutes on a single A100 GPU' and 'the total time of the whole experiment does not exceed a few hours.'"
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "While per-experiment time is given for one model, total API costs for GPT and Claude evaluations are not quantified. Hardware is mentioned (single A100) but total compute budget is not stated."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No results across multiple random seeds are reported. Figure 3 shows non-determinism in GPT logprobs but ASR results are not shown across seeds."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Number of restarts is stated throughout: 'up to 10 random restarts' (Section 3.2), and specific restart counts for Claude models in Tables 15-17 (1, 10, 100 restarts)."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The prompt template was optimized on GPT-3.5 and the suffix length of 25 was selected via ablation (Figure 4), but the total search budget for template design is not quantified."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "Figure 4 shows the ablation over suffix lengths with 25 performing optimally. The prompt template optimization process is described (maximizing logprob of 'Sure' on GPT-3.5). Selection criteria are transparent."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors compare their attack against baselines but do not discuss the bias of implementing and tuning their own method while using reported numbers for baselines. Some baseline numbers come from different datasets/judges (marked with * in tables)."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Figure 2 shows convergence curves (ASR and logprob vs. iterations) for different models with and without self-transfer, explicitly showing performance as a function of compute (iterations)."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper discusses what ASR actually measures: 'Even a perfect jailbreak score does not always imply that the generated content is actually beneficial for an attacker' (Section 6). Multiple judges are used to assess construct validity (Table 20). False positive analysis is provided (Appendix C.5)."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved. Attacks are direct prompt-based methods applied to model APIs."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": false,
    341         "answer": false,
    342         "justification": "Not applicable — the paper tests attack robustness of safety alignment, not model knowledge. AdvBench requests are meant to be refused regardless of training data."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": false,
    346         "answer": false,
    347         "justification": "Not applicable — this is a red-teaming/attack study, not a capability benchmark where feature leakage would be meaningful."
    348       },
    349       "non_independence_addressed": {
    350         "applies": false,
    351         "answer": false,
    352         "justification": "Not applicable — the 50 AdvBench requests are a fixed test set for attack evaluation, not a train/test split scenario."
    353       },
    354       "leakage_detection_method": {
    355         "applies": false,
    356         "answer": false,
    357         "justification": "Not applicable — contamination/leakage is not a meaningful concern for red-teaming studies testing defense robustness."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Adaptive jailbreaking attacks achieve 100% attack success rate on all leading safety-aligned LLMs tested, including Llama-2-Chat, GPT-4o, and Claude 3.5 Sonnet.",
    364       "evidence": "Tables 1-4 show 100% ASR across 15+ models using GPT-4 as judge on 50 AdvBench behaviors. Table 22 provides comprehensive results.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Self-transfer is key for query efficiency and high attack success rate on resistant models like Llama-2-Chat.",
    369       "evidence": "Table 2 shows Llama-2-Chat-7B goes from 50% ASR (prompt+RS) to 100% (prompt+RS+self-transfer). Figure 2 convergence curves show faster convergence with self-transfer initialization.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "The prefilling attack achieves 100% ASR on all Claude models without requiring any search.",
    374       "evidence": "Table 4 shows 100% ASR with prefilling on Claude 2.0, 2.1, 3 Haiku, 3 Sonnet, 3 Opus, and 3.5 Sonnet. Tables 16-17 provide ablations.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "No single attack method generalizes across all target models; adaptivity is crucial.",
    379       "evidence": "The prompt alone achieves 0% on Llama-2 (Table 2) but 100% on GPT-3.5 (Table 3). In-context prompt works on R2D2 (90%) but not Llama-2 (0%). Transfer attack works on Claude 3 Sonnet (100%) but not Claude 3 Opus (0%).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Random search on restricted token sets won first place in the SaTML'24 Trojan Detection Competition.",
    384       "evidence": "Table 5 shows their method achieves best total score (-30.22 vs -29.21 for 2nd place). The embedding-distance token selection strategy is described in Section 5.",
    385       "supported": "strong"
    386     },
    387     {
    388       "claim": "The adversarially trained R2D2 model is vulnerable to in-context learning prompts, achieving 100% ASR.",
    389       "evidence": "Table 2 shows in-context prompt alone at 90% ASR, boosted to 100% with random search, vs. 61% best prior result from Mazeika et al. (2024). However, prior results used different request sets and judges.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "Non-comparable baselines",
    396       "detail": "Many baseline numbers (marked with * in tables) come from different request sets, different judges (distilled GPT-4 vs GPT-4), or different experimental conditions, making direct ASR comparison imprecise. The paper acknowledges this with footnotes but still presents the numbers side by side."
    397     },
    398     {
    399       "flag": "GPT-4 judge false positives",
    400       "detail": "The paper acknowledges GPT-4 judge produces false positives, particularly on Claude 2.1 ('around 20%'). This means the reported 100% ASR on Claude 2.1 is inflated. The paper flags this but still reports 100%."
    401     },
    402     {
    403       "flag": "No variance reporting",
    404       "detail": "Attack success rates are reported as single numbers despite the stochastic nature of random search and LLM non-determinism (Figure 3). Without variance across runs, it is unclear how stable the 100% ASR claim is."
    405     },
    406     {
    407       "flag": "Small evaluation set",
    408       "detail": "All main results use only 50 harmful requests. While this is a standard benchmark, 100% ASR on 50 examples with no confidence intervals provides limited statistical power to distinguish, say, true 98% from true 100% ASR."
    409     }
    410   ],
    411   "cited_papers": [
    412     {
    413       "title": "Universal and transferable adversarial attacks on aligned language models",
    414       "authors": ["Andy Zou", "Zifan Wang", "J Zico Kolter", "Matt Fredrikson"],
    415       "year": 2023,
    416       "arxiv_id": "2307.15043",
    417       "relevance": "Introduced GCG attack and AdvBench benchmark — the primary baseline and evaluation dataset used in this paper."
    418     },
    419     {
    420       "title": "Jailbreaking black box large language models in twenty queries",
    421       "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J Pappas", "Eric Wong"],
    422       "year": 2023,
    423       "arxiv_id": "2310.08419",
    424       "relevance": "Introduced PAIR attack and the curated 50-behavior subset of AdvBench used throughout this paper."
    425     },
    426     {
    427       "title": "HarmBench: A standardized evaluation framework for automated red teaming and robust refusal",
    428       "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin", "Andy Zou"],
    429       "year": 2024,
    430       "relevance": "Standardized jailbreak evaluation framework; source of R2D2 adversarially trained model and baseline numbers."
    431     },
    432     {
    433       "title": "On adaptive attacks to adversarial example defenses",
    434       "authors": ["Florian Tramèr", "Nicholas Carlini", "Wieland Brendel", "Aleksander Madry"],
    435       "year": 2020,
    436       "relevance": "Established the definition of adaptive attacks used in this paper — fundamental to the paper's methodology and argument."
    437     },
    438     {
    439       "title": "Training a helpful and harmless assistant with reinforcement learning from human feedback",
    440       "authors": ["Yuntao Bai", "Andy Jones", "Kamal Ndousse"],
    441       "year": 2022,
    442       "arxiv_id": "2204.05862",
    443       "relevance": "Foundational RLHF safety alignment work that the jailbreaking attacks in this paper aim to circumvent."
    444     },
    445     {
    446       "title": "JailbreakBench: An open robustness benchmark for jailbreaking large language models",
    447       "authors": ["Patrick Chao", "Edoardo Debenedetti", "Alexander Robey", "Maksym Andriushchenko"],
    448       "year": 2024,
    449       "relevance": "Standardized jailbreaking benchmark; the paper's artifacts are released in JailbreakBench format."
    450     },
    451     {
    452       "title": "How Johnny can persuade LLMs to jailbreak them: Rethinking persuasion to challenge AI safety by humanizing LLMs",
    453       "authors": ["Yi Zeng", "Hongpeng Lin", "Jingwen Zhang"],
    454       "year": 2024,
    455       "arxiv_id": "2401.06373",
    456       "relevance": "PAP attack achieving 92-94% ASR on Llama-2/GPT-3.5 — one of the strongest prior baselines."
    457     },
    458     {
    459       "title": "Improving alignment and robustness with circuit breakers",
    460       "authors": ["Andy Zou", "Long Phan", "Justin Wang"],
    461       "year": 2024,
    462       "relevance": "Representation-based defense approach discussed as a promising defense direction against adaptive attacks."
    463     },
    464     {
    465       "title": "Safety alignment should be made more than just a few tokens deep",
    466       "authors": ["Xiangyu Qi", "Ashwinee Panda", "Kaifeng Lyu"],
    467       "year": 2025,
    468       "relevance": "Subsequent work that reused the prefilling attack and jailbreak template from this paper."
    469     },
    470     {
    471       "title": "AgentHarm: A benchmark for measuring harmfulness of LLM agents",
    472       "authors": ["Maksym Andriushchenko", "Alexandra Souly", "Mateusz Dziemian"],
    473       "year": 2025,
    474       "relevance": "Extension of jailbreaking to the LLM agent setting; reused techniques from this paper."
    475     },
    476     {
    477       "title": "Refusal-trained LLMs are easily jailbroken as browser agents",
    478       "authors": ["Priyanshu Kumar", "Elaine Lau", "Saranya Vijayakumar"],
    479       "year": 2025,
    480       "relevance": "Demonstrates jailbreaking in agentic browser settings, building on techniques from this paper."
    481     },
    482     {
    483       "title": "Many-shot jailbreaking",
    484       "authors": ["Cem Anil", "Esin Durmus", "Mrinank Sharma"],
    485       "year": 2024,
    486       "relevance": "In-context learning jailbreaking approach from Anthropic; compared against in Appendix C.7."
    487     }
    488   ]
    489 }

Impressum · Datenschutz