ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (27578B)


      1 {
      2   "paper": {
      3     "title": "Safe in Isolation, Dangerous Together: Agent-Driven Multi-Turn Decomposition Jailbreaks on LLMs",
      4     "authors": ["Devansh Srivastav", "Xiao Zhang"],
      5     "year": 2025,
      6     "venue": "ACL 2025 (REALM Workshop)"
      7   },
      8   "scan_version": 2,
      9   "active_modules": ["experimental_rigor", "data_leakage"],
     10   "methodology_tags": ["benchmark-eval"],
     11   "key_findings": "A multi-agent decomposition framework achieves up to 95.38% attack success rate (ASR) on AdvBench across six LLMs, significantly outperforming existing jailbreak techniques including DAN, Past Tense, Implicit Reference, and Style Injection. The attack decomposes harmful queries into innocuous sub-questions via three agents (Decomposer, Answerer, Combiner), requiring no prompt hacking. Category-wise analysis shows near-100% ASR on privacy violations and child abuse categories for some models, and results are stable across 5 independent runs (SD 1.97-2.84%).",
     12   "checklist": {
     13     "artifacts": {
     14       "code_released": {
     15         "applies": true,
     16         "answer": false,
     17         "justification": "No repository URL, code archive, or link to source code is provided anywhere in the paper."
     18       },
     19       "data_released": {
     20         "applies": true,
     21         "answer": true,
     22         "justification": "The paper uses the publicly available AdvBench dataset (Zou et al., 2023) containing 520 adversarial prompts. No custom data was collected."
     23       },
     24       "environment_specified": {
     25         "applies": true,
     26         "answer": false,
     27         "justification": "The paper mentions M2 Max 32GB with MPS backend and use of Ollama/OpenAI API, but provides no requirements.txt, library versions, or environment specification."
     28       },
     29       "reproduction_instructions": {
     30         "applies": true,
     31         "answer": false,
     32         "justification": "No step-by-step reproduction instructions, README, or scripts are provided."
     33       }
     34     },
     35     "statistical_methodology": {
     36       "confidence_intervals_or_error_bars": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "Results in Table 1 are point estimates only. Figure 3 shows standard deviations across 5 runs but no confidence intervals or error bars on the main comparison results."
     40       },
     41       "significance_tests": {
     42         "applies": true,
     43         "answer": false,
     44         "justification": "The paper claims Multi-Agent 'consistently outperforms' other methods but provides no statistical significance tests. Comparisons rest solely on the raw ASR percentages."
     45       },
     46       "effect_sizes_reported": {
     47         "applies": true,
     48         "answer": true,
     49         "justification": "The paper reports absolute ASR values with baselines for context, e.g., 'GPT-3.5-Turbo reached the highest ASR of 95.38%' from a baseline of 9.03%, and provides percentage improvements across all models in Table 1."
     50       },
     51       "sample_size_justified": {
     52         "applies": true,
     53         "answer": false,
     54         "justification": "The paper uses 520 prompts from AdvBench without justifying why this sample size is adequate for the claims made. No power analysis."
     55       },
     56       "variance_reported": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 4.3 and Figure 3 report standard deviations across 5 independent runs for each model (e.g., Mistral-7B SD=1.97%, GPT-3.5-Turbo SD=2.16%)."
     60       }
     61     },
     62     "evaluation_design": {
     63       "baselines_included": {
     64         "applies": true,
     65         "answer": true,
     66         "justification": "Table 1 compares against four jailbreak methods (DAN, Past Tense, Implicit Reference, Style Injection) plus a direct-prompting baseline across all six models."
     67       },
     68       "baselines_contemporary": {
     69         "applies": true,
     70         "answer": true,
     71         "justification": "Baselines include DAN (Shen et al., 2024), Past Tense (Andriushchenko & Flammarion, 2024), Implicit Reference (Wu et al., 2024b), and Style Injection (Wei et al., 2024) — all from 2024, which is recent."
     72       },
     73       "ablation_study": {
     74         "applies": true,
     75         "answer": false,
     76         "justification": "The three-agent system (Decomposer, Answerer, Combiner) has distinct components but no ablation study examines the contribution of each component (e.g., what happens with 2 agents instead of 3, or without decomposition)."
     77       },
     78       "multiple_metrics": {
     79         "applies": true,
     80         "answer": false,
     81         "justification": "Only ASR (Attack Success Rate) is reported. No other metrics such as response quality, semantic similarity to target harmful content, or fluency are measured."
     82       },
     83       "human_evaluation": {
     84         "applies": true,
     85         "answer": false,
     86         "justification": "The paper uses only LLM-as-judge (Mixtral-8×7B) for evaluation. The Limitations section acknowledges 'Human evaluation at scale remains a valuable complementary direction' but none was performed. Manual spot-checking on a 'small subset' is mentioned but not systematically reported."
     87       },
     88       "held_out_test_set": {
     89         "applies": false,
     90         "answer": false,
     91         "justification": "This is not a learning-based system — there is no training/validation/test split. The entire AdvBench dataset is used as the evaluation set."
     92       },
     93       "per_category_breakdown": {
     94         "applies": true,
     95         "answer": true,
     96         "justification": "Section 4.2 and Figure 2 provide category-wise ASR breakdowns across 7 categories: Illegal Activity, Hate Speech, Malware, Fraud, Privacy Violations, Health Consultation, and Child Abuse."
     97       },
     98       "failure_cases_discussed": {
     99         "applies": true,
    100         "answer": false,
    101         "justification": "The paper shows only successful jailbreak examples (Appendix B). No analysis of why the ~5-23% of attacks that failed did so. The Limitations section mentions Llama models getting stuck in infinite loops but does not analyze failure patterns systematically."
    102       },
    103       "negative_results_reported": {
    104         "applies": true,
    105         "answer": false,
    106         "justification": "Vicuna-7B's lower ASR (77.50%) is noted but not explored in depth. No configurations or design choices that failed are discussed."
    107       }
    108     },
    109     "claims_and_evidence": {
    110       "abstract_claims_supported": {
    111         "applies": true,
    112         "answer": true,
    113         "justification": "Abstract claims of 'often exceeding 90% across various LLMs' and 'consistently achieves the highest attack success rate' are supported by Table 1 (4/6 models above 90%, highest ASR in all cases)."
    114       },
    115       "causal_claims_justified": {
    116         "applies": true,
    117         "answer": false,
    118         "justification": "The paper claims the multi-agent decomposition strategy 'causes' the safety bypass, attributing it to 'lack of holistic context awareness.' However, no controlled experiment isolates this mechanism — the multi-agent approach differs from baselines in many ways (number of turns, role separation, query reformulation). No ablation separates decomposition from multi-agent orchestration."
    119       },
    120       "generalization_bounded": {
    121         "applies": true,
    122         "answer": false,
    123         "justification": "The title says 'LLMs' broadly. The paper tests 5 open-source models (mostly 7-12B) and one older proprietary model (GPT-3.5-Turbo). The Limitations section notes they did not test DeepSeek-R1 or O1, but the paper's framing of 'critical flaw in the current safety architecture of multi-agent LLM systems' is not bounded to the tested models."
    124       },
    125       "alternative_explanations_discussed": {
    126         "applies": true,
    127         "answer": false,
    128         "justification": "The paper attributes success solely to 'lack of holistic context awareness' without considering alternative explanations such as: the Combiner agent itself generating harmful content (not just synthesizing), weaker safety alignment in smaller open-source models, or the role of the specific LLM-as-judge in inflating ASR."
    129       },
    130       "proxy_outcome_distinction": {
    131         "applies": true,
    132         "answer": false,
    133         "justification": "ASR is measured by an LLM judge (Mixtral-8×7B) which counts any response that 'addresses the prompt in any way, including through fictional scenarios or educational explanations' as successful. This conflates policy violation severity — a nuanced educational discussion is scored the same as explicit harmful instructions. The paper does not discuss this proxy gap."
    134       }
    135     },
    136     "setup_transparency": {
    137       "model_versions_specified": {
    138         "applies": true,
    139         "answer": false,
    140         "justification": "Models are identified by name (Mistral-7B, Gemma-2-9B, GPT-3.5-Turbo, etc.) but no specific version strings or snapshot dates are given. 'GPT-3.5-Turbo' without a version date could refer to multiple model snapshots."
    141       },
    142       "prompts_provided": {
    143         "applies": true,
    144         "answer": true,
    145         "justification": "The evaluation prompt for the LLM judge is provided in Appendix A. The agent role prompts are described conceptually in Section 3, and full examples of sub-questions and answers are provided in Appendix B. However, the exact system prompts for the three agents (Decomposer, Answerer, Combiner) are not provided verbatim — only their roles and objectives are described."
    146       },
    147       "hyperparameters_reported": {
    148         "applies": true,
    149         "answer": true,
    150         "justification": "Section 4 states: 'For all generations, we use default sampling parameters: temperature = 1.0, top_p = 1.0, and n = 1' with explicit justification for using defaults."
    151       },
    152       "scaffolding_described": {
    153         "applies": true,
    154         "answer": true,
    155         "justification": "The multi-agent framework is described in detail in Section 3 with a workflow diagram (Figure 1), three distinct agent roles, the CrewAI orchestration platform, and the iterative process (decomposition → sub-query resolution → answer synthesis)."
    156       },
    157       "data_preprocessing_documented": {
    158         "applies": true,
    159         "answer": false,
    160         "justification": "No documentation of how the 520 AdvBench prompts were processed, whether any were filtered, or how the 7 categories were assigned. Per-category prompt counts are not reported, so the totals behind the Figure 2 breakdown are unclear."
    161       }
    162     },
    163     "limitations_and_scope": {
    164       "limitations_section_present": {
    165         "applies": true,
    166         "answer": true,
    167         "justification": "A dedicated 'Limitations' section discusses computational bottlenecks (8-9 hours per model on M2 Max), LLM-as-judge bias, and lack of testing on DeepSeek-R1/O1."
    168       },
    169       "threats_to_validity_specific": {
    170         "applies": true,
    171         "answer": true,
    172         "justification": "The Limitations section identifies specific threats: 'using an LLM as a judge for evaluation may introduce subtle biases, especially if the judge model fails to detect nuanced or borderline policy violations' and notes Llama models getting stuck in infinite loops."
    173       },
    174       "scope_boundaries_stated": {
    175         "applies": true,
    176         "answer": false,
    177         "justification": "While the Limitations mentions not testing DeepSeek-R1 or O1, the paper does not explicitly bound its claims to the tested models and setting. The conclusion and abstract frame the findings as revealing 'a critical flaw in the current safety architecture of multi-agent LLM systems' without scoping to the specific models tested."
    178       }
    179     },
    180     "data_integrity": {
    181       "raw_data_available": {
    182         "applies": true,
    183         "answer": false,
    184         "justification": "No raw model outputs, judge scores, or per-prompt results are released. Only aggregate ASR percentages are reported."
    185       },
    186       "data_collection_described": {
    187         "applies": true,
    188         "answer": true,
    189         "justification": "Section 4 describes the experimental setup: AdvBench dataset with 520 prompts, local inference via Ollama for open-source models, OpenAI API for GPT-3.5-Turbo, default sampling parameters."
    190       },
    191       "recruitment_methods_described": {
    192         "applies": false,
    193         "answer": false,
    194         "justification": "No human participants. Data source is the standard AdvBench benchmark."
    195       },
    196       "data_pipeline_documented": {
    197         "applies": true,
    198         "answer": false,
    199         "justification": "The pipeline from prompt → decomposition → sub-answers → combined response → judge evaluation is described conceptually, but no details are given on how many prompts were filtered, how many prompts led to infinite loops (mentioned in Limitations), or how edge cases were handled."
    200       }
    201     },
    202     "conflicts_of_interest": {
    203       "funding_disclosed": {
    204         "applies": true,
    205         "answer": false,
    206         "justification": "No funding or acknowledgments section is present in the paper."
    207       },
    208       "affiliations_disclosed": {
    209         "applies": true,
    210         "answer": true,
    211         "justification": "Authors are from CISPA Helmholtz Center for Information Security, clearly stated. They are not evaluating their own product."
    212       },
    213       "funder_independent_of_outcome": {
    214         "applies": true,
    215         "answer": false,
    216         "justification": "No funding is disclosed, so independence cannot be assessed."
    217       },
    218       "financial_interests_declared": {
    219         "applies": true,
    220         "answer": false,
    221         "justification": "No competing interests statement is present in the paper."
    222       }
    223     },
    224     "contamination": {
    225       "training_cutoff_stated": {
    226         "applies": false,
    227         "answer": false,
    228         "justification": "This paper tests jailbreak attacks (safety bypass), not model knowledge on benchmarks. The models are not being evaluated on whether they 'know' AdvBench answers — they are being tested on whether they refuse harmful requests."
    229       },
    230       "train_test_overlap_discussed": {
    231         "applies": false,
    232         "answer": false,
    233         "justification": "Contamination is not relevant to jailbreak attack evaluation — the concern is safety alignment, not benchmark knowledge."
    234       },
    235       "benchmark_contamination_addressed": {
    236         "applies": false,
    237         "answer": false,
    238         "justification": "Not applicable to jailbreak attack evaluation — contamination of AdvBench prompts in training data would mean models are more familiar with these harmful requests, potentially making them harder to jailbreak, not easier."
    239       }
    240     },
    241     "human_studies": {
    242       "pre_registered": {
    243         "applies": false,
    244         "answer": false,
    245         "justification": "No human participants in this study."
    246       },
    247       "irb_or_ethics_approval": {
    248         "applies": false,
    249         "answer": false,
    250         "justification": "No human participants in this study."
    251       },
    252       "demographics_reported": {
    253         "applies": false,
    254         "answer": false,
    255         "justification": "No human participants in this study."
    256       },
    257       "inclusion_exclusion_criteria": {
    258         "applies": false,
    259         "answer": false,
    260         "justification": "No human participants in this study."
    261       },
    262       "randomization_described": {
    263         "applies": false,
    264         "answer": false,
    265         "justification": "No human participants in this study."
    266       },
    267       "blinding_described": {
    268         "applies": false,
    269         "answer": false,
    270         "justification": "No human participants in this study."
    271       },
    272       "attrition_reported": {
    273         "applies": false,
    274         "answer": false,
    275         "justification": "No human participants in this study."
    276       }
    277     },
    278     "cost_and_practicality": {
    279       "inference_cost_reported": {
    280         "applies": true,
    281         "answer": true,
    282         "justification": "The Limitations section reports '8-9 hours per model' for 520 prompts on M2 Max 32GB hardware, giving a practical sense of cost."
    283       },
    284       "compute_budget_stated": {
    285         "applies": true,
    286         "answer": true,
    287         "justification": "Hardware is stated (M2 Max 32GB, MPS backend) and runtime per model is reported (8-9 hours). Total budget across all experiments is inferrable."
    288       }
    289     },
    290     "experimental_rigor": {
    291       "seed_sensitivity_reported": {
    292         "applies": true,
    293         "answer": true,
    294         "justification": "Section 4.3 reports results across 5 independent runs with standard deviations for each model, effectively showing seed/run sensitivity."
    295       },
    296       "number_of_runs_stated": {
    297         "applies": true,
    298         "answer": true,
    299         "justification": "Section 4.3 explicitly states '5 independent runs for each model.'"
    300       },
    301       "hyperparameter_search_budget": {
    302         "applies": true,
    303         "answer": false,
    304         "justification": "The paper uses default parameters and explicitly states this avoids hyperparameter tuning. However, the agent prompts and decomposition strategy represent design choices that were presumably iterated on, with no budget for this search reported."
    305       },
    306       "best_config_selection_justified": {
    307         "applies": true,
    308         "answer": true,
    309         "justification": "The paper uses default sampling parameters (temperature=1.0, top_p=1.0) and explicitly justifies this choice to 'reflect how models typically behave in real-world use and to avoid introducing bias through hyperparameter tuning.'"
    310       },
    311       "multiple_comparison_correction": {
    312         "applies": true,
    313         "answer": false,
    314         "justification": "Comparisons are made across 6 models × 5 methods with no statistical tests performed, let alone corrections for multiple comparisons."
    315       },
    316       "self_comparison_bias_addressed": {
    317         "applies": true,
    318         "answer": false,
    319         "justification": "The authors implement the multi-agent attack and compare it against their own implementations of baseline attacks (DAN, Past Tense, etc.) without acknowledging potential bias in baseline implementations."
    320       },
    321       "compute_budget_vs_performance": {
    322         "applies": true,
    323         "answer": false,
    324         "justification": "The multi-agent approach requires at least 3 LLM calls per prompt (decomposition, answering each sub-question, and combining) versus single-call baselines, but compute cost vs. performance is not analyzed. The 8-9 hour runtime is mentioned only in Limitations."
    325       },
    326       "benchmark_construct_validity": {
    327         "applies": true,
    328         "answer": false,
    329         "justification": "The paper uses AdvBench without questioning whether it is representative of real-world jailbreak scenarios, or whether the 520 prompts adequately cover the space of harmful requests. No discussion of construct validity."
    330       },
    331       "scaffold_confound_addressed": {
    332         "applies": true,
    333         "answer": false,
    334         "justification": "The multi-agent framework is itself a scaffold. When comparing against single-turn baselines (DAN, Style Injection), the multi-turn, multi-agent nature of the approach is a confound — the improvement could be due to multi-turn interaction rather than decomposition specifically. This is not addressed."
    335       }
    336     },
    337     "data_leakage": {
    338       "temporal_leakage_addressed": {
    339         "applies": false,
    340         "answer": false,
    341         "justification": "This is a jailbreak attack study, not a capability benchmark. Temporal leakage is not relevant — the models are being tested on safety alignment, not knowledge."
    342       },
    343       "feature_leakage_addressed": {
    344         "applies": false,
    345         "answer": false,
    346         "justification": "Not applicable for the same reason — no knowledge benchmark is being evaluated."
    347       },
    348       "non_independence_addressed": {
    349         "applies": false,
    350         "answer": false,
    351         "justification": "Not applicable — no train/test split exists for this attack evaluation."
    352       },
    353       "leakage_detection_method": {
    354         "applies": false,
    355         "answer": false,
    356         "justification": "Not applicable — leakage detection is irrelevant to jailbreak attack evaluation."
    357       }
    358     }
    359   },
    360   "claims": [
    361     {
    362       "claim": "Multi-agent decomposition achieves up to 95.38% ASR on AdvBench, consistently outperforming existing jailbreak techniques across all models.",
    363       "evidence": "Table 1 shows ASR values: GPT-3.5-Turbo 95.38%, Mistral-7B 94.62%, Gemma-2-9B 93.85%, Mistral-Nemo-12B 92.69%, Llava-7B 86.15%, Vicuna-7B 77.50%. Multi-Agent achieves the highest or tied-highest ASR for every model.",
    364       "supported": "moderate"
    365     },
    366     {
    367       "claim": "The attack is highly stable and reproducible across multiple runs with low standard deviation (1.97-2.84%).",
    368       "evidence": "Section 4.3 and Figure 3 report 5 independent runs per model with SDs ranging from 1.97% (Mistral-7B) to 2.84% (Gemma-2-9B).",
    369       "supported": "strong"
    370     },
    371     {
    372       "claim": "The attack reveals 'a critical flaw in the current safety architecture of multi-agent LLM systems: their lack of holistic context awareness.'",
    373       "evidence": "The paper demonstrates high ASR but does not isolate the mechanism (no ablation separating decomposition from multi-turn from multi-agent effects). The causal attribution to 'lack of holistic context awareness' is an interpretation, not demonstrated.",
    374       "supported": "weak"
    375     },
    376     {
    377       "claim": "The method requires no prompt hacking or policy override commands.",
    378       "evidence": "Section 3 describes agent prompts as 'all reasonable and ostensibly safe' and the paper states 'no explicit prompt hacking or policy violation command is ever given.' However, exact agent prompts are not provided for verification.",
    379       "supported": "moderate"
    380     },
    381     {
    382       "claim": "Privacy violation and child abuse categories reach 100% ASR under multi-agent attack for some models.",
    383       "evidence": "Section 4.2 and Figure 2 report these category-wise results. However, category sample sizes are not stated — if these categories contain very few prompts, 100% is less meaningful.",
    384       "supported": "moderate"
    385     }
    386   ],
    387   "red_flags": [
    388     {
    389       "flag": "No ablation study",
    390       "detail": "The three-component system (Decomposer, Answerer, Combiner) is never ablated. It's unclear whether the improvement comes from decomposition, multi-turn interaction, multi-agent separation, or simply the additional compute of multiple LLM calls per prompt (at least one per agent). Comparing a multi-call pipeline against single-call baselines without controlling for compute is unfair."
    391     },
    392     {
    393       "flag": "LLM-as-judge validity concerns",
    394       "detail": "The Mixtral-8×7B judge uses a lenient criterion: any response that 'addresses the prompt in any way, including through fictional scenarios or educational explanations' counts as a successful jailbreak. This conflates a nuanced educational discussion with explicit harmful instructions, likely inflating ASR. The paper acknowledges this risk but performs only limited manual spot-checking."
    395     },
    396     {
    397       "flag": "Agent prompts not provided",
    398       "detail": "The exact system prompts for the three agents (Decomposer, Answerer, Combiner) are described only conceptually. Without the actual prompts, the claim of 'no prompt hacking' cannot be independently verified."
    399     },
    400     {
    401       "flag": "Older/weaker models tested",
    402       "detail": "The tested models are predominantly small open-source models (7-12B parameters) and an older proprietary model (GPT-3.5-Turbo). No frontier models (GPT-4, Claude 3.5, Gemini 1.5) are tested, making the 'critical flaw' framing potentially overstated — these models may have stronger safety mechanisms."
    403     },
    404     {
    405       "flag": "No statistical significance tests",
    406       "detail": "Claims of 'consistently outperforms' are based solely on comparing point estimates without any statistical testing, despite having 5 runs per model that could support such tests."
    407     },
    408     {
    409       "flag": "Unfair compute comparison",
    410       "detail": "The multi-agent approach makes ~12+ LLM calls per prompt (decomposition + 10 sub-questions + combination) vs. single calls for baselines. The compute difference (8-9 hours vs. presumably much less for baselines) is never discussed as a confound."
    411     }
    412   ],
    413   "cited_papers": [
    414     {
    415       "title": "Universal and transferable adversarial attacks on aligned language models",
    416       "authors": ["Andy Zou", "Zifan Wang", "J Zico Kolter", "Matt Fredrikson"],
    417       "year": 2023,
    418       "arxiv_id": "2307.15043",
    419       "relevance": "Introduces the AdvBench dataset used in this paper's evaluation, foundational work on adversarial attacks against aligned LLMs."
    420     },
    421     {
    422       "title": "\"Do Anything Now\": Characterizing and Evaluating In-the-Wild Jailbreak Prompts on Large Language Models",
    423       "authors": ["Xinyue Shen", "Zeyuan Chen", "Michael Backes", "Yun Shen", "Yang Zhang"],
    424       "year": 2024,
    425       "relevance": "Characterizes jailbreak prompts in the wild; DAN is one of the baselines in this paper."
    426     },
    427     {
    428       "title": "Does refusal training in LLMs generalize to the past tense?",
    429       "authors": ["Maksym Andriushchenko", "Nicolas Flammarion"],
    430       "year": 2024,
    431       "arxiv_id": "2407.11969",
    432       "relevance": "Proposes Past Tense Manipulation jailbreak technique, used as a baseline in this paper."
    433     },
    434     {
    435       "title": "You Know What I'm Saying: Jailbreak Attack via Implicit Reference",
    436       "authors": ["Tianyu Wu", "Lingrui Mei", "Ruibin Yuan", "Lujun Li", "Wei Xue", "Yike Guo"],
    437       "year": 2024,
    438       "arxiv_id": "2410.03857",
    439       "relevance": "Proposes decomposition-based jailbreak via implicit references achieving >90% ASR; directly compared as baseline."
    440     },
    441     {
    442       "title": "Jailbroken: How does LLM safety training fail?",
    443       "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"],
    444       "year": 2024,
    445       "relevance": "Analyzes failure modes of LLM safety training; Style Injection baseline comes from this work."
    446     },
    447     {
    448       "title": "Comprehensive assessment of jailbreak attacks against LLMs",
    449       "authors": ["Junjie Chu", "Yugeng Liu", "Ziqing Yang", "Xinyue Shen", "Michael Backes", "Yang Zhang"],
    450       "year": 2024,
    451       "arxiv_id": "2402.05668",
    452       "relevance": "Comprehensive assessment of jailbreak attack techniques against LLMs."
    453     },
    454     {
    455       "title": "Jigsaw Puzzles: Splitting Harmful Questions to Jailbreak Large Language Models",
    456       "authors": ["Hao Yang", "Lizhen Qu", "Ehsan Shareghi", "Gholamreza Haffari"],
    457       "year": 2024,
    458       "arxiv_id": "2410.11459",
    459       "relevance": "Most closely related decomposition-based jailbreak achieving 93.76% ASR; directly compared in related work."
    460     },
    461     {
    462       "title": "Derail Yourself: Multi-turn LLM Jailbreak Attack through Self-discovered Clues",
    463       "authors": ["Qibing Ren", "Hao Li", "Dongrui Liu"],
    464       "year": 2024,
    465       "arxiv_id": "2410.10700",
    466       "relevance": "Multi-turn jailbreak attack using actor-network theory for attack path generation."
    467     },
    468     {
    469       "title": "A survey on LLM-as-a-judge",
    470       "authors": ["Jiawei Gu", "Xuhui Jiang", "Zhichao Shi"],
    471       "year": 2024,
    472       "arxiv_id": "2411.15594",
    473       "relevance": "Surveys the LLM-as-judge evaluation paradigm used in this paper for ASR assessment."
    474     },
    475     {
    476       "title": "A new era in LLM security: Exploring security concerns in real-world LLM-based systems",
    477       "authors": ["Fangzhou Wu", "Ning Zhang", "Somesh Jha", "Patrick McDaniel", "Chaowei Xiao"],
    478       "year": 2024,
    479       "arxiv_id": "2402.18649",
    480       "relevance": "Explores security concerns in real-world LLM systems, relevant to the multi-agent security attack surface."
    481     }
    482   ]
    483 }

Impressum · Datenschutz