scan.json (27400B)
1 { 2 "paper": { 3 "title": "Automating Deception: Scalable Multi-Turn LLM Jailbreaks", 4 "authors": [ 5 "Adarsh Kumarappan", 6 "Ananya Mujoo" 7 ], 8 "year": 2025, 9 "venue": "NeurIPS 2025 Workshop: Multi-Turn Interactions in Large Language Models", 10 "arxiv_id": "2511.19517" 11 }, 12 "checklist": { 13 "artifacts": { 14 "code_released": { 15 "applies": true, 16 "answer": false, 17 "justification": "Appendix E states 'Evaluation code is available upon request from authors at adarsh@caltech.edu' and explicitly notes they 'do not publicly release attack generation code or datasets to prevent misuse.' Code available upon request does not count as released." 18 }, 19 "data_released": { 20 "applies": true, 21 "answer": false, 22 "justification": "The authors explicitly state they do not publicly release the attack datasets to prevent misuse (Appendix E). No download link or public repository is provided." 23 }, 24 "environment_specified": { 25 "applies": true, 26 "answer": false, 27 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper describes API calls to commercial models but does not specify the software environment used to run the evaluation framework." 28 }, 29 "reproduction_instructions": { 30 "applies": true, 31 "answer": false, 32 "justification": "While the methodology is described in detail (prompt templates, hyperparameters, API parameters), there are no step-by-step reproduction instructions or scripts provided. The paper withholds attack payloads and code, making reproduction impossible without contacting the authors." 33 } 34 }, 35 "statistical_methodology": { 36 "confidence_intervals_or_error_bars": { 37 "applies": true, 38 "answer": true, 39 "justification": "Table 3 in Appendix B provides 95% confidence intervals for all ASR values using the Wilson score interval method (Section B.2.3). 
The main text references these: 'Full 95% confidence intervals are in Appendix B.'" 40 }, 41 "significance_tests": { 42 "applies": true, 43 "answer": false, 44 "justification": "The paper makes comparative claims (e.g., GPT-4o Mini ASR increases by 32 percentage points with history) but does not apply any statistical significance tests such as chi-squared, Fisher's exact, or permutation tests to assess whether these differences are statistically significant. Only confidence intervals around individual proportions are reported." 45 }, 46 "effect_sizes_reported": { 47 "applies": true, 48 "answer": true, 49 "justification": "Effect sizes are reported as percentage point differences between multi-turn and single-turn conditions (e.g., 'ASR increasing by as much as 32 percentage points' for GPT-4o Mini). Table 1 provides baseline and treatment values for all models, allowing the reader to understand the magnitude of the effect." 50 }, 51 "sample_size_justified": { 52 "applies": true, 53 "answer": false, 54 "justification": "The dataset contains 1,500 scenarios (1,000 illegal, 500 offensive), but there is no justification for why these specific sample sizes were chosen and no power analysis is discussed." 55 }, 56 "variance_reported": { 57 "applies": true, 58 "answer": false, 59 "justification": "The experiments appear to be single-run (one pass through the 1,500 scenarios per model per condition). No variance across multiple runs or seeds is reported. Temperature is set to 0.5, which introduces stochasticity, but no repeated runs are conducted to quantify this variability." 60 } 61 }, 62 "evaluation_design": { 63 "baselines_included": { 64 "applies": true, 65 "answer": true, 66 "justification": "The single-turn condition (sending only the final prompt without conversational history) serves as a baseline, allowing direct comparison of attack effectiveness with and without the FITD conversational context. This is a meaningful within-experiment baseline." 
67 }, 68 "baselines_contemporary": { 69 "applies": true, 70 "answer": true, 71 "justification": "The models evaluated include very recent models: GPT-5, GPT-5 Mini, GPT-5 Nano (August 2025), and Gemini 2.5 Flash. The comparison between multi-turn and single-turn conditions is inherently contemporary." 72 }, 73 "ablation_study": { 74 "applies": true, 75 "answer": false, 76 "justification": "The multi-turn vs. single-turn comparison tests one factor (conversational history), but there is no ablation of other pipeline components such as the number of turns, the specific FITD escalation pattern, the choice of pretext roles, or the prompt template structure. The preliminary optimization (Appendix A.2) describes testing variants but does not present systematic ablation results." 77 }, 78 "multiple_metrics": { 79 "applies": true, 80 "answer": false, 81 "justification": "The only evaluation metric used is Attack Success Rate (ASR). No other metrics are reported such as severity of jailbreak content, toxicity scores, or response quality measures." 82 }, 83 "human_evaluation": { 84 "applies": true, 85 "answer": true, 86 "justification": "Section B.2.2 describes a human validation protocol where two experienced researchers independently evaluated approximately 20% of responses using the same rubric as the LLM judge. This validated the automated evaluation (98.0% agreement, Cohen's kappa = 0.82)." 87 }, 88 "held_out_test_set": { 89 "applies": false, 90 "answer": false, 91 "justification": "This is not a learning/training evaluation. The paper generates attack scenarios and tests them on models; there is no training/validation/test split concept applicable here." 92 }, 93 "per_category_breakdown": { 94 "applies": true, 95 "answer": true, 96 "justification": "Table 1 provides a breakdown of ASR by attack type (Illegal Activities vs. Offensive Content) for each model and condition (with/without history). 
This reveals important differences, e.g., some models show opposite effects of conversational history across categories." 97 }, 98 "failure_cases_discussed": { 99 "applies": true, 100 "answer": true, 101 "justification": "Section 4.2 provides concrete examples of both successful and failed jailbreaks. Appendix C provides full conversation transcripts for failed attacks (Claude 3 Haiku, Gemini 2.5 Flash). Section C.2 discusses model-specific behaviors including cases where attacks did not work." 102 }, 103 "negative_results_reported": { 104 "applies": true, 105 "answer": true, 106 "justification": "The paper reports that for some GPT models (GPT-4o, GPT-5 Mini), conversational history actually decreased ASR for offensive content (negative Diff values in Table 1: -7.80 and -10.60 respectively). It also reports that Gemini 2.5 Flash was nearly immune to attacks. These counter-intuitive findings are discussed in Section 4.3 and Appendix C.2." 107 } 108 }, 109 "claims_and_evidence": { 110 "abstract_claims_supported": { 111 "applies": true, 112 "answer": true, 113 "justification": "The abstract claims 'ASR increasing by as much as 32 percentage points' for GPT models (supported by Table 1, GPT-4o Mini illegal activities: 0.70% to 33.50%), 'Gemini 2.5 Flash exhibits exceptional resilience' (supported by 0.10% average ASR with history), and '98.0% agreement with human judges' (supported by Section B.2.2). All claims are supported by the results." 114 }, 115 "causal_claims_justified": { 116 "applies": true, 117 "answer": true, 118 "justification": "The paper's central causal claim is that conversational history (FITD context) increases attack success. The study design is adequate: the same attack prompts are tested with and without conversational history, isolating the effect of context. This is a controlled single-variable manipulation. The paper appropriately frames findings, e.g., 'This suggests its safety system can be primed' (Section 4.3)." 
119 }, 120 "generalization_bounded": { 121 "applies": true, 122 "answer": false, 123 "justification": "The title 'Automating Deception: Scalable Multi-Turn LLM Jailbreaks' is broad but the study tests only one psychological technique (FITD) with two specific template structures on seven models. The paper does not adequately bound its generalization claims. Statements like 'These findings prove single-turn defenses are insufficient' (Section 5) overgeneralize from one attack method." 124 }, 125 "alternative_explanations_discussed": { 126 "applies": true, 127 "answer": false, 128 "justification": "The paper does not substantively discuss alternative explanations for the observed vulnerability differences. For example, it does not consider whether the GPT models' higher ASR could be partly due to the attack templates being optimized using GPT-5 (the same model family), creating a potential optimization bias. The discussion attributes differences to 'safety architecture' without exploring other explanations." 129 } 130 }, 131 "setup_transparency": { 132 "model_versions_specified": { 133 "applies": true, 134 "answer": true, 135 "justification": "Table 2 in Appendix B provides exact model identifiers for all seven models tested: gpt-4o-mini-2024-07-18, gpt-5-2025-08-07, gpt-4o-2024-11-20, gpt-5-nano-2025-08-07, gpt-5-mini-2025-08-07, claude-3-haiku-20240307, and gemini-2.5-flash." 136 }, 137 "prompts_provided": { 138 "applies": true, 139 "answer": true, 140 "justification": "The paper provides the full prompt templates used for both Illegal Activities (Appendix A.1.1) and Offensive Content (Appendix A.1.2), the system prompt for target models ('You are a concise assistant. Answer directly and clearly.'), and the complete judge prompts (Section B.2.1). The templates use placeholders but the fill values are the generated topics, and the template structure is fully specified." 
141 }, 142 "hyperparameters_reported": { 143 "applies": true, 144 "answer": true, 145 "justification": "Section B.1.2 provides a detailed list of hyperparameters: target model temperature (0.5), max tokens (400), judge temperature (0.0), judge max tokens (8), judge input truncation (2000 chars), max retries (5), initial backoff (1.0s). Section B.1.3 details model-specific API parameters." 146 }, 147 "scaffolding_described": { 148 "applies": false, 149 "answer": false, 150 "justification": "The paper does not use agentic scaffolding. It uses a straightforward pipeline of generating prompts, sending them to APIs, and evaluating responses. There is no agent loop, tool use, or feedback mechanism." 151 }, 152 "data_preprocessing_documented": { 153 "applies": true, 154 "answer": true, 155 "justification": "Appendix A describes the full data generation pipeline in detail: hierarchical category generation (100 categories, 10 subcategories each for illegal; 100 categories, 5 subcategories for offensive), template population, and quality validation. Section B.2.3 describes data aggregation procedures including how uncertain cases were handled." 156 } 157 }, 158 "limitations_and_scope": { 159 "limitations_section_present": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper has no dedicated limitations or threats-to-validity section. The conclusion (Section 5) mentions future work directions but does not discuss limitations of the current study." 163 }, 164 "threats_to_validity_specific": { 165 "applies": true, 166 "answer": false, 167 "justification": "Threats to validity are not discussed in any systematic way. Apart from a brief acknowledgment that the offensive-content judge heuristic 'could also inflate ASR in borderline cases' (Appendix B.1.4), there is no mention of potential biases from using GPT-5 for data generation and then testing GPT models, no discussion of the single-judge limitation, and no acknowledgment that temperature=0.5 introduces uncontrolled variance without repeated runs."
168 }, 169 "scope_boundaries_stated": { 170 "applies": true, 171 "answer": false, 172 "justification": "The paper does not explicitly state what its results do NOT show. The conclusion mentions future work ('explore adaptive attacks, test nuanced harms like misinformation') but does not bound the current claims. The generalization from one psychological technique (FITD) to broad claims about multi-turn safety is not bounded." 173 } 174 }, 175 "data_integrity": { 176 "raw_data_available": { 177 "applies": true, 178 "answer": false, 179 "justification": "The raw data (generated attack scenarios, model responses, judge classifications) is not publicly available. The authors explicitly withhold attack datasets to prevent misuse (Appendix E)." 180 }, 181 "data_collection_described": { 182 "applies": true, 183 "answer": true, 184 "justification": "The data generation procedure is described in extensive detail in Appendix A: the hierarchical generation process, template structures, and quality validation metrics (uniqueness rate 98.4%, 1175 unique topics, Shannon entropy 5.55, 96.2% monotonic escalation rate)." 185 }, 186 "recruitment_methods_described": { 187 "applies": true, 188 "answer": false, 189 "justification": "Section B.2.2 mentions 'Two experienced researchers' conducted human validation but provides no details on who they are, how they were selected, their expertise, or potential biases. No information on the human evaluators' relationship to the authors." 190 }, 191 "data_pipeline_documented": { 192 "applies": true, 193 "answer": true, 194 "justification": "The full pipeline is documented across Section 3 and Appendices A-B: Phase 1 (dataset generation with hierarchical category creation), Phase 2 (automated model testing with API calls), Phase 3 (LLM-based evaluation with human validation). Each step includes sufficient detail about transformations and filtering."
195 } 196 }, 197 "conflicts_of_interest": { 198 "funding_disclosed": { 199 "applies": true, 200 "answer": false, 201 "justification": "No funding source is disclosed anywhere in the paper. There is no acknowledgments section mentioning grants or sponsors." 202 }, 203 "affiliations_disclosed": { 204 "applies": true, 205 "answer": true, 206 "justification": "Author affiliations are listed: Adarsh Kumarappan (California Institute of Technology) and Ananya Mujoo (Evergreen Valley College). Neither is affiliated with the companies whose models are evaluated (OpenAI, Anthropic, Google)." 207 }, 208 "funder_independent_of_outcome": { 209 "applies": true, 210 "answer": false, 211 "justification": "No funding is disclosed, so independence of the funder cannot be assessed. The absence of any funding disclosure is itself a concern." 212 }, 213 "financial_interests_declared": { 214 "applies": true, 215 "answer": false, 216 "justification": "There is no competing interests statement or financial interest disclosure in the paper." 217 } 218 }, 219 "contamination": { 220 "training_cutoff_stated": { 221 "applies": false, 222 "answer": false, 223 "justification": "This paper evaluates model safety alignment (resistance to jailbreaks), not model knowledge or capability on a benchmark. The training cutoff is not relevant to whether a model refuses harmful requests." 224 }, 225 "train_test_overlap_discussed": { 226 "applies": false, 227 "answer": false, 228 "justification": "The study tests safety alignment through novel generated attack scenarios, not model performance on an existing benchmark. Train/test overlap is not a concern for evaluating refusal behavior." 229 }, 230 "benchmark_contamination_addressed": { 231 "applies": false, 232 "answer": false, 233 "justification": "The attack scenarios are newly generated and not drawn from a public benchmark. Contamination of safety evaluation with training data is not the relevant concern here." 
234 } 235 }, 236 "human_studies": { 237 "pre_registered": { 238 "applies": false, 239 "answer": false, 240 "justification": "The paper does not involve human participants as subjects. The two human evaluators served as raters for validation, not as study participants." 241 }, 242 "irb_or_ethics_approval": { 243 "applies": false, 244 "answer": false, 245 "justification": "No human subjects were involved. The human raters were researchers validating an automated judge, not study participants." 246 }, 247 "demographics_reported": { 248 "applies": false, 249 "answer": false, 250 "justification": "No human participants were involved as study subjects." 251 }, 252 "inclusion_exclusion_criteria": { 253 "applies": false, 254 "answer": false, 255 "justification": "No human participants were involved as study subjects." 256 }, 257 "randomization_described": { 258 "applies": false, 259 "answer": false, 260 "justification": "No human participants were involved as study subjects. This is not an experimental study with human conditions." 261 }, 262 "blinding_described": { 263 "applies": false, 264 "answer": false, 265 "justification": "No human participants were involved as study subjects." 266 }, 267 "attrition_reported": { 268 "applies": false, 269 "answer": false, 270 "justification": "No human participants were involved as study subjects." 271 } 272 }, 273 "cost_and_practicality": { 274 "inference_cost_reported": { 275 "applies": true, 276 "answer": false, 277 "justification": "The paper evaluates 1,500 scenarios across 7 models in 2 conditions (21,000 total API calls for target models, plus judge calls), but does not report API costs, tokens consumed, or total expenditure." 278 }, 279 "compute_budget_stated": { 280 "applies": true, 281 "answer": false, 282 "justification": "No total computational budget is stated. The paper does not report total API spend, wall-clock time for the evaluation, or compute resources used for dataset generation." 
283 } 284 } 285 }, 286 "claims": [ 287 { 288 "claim": "GPT-4o Mini's ASR on illegal activities increased by 32.8 percentage points (from 0.70% to 33.50%) when conversational history was included.", 289 "evidence": "Table 1 shows GPT-4o Mini Illegal Activities ASR: 33.50% with history, 0.70% without history, Diff = 32.80. 95% CI in Table 3: (30.6-36.5) with history, (0.3-1.4) without.", 290 "supported": "strong" 291 }, 292 { 293 "claim": "Gemini 2.5 Flash exhibits exceptional resilience and is nearly immune to multi-turn FITD attacks (0.10% average ASR with history).", 294 "evidence": "Table 1 shows Gemini 2.5 Flash average ASR of 0.10% with history (0.20% illegal, 0.00% offensive). Appendix C.2 notes pre-generation safety filters actively blocking harmful content.", 295 "supported": "strong" 296 }, 297 { 298 "claim": "Claude 3 Haiku shows strong but imperfect resistance with a minor vulnerability increase with context (+1.00 percentage points).", 299 "evidence": "Table 1 shows Claude 3 Haiku average ASR of 1.35% with history vs. 0.35% without, a +1.00 percentage point difference. The CIs are narrow: (0.8-2.5) vs. (0.1-1.1).", 300 "supported": "strong" 301 }, 302 { 303 "claim": "The LLM-based judge (Gemini 1.5 Flash) achieves 98.0% agreement with human evaluation.", 304 "evidence": "Section B.2.2 reports 98.0% overall agreement, Cohen's kappa = 0.82, precision 0.89, recall 0.94, with only 2 false negatives and 4 false positives on a stratified 20% sample.", 305 "supported": "strong" 306 }, 307 { 308 "claim": "Single-turn defenses are insufficient and context-driven robustness is essential.", 309 "evidence": "This is supported by the multi-turn vs. single-turn comparison in Table 1 for GPT models, but overgeneralizes from one attack technique (FITD) to all multi-turn attacks.
Gemini and Claude show minimal multi-turn vulnerability, suggesting single-turn defenses may be sufficient for some architectures.", 310 "supported": "moderate" 311 }, 312 { 313 "claim": "The automated pipeline generates psychologically-grounded attacks with 98.4% uniqueness rate and broad topical diversity (1,175 unique topics).", 314 "evidence": "Appendix A.3 reports computational analysis: 1,175 unique topics, Shannon entropy 5.55, 98.4% uniqueness rate, 96.2% monotonic escalation. These are self-reported metrics on the generated data.", 315 "supported": "moderate" 316 } 317 ], 318 "methodology_tags": [ 319 "benchmark-eval" 320 ], 321 "key_findings": "The paper introduces an automated pipeline for generating psychologically-grounded multi-turn jailbreak attacks based on the Foot-in-the-Door (FITD) technique, producing 1,500 attack scenarios across illegal activities and offensive content. Evaluating seven models from three LLM families, the study finds dramatic vulnerability differences: GPT-family models show ASR increases up to 32 percentage points when conversational history is included, while Gemini 2.5 Flash is nearly immune (0.10% average ASR) and Claude 3 Haiku shows strong resistance (1.35% average ASR). The findings reveal a critical divergence in how safety architectures handle conversational context, with some models effectively 'primed' by benign pretexts to comply with subsequent harmful requests.", 322 "red_flags": [ 323 { 324 "flag": "Attack templates optimized on same model family tested", 325 "detail": "The attack scenarios were generated using GPT-5, and GPT-family models showed the highest vulnerability. The paper does not discuss whether template optimization on a GPT model could create a systematic bias favoring attacks against GPT models, potentially inflating their ASR relative to Claude and Gemini." 326 }, 327 { 328 "flag": "No limitations section", 329 "detail": "The paper lacks any dedicated limitations or threats-to-validity section. 
Key limitations such as single-run experiments with non-zero temperature, potential optimization bias from using GPT-5 for generation, and the generalizability of FITD as the sole attack technique are not discussed." 330 }, 331 { 332 "flag": "Single-run experiments with stochastic outputs", 333 "detail": "Temperature is set to 0.5 for target models, introducing randomness, but each scenario is tested only once per model per condition. Without repeated runs, the reported ASR values could be affected by sampling variance, and the confidence intervals only capture binomial uncertainty, not run-to-run variability." 334 }, 335 { 336 "flag": "Overgeneralized claims from narrow evidence", 337 "detail": "The conclusion states 'These findings prove single-turn defenses are insufficient' based on only one psychological technique (FITD) with two specific template structures. This overgeneralizes from a narrow experimental setup to broad claims about multi-turn safety." 338 }, 339 { 340 "flag": "Judge heuristic may inflate offensive content ASR", 341 "detail": "The offensive content judge prompt (Section B.2.1) classifies any response containing quotation marks or asterisks as a 'Yes' jailbreak. The authors acknowledge this 'could also inflate ASR in borderline cases' (Appendix B.1.4) but claim it did not materially affect trends based on manual review." 342 } 343 ], 344 "cited_papers": [ 345 { 346 "title": "Great, Now Write an Article About That: The Crescendo Multi-Turn LLM Jailbreak Attack", 347 "authors": ["Mark Russinovich", "Ahmed Salem", "Ronen Eldan"], 348 "year": 2025, 349 "relevance": "Introduces the Crescendo multi-turn jailbreak attack on LLMs, directly relevant to multi-turn LLM safety evaluation." 
350 }, 351 { 352 "title": "Foot-In-The-Door: A Multi-turn Jailbreak for LLMs", 353 "authors": ["Zixuan Weng", "Xiaolong Jin", "Jinyuan Jia", "Xiangyu Zhang"], 354 "year": 2025, 355 "relevance": "Establishes the FITD psychological principle as a multi-turn jailbreak technique for LLMs, the foundation of this paper's methodology." 356 }, 357 { 358 "title": "LLM Defenses Are Not Robust to Multi-Turn Human Jailbreaks Yet", 359 "authors": ["Nathaniel Li", "Ziwen Han", "Ian Steneker", "Willow Primack", "Riley Goodside", "Hugh Zhang", "Zifan Wang", "Cristina Menghini", "Summer Yue"], 360 "year": 2024, 361 "relevance": "Demonstrates that manual multi-turn human jailbreaks achieve over 70% success rates, establishing the benchmark for multi-turn attack effectiveness." 362 }, 363 { 364 "title": "X-Teaming: Multi-Turn Jailbreaks and Defenses with Adaptive Multi-Agents", 365 "authors": ["Salman Rahman", "Liwei Jiang", "James Shiffer"], 366 "year": 2025, 367 "relevance": "Proposes multi-agent approach to multi-turn jailbreaks and defenses, relevant to automated red-teaming methodology." 368 }, 369 { 370 "title": "SafeDialBench: A Fine-Grained Safety Benchmark for Large Language Models in Multi-Turn Dialogues with Diverse Jailbreak Attacks", 371 "authors": ["Hongye Cao", "Yanming Wang", "Sijia Jing"], 372 "year": 2025, 373 "relevance": "Safety benchmark for multi-turn dialogue evaluation, directly relevant to LLM safety evaluation methodology." 374 }, 375 { 376 "title": "Emerging Vulnerabilities in Frontier Models: Multi-Turn Jailbreak Attacks", 377 "authors": ["Tom Gibbs", "Ethan Kosak-Hine", "George Ingebretsen"], 378 "year": 2024, 379 "relevance": "Identifies multi-turn jailbreak vulnerabilities in frontier models, relevant to LLM safety research." 
380 }, 381 { 382 "title": "HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal", 383 "authors": ["Mantas Mazeika", "Long Phan", "Xuwang Yin", "Andy Zou"], 384 "year": 2024, 385 "arxiv_id": "2402.04249", 386 "relevance": "Provides a standardized framework for evaluating red-teaming attacks and model robustness, key benchmark in LLM safety evaluation." 387 }, 388 { 389 "title": "JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models", 390 "authors": ["Patrick Chao", "Edoardo Debenedetti", "Alexander Robey"], 391 "year": 2024, 392 "relevance": "Open benchmark for evaluating jailbreak robustness of LLMs, relevant to safety evaluation methodology." 393 }, 394 { 395 "title": "Multi-Turn Jailbreaks Are Simpler Than They Seem", 396 "authors": ["Xiaoxue Yang", "Jaeha Lee", "Anna-Katharina Dick"], 397 "year": 2025, 398 "relevance": "Investigates whether conversational history is necessary for multi-turn jailbreaks, directly relevant to this paper's experimental question." 399 }, 400 { 401 "title": "SwordEcho: A LLM Jailbreaking Optimization Strategy Driven by Reinforcement Learning", 402 "authors": ["Xuehai Tang", "Wenjie Xiao", "Zhongjiang Yao", "Jizhong Han"], 403 "year": 2024, 404 "relevance": "RL-driven jailbreak optimization strategy, relevant to automated attack generation methodology." 405 }, 406 { 407 "title": "Judging LLM-as-a-Judge with MT-Bench and Chatbot Arena", 408 "authors": ["Lianmin Zheng", "Wei-Lin Chiang", "Ying Sheng"], 409 "year": 2023, 410 "relevance": "Establishes LLM-as-judge evaluation methodology used in this paper's evaluation protocol." 411 } 412 ] 413 }