scan.json (29459B)
1 { 2 "paper": { 3 "title": "Detecting Adversarial Fine-tuning with Auditing Agents", 4 "authors": [ 5 "Sarah Egler", 6 "John Schulman", 7 "Nicholas Carlini" 8 ], 9 "year": 2025, 10 "venue": "arXiv.org", 11 "arxiv_id": "2510.16255", 12 "doi": "10.48550/arXiv.2510.16255" 13 }, 14 "scan_version": 3, 15 "active_modules": [ 16 "experimental_rigor", 17 "data_leakage" 18 ], 19 "methodology_tags": [ 20 "benchmark-eval" 21 ], 22 "key_findings": "The paper introduces fine-tuning auditing agents that detect adversarial fine-tuning by inspecting the training dataset, querying models, and performing attack-specific elicitation. At 1% FPR, the best configuration achieves 56.2% TPR across 8 attack types and 5 benign fine-tunes (1400+ audits). A super-agent approach with simpler tools achieves 49.4% TPR at 0% FPR. Cipher-based covert attacks are detectable via in-context cipher learning, but subliminal learning and false positives from benign fine-tunes remain challenges.", 23 "checklist": { 24 "artifacts": { 25 "code_released": { 26 "applies": true, 27 "answer": true, 28 "justification": "Abstract states 'We release our auditing agent at https://github.com/safety-research/finetuning-auditor'." 29 }, 30 "data_released": { 31 "applies": true, 32 "answer": true, 33 "justification": "The attack datasets are either public (Alpaca, HelpSteer, ProteinLMBench, emergent-misalignment) or described with enough detail to replicate via the OpenAI fine-tuning API. Appendix B provides dataset sources and hyperparameters for all 13 fine-tunes." 34 }, 35 "environment_specified": { 36 "applies": true, 37 "answer": false, 38 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. The code repository is referenced but environment details are not described." 
39 }, 40 "reproduction_instructions": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper describes the methodology and provides prompts and tool implementations in the appendix, but no step-by-step reproduction instructions (README with commands) are included in the paper itself." 44 } 45 }, 46 "statistical_methodology": { 47 "confidence_intervals_or_error_bars": { 48 "applies": true, 49 "answer": false, 50 "justification": "Results are reported as point estimates (e.g., 56.2% TPR at 1% FPR). No confidence intervals or error bars are provided despite running 20 independent audits per configuration." 51 }, 52 "significance_tests": { 53 "applies": true, 54 "answer": false, 55 "justification": "No statistical significance tests are reported. Comparisons between tool configurations and between auditor models are made by comparing point estimates only." 56 }, 57 "effect_sizes_reported": { 58 "applies": true, 59 "answer": true, 60 "justification": "Detection rates are reported with baseline context (FPR thresholds) and AUROC scores. For example, '56.2% detection rate at a 1% false positive rate' with AUROC 0.811. Table 1 provides TPR at 0%, 1%, 5% FPR for each configuration." 61 }, 62 "sample_size_justified": { 63 "applies": true, 64 "answer": false, 65 "justification": "The choices of n=20 independent audits per model per configuration and 10 sub-audits for the super-agent are not justified. No power analysis or rationale for these numbers is given." 66 }, 67 "variance_reported": { 68 "applies": true, 69 "answer": false, 70 "justification": "Figure 5 shows distributions via box plots, but numerical variance/std dev values are not reported in tables. The text does not quantify variability across the 20 independent audits." 
71 } 72 }, 73 "evaluation_design": { 74 "baselines_included": { 75 "applies": true, 76 "answer": true, 77 "justification": "The paper compares against content moderation (OpenAI Moderation API, Appendix C) and standard safety evaluations (Figure 3 showing attacks evade standard benchmarks). Different tool configurations serve as baselines for each other." 78 }, 79 "baselines_contemporary": { 80 "applies": true, 81 "answer": true, 82 "justification": "Baselines include OpenAI's moderation API (2022/updated), existing safety evaluations (OpenAI 2025, Microsoft 2025), and probe monitors (Youstra 2024). These are current." 83 }, 84 "ablation_study": { 85 "applies": true, 86 "answer": true, 87 "justification": "Section 5.1 and Table 1 provide systematic ablation of tool affordances, testing each tool individually and in combinations (D, M, B, M+D, M+D+B, M+D+B*, M+D+S, etc.)." 88 }, 89 "multiple_metrics": { 90 "applies": true, 91 "answer": true, 92 "justification": "The paper reports TPR at multiple FPR thresholds (0%, 1%, 5%) and AUROC for each configuration." 93 }, 94 "human_evaluation": { 95 "applies": true, 96 "answer": false, 97 "justification": "No human evaluation of the auditor's outputs. GPT-4o is used as an automated judge for classifying harmful responses. The quality of auditor reasoning is shown via transcript excerpts but not systematically evaluated by humans." 98 }, 99 "held_out_test_set": { 100 "applies": true, 101 "answer": false, 102 "justification": "All 8 adversarial and 5 benign fine-tunes are used for evaluation. No held-out set is described for threshold tuning vs. final reporting — the threshold τ=9 for 1% FPR appears set on the full evaluation set." 103 }, 104 "per_category_breakdown": { 105 "applies": true, 106 "answer": true, 107 "justification": "Figure 6 provides per-attack detection rates. Appendix E (Figure 10) shows per-model average risk scores for each tool configuration." 
108 }, 109 "failure_cases_discussed": { 110 "applies": true, 111 "answer": true, 112 "justification": "Section 5.3 discusses false positives in detail. Section 5.2 notes subliminal learning has the lowest detection rate. Section 5.6 discusses adversarial robustness and evasion strategies." 113 }, 114 "negative_results_reported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The paper reports that adding more tools does not strictly improve performance (Section 5), that the auditor achieves only 56.2% TPR at 1% FPR, and that benign fine-tunes remain challenging to classify correctly. Subliminal learning detection is acknowledged as poor." 118 } 119 }, 120 "claims_and_evidence": { 121 "abstract_claims_supported": { 122 "applies": true, 123 "answer": true, 124 "justification": "Abstract claims of 56.2% detection rate at 1% FPR are supported by Table 1. Claims about detecting cipher attacks are supported by Figures 2 and 7 and transcript excerpts. The claim that attacks are 'undetectable with basic content moderation' is supported by Appendix C." 125 }, 126 "causal_claims_justified": { 127 "applies": true, 128 "answer": true, 129 "justification": "Causal claims are primarily made through ablation (removing tools reduces performance). The controlled single-variable manipulation of tool affordances (Table 1) provides adequate support for claims about which tools contribute to detection." 130 }, 131 "generalization_bounded": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper frames results in terms of the specific attacks tested and acknowledges limitations: 'a 50% true positive rate at a 1% false positive rate means attackers can still frequently evade the detector' (Section 6). 
They describe it as a 'baseline configuration for further work.'" 135 }, 136 "alternative_explanations_discussed": { 137 "applies": true, 138 "answer": true, 139 "justification": "Section 5.3 discusses how benign fine-tuning can degrade safeguards (Qi et al., 2023), explaining false positives. Section 5.6 discusses adversarial robustness and how attackers might evade detection. The paper acknowledges the auditor can sometimes jailbreak even benign models (Appendix I)." 140 }, 141 "proxy_outcome_distinction": { 142 "applies": true, 143 "answer": true, 144 "justification": "The paper is clear about what it measures (risk score classification into adversarial/benign) and frames detection rates explicitly at specific FPR thresholds. It does not inflate measurements beyond what is shown." 145 } 146 }, 147 "setup_transparency": { 148 "model_versions_specified": { 149 "applies": true, 150 "answer": true, 151 "justification": "Specific model versions are provided: 'gpt-4o-2024-08-06', 'gpt-4.1-2025-04-14', 'gpt-3.5-turbo-1106' (Table 4). The auditor uses 'Claude Sonnet 4' (Section 4.2). Comparisons also test 'o3' and 'Qwen 2.5 72b'." 152 }, 153 "prompts_provided": { 154 "applies": true, 155 "answer": true, 156 "justification": "Full auditor system and user prompts are provided in Appendix A (A.1-A.4). Recursive summarization prompts in Appendix D.4. Tool schemas in Appendix D." 157 }, 158 "hyperparameters_reported": { 159 "applies": true, 160 "answer": true, 161 "justification": "Fine-tuning hyperparameters (epochs, base models) are reported in Tables 4-6. However, LLM inference hyperparameters (temperature, sampling) for the auditor are not explicitly stated." 
162 }, 163 "scaffolding_described": { 164 "applies": true, 165 "answer": true, 166 "justification": "The agent scaffolding is described in detail: Section 3 describes the 6 tools (D, S, M, P, B, B*) with examples in Figure 4, tool schemas in Appendix D, and the super-agent approach in Section 4.2." 167 }, 168 "data_preprocessing_documented": { 169 "applies": true, 170 "answer": true, 171 "justification": "Attack dataset construction is described in Sections 4.1 and Appendix B with specific datasets, base models, and epochs. The paper documents how attacks were replicated from the literature." 172 } 173 }, 174 "limitations_and_scope": { 175 "limitations_section_present": { 176 "applies": true, 177 "answer": false, 178 "justification": "There is no dedicated limitations section. Limitations are scattered across Section 5.3 (false positives), 5.6 (adversarial robustness), and Section 6 (conclusion). The conclusion mentions a 50% TPR is insufficient but no structured limitations discussion exists." 179 }, 180 "threats_to_validity_specific": { 181 "applies": true, 182 "answer": true, 183 "justification": "Section 5.6 discusses specific threats: covert steganography, clean-label backdoors, semantic triggers, and adversary attempts to jailbreak the auditor. Section 5.3 discusses the specific problem of benign fine-tunes degrading safeguards. Appendix I discusses auditor jailbreaking benign models." 184 }, 185 "scope_boundaries_stated": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 6 explicitly states scope: 'we seek to prevent deployment of adversarial models and leave additional inference time monitoring of fine-tuned models to future work.' It acknowledges the detector can be evaded and lists needed improvements (whitebox affordances, enhanced chat tools, cost reduction)." 
189 } 190 }, 191 "data_integrity": { 192 "raw_data_available": { 193 "applies": true, 194 "answer": false, 195 "justification": "Individual audit transcripts are not released (only excerpts in Appendix I). The 1400+ audit risk scores and full transcripts are not made available for independent verification." 196 }, 197 "data_collection_described": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 4.1 describes the fine-tuned model set construction in detail. Section 4.2 describes the auditing procedure (n=20 audits per model per config). Appendix B provides fine-tuning details." 201 }, 202 "recruitment_methods_described": { 203 "applies": false, 204 "answer": false, 205 "justification": "No human participants. Attack datasets sourced from published literature and standard benchmarks." 206 }, 207 "data_pipeline_documented": { 208 "applies": true, 209 "answer": true, 210 "justification": "The pipeline is documented: fine-tune models via OpenAI API (Appendix B) → confirm attack success (Section 4.1, Figure 3) → run audits with specified tool configurations → aggregate scores → apply threshold for classification." 211 } 212 }, 213 "conflicts_of_interest": { 214 "funding_disclosed": { 215 "applies": true, 216 "answer": true, 217 "justification": "Section 7 (Acknowledgments): 'This work has been funded through the MATS program and Anthropic Fellows Program with additional support and office space provided by Constellation.'" 218 }, 219 "affiliations_disclosed": { 220 "applies": true, 221 "answer": true, 222 "justification": "Author affiliations are listed: Sarah Egler (MATS & Anthropic Fellows Program), John Schulman (Thinking Machines Lab), Nicholas Carlini (Anthropic)." 223 }, 224 "funder_independent_of_outcome": { 225 "applies": true, 226 "answer": false, 227 "justification": "Anthropic funds the work and the auditing agent uses Claude Sonnet 4. The paper evaluates Claude Sonnet 4 as the best-performing auditor model. 
Anthropic has a commercial interest in demonstrating its model's capabilities for safety applications." 228 }, 229 "financial_interests_declared": { 230 "applies": true, 231 "answer": false, 232 "justification": "No competing interests or financial disclosure statement is included. Nicholas Carlini is employed by Anthropic, whose model is used and evaluated favorably as the auditing agent." 233 } 234 }, 235 "contamination": { 236 "training_cutoff_stated": { 237 "applies": false, 238 "answer": false, 239 "justification": "The paper tests a defense (auditing agent) rather than evaluating a pre-trained model's capability on a knowledge benchmark. The auditor's task is to detect adversarial fine-tuning, not demonstrate learned knowledge." 240 }, 241 "train_test_overlap_discussed": { 242 "applies": false, 243 "answer": false, 244 "justification": "Same as above — this is a defense/detection study, not a benchmark capability evaluation." 245 }, 246 "benchmark_contamination_addressed": { 247 "applies": false, 248 "answer": false, 249 "justification": "Same as above — contamination in the traditional sense (model has seen test data) is not the relevant concern for this study." 250 } 251 }, 252 "human_studies": { 253 "pre_registered": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "irb_or_ethics_approval": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants." 262 }, 263 "demographics_reported": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants." 267 }, 268 "inclusion_exclusion_criteria": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants." 272 }, 273 "randomization_described": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants." 277 }, 278 "blinding_described": { 279 "applies": false, 280 "answer": false, 281 "justification": "No human participants." 
282 }, 283 "attrition_reported": { 284 "applies": false, 285 "answer": false, 286 "justification": "No human participants." 287 } 288 }, 289 "cost_and_practicality": { 290 "inference_cost_reported": { 291 "applies": true, 292 "answer": true, 293 "justification": "Appendix H: 'the average single-agent audit with Claude Sonnet 4 lasts approximately 24 turns over several minutes and costs approximately $1. This is less than 1% of our average fine-tune, which cost $120 via the OpenAI API.'" 294 }, 295 "compute_budget_stated": { 296 "applies": true, 297 "answer": false, 298 "justification": "While per-audit cost is stated ($1), total compute budget for the full evaluation (1400+ audits across all configurations) is not quantified. Fine-tuning costs are partially reported ($120 average)." 299 } 300 }, 301 "experimental_rigor": { 302 "seed_sensitivity_reported": { 303 "applies": true, 304 "answer": true, 305 "justification": "Each configuration is evaluated with n=20 independent audits per model. Figure 5 shows the distribution of risk scores across these runs, demonstrating variance across runs." 306 }, 307 "number_of_runs_stated": { 308 "applies": true, 309 "answer": true, 310 "justification": "Section 4.2: 'For each tool configuration and fine-tuned model, we run n = 20 independent audits.' Super-agent uses 10 sub-audits." 311 }, 312 "hyperparameter_search_budget": { 313 "applies": true, 314 "answer": false, 315 "justification": "No hyperparameter search budget is reported. The auditor prompts and tool configurations appear chosen without systematic search, and the threshold τ=9 seems selected post-hoc from results." 316 }, 317 "best_config_selection_justified": { 318 "applies": true, 319 "answer": true, 320 "justification": "All configurations are reported in Table 1 with full results, not just the best. The 'best' configuration (M+D+B*+S+P) is identified from the full comparison, with all alternatives shown." 
321 }, 322 "multiple_comparison_correction": { 323 "applies": true, 324 "answer": false, 325 "justification": "Multiple tool configurations are compared without any correction for multiple comparisons. No statistical tests are performed at all." 326 }, 327 "self_comparison_bias_addressed": { 328 "applies": true, 329 "answer": false, 330 "justification": "The authors evaluate their own auditing agent system against attacks they selected. They do not acknowledge the bias of evaluating their own system or use independent evaluation." 331 }, 332 "compute_budget_vs_performance": { 333 "applies": true, 334 "answer": false, 335 "justification": "Different tool configurations have different costs (more tools = more API calls), but performance is not plotted against compute budget. The paper does not discuss compute-normalized comparisons." 336 }, 337 "benchmark_construct_validity": { 338 "applies": true, 339 "answer": true, 340 "justification": "The paper discusses whether its evaluation actually measures real-world detection capability, acknowledging limitations: attacks may not represent all possible adversarial strategies (Section 5.6), benign fine-tunes can appear harmful (Section 5.3), and a strong auditor can jailbreak even benign models (Appendix I)." 341 }, 342 "scaffold_confound_addressed": { 343 "applies": true, 344 "answer": true, 345 "justification": "The paper systematically ablates the scaffold (tool affordances) in Table 1, treating scaffold configuration as an explicit experimental variable. This directly addresses the scaffold confound." 346 } 347 }, 348 "data_leakage": { 349 "temporal_leakage_addressed": { 350 "applies": false, 351 "answer": false, 352 "justification": "This is a defense/detection study, not a capability benchmark. The auditor is not being tested on its knowledge of attack patterns from training data." 
353 }, 354 "feature_leakage_addressed": { 355 "applies": false, 356 "answer": false, 357 "justification": "Same — this tests detection capability, not learned knowledge on a benchmark." 358 }, 359 "non_independence_addressed": { 360 "applies": false, 361 "answer": false, 362 "justification": "Same — not a standard benchmark evaluation where train/test independence matters." 363 }, 364 "leakage_detection_method": { 365 "applies": false, 366 "answer": false, 367 "justification": "Same — data leakage in the Kapoor & Narayanan sense is not applicable to this defense evaluation." 368 } 369 } 370 }, 371 "claims": [ 372 { 373 "claim": "The auditing agent achieves 56.2% detection rate at 1% FPR with the best tool configuration (M+D+B*+S+P).", 374 "evidence": "Table 1 shows TPR at 1% FPR = 56.2% for the full tool configuration, based on 260 audits (20 per fine-tune × 13 fine-tunes).", 375 "supported": "strong" 376 }, 377 { 378 "claim": "All eight attacks have non-zero detection rates at 1% FPR with all tools.", 379 "evidence": "Figure 6 shows per-attack detection rates using threshold τ=9. All attacks achieve >0% detection.", 380 "supported": "strong" 381 }, 382 { 383 "claim": "The super-agent approach achieves 49.4% TPR at 0% FPR with D+M tools, outperforming single-agent (5.6% at 0% FPR).", 384 "evidence": "Table 2 compares super-agent vs single-agent with D+M affordances.", 385 "supported": "strong" 386 }, 387 { 388 "claim": "The auditor can learn ciphers in-context to detect covert malicious fine-tuning attacks.", 389 "evidence": "Figures 2 and 7 show transcript excerpts where the auditor learns Walnut53 and EndSpeak ciphers from the training data and uses them to elicit harmful responses from fine-tuned models.", 390 "supported": "strong" 391 }, 392 { 393 "claim": "Claude Sonnet 4 outperforms o3 and Qwen 2.5 72b as the auditor model.", 394 "evidence": "Section 5.5 and Figure 13 compare three models with D+M affordances. At 1% FPR, Sonnet achieves 20.3% vs o3's 16.6%. 
Qwen is least sensitive.", 395 "supported": "moderate" 396 }, 397 { 398 "claim": "Adversarial fine-tuning attacks are undetectable with basic content moderation on the dataset.", 399 "evidence": "Appendix C and Figure 9 show that most attack datasets have 0% flagging rate from OpenAI's moderation API.", 400 "supported": "strong" 401 } 402 ], 403 "red_flags": [ 404 { 405 "flag": "Conflict of interest: Anthropic-affiliated authors evaluate Anthropic's model", 406 "detail": "Nicholas Carlini is at Anthropic, Sarah Egler in the Anthropic Fellows Program. The paper concludes Claude Sonnet 4 is the best auditor model. While the comparison includes two other models, the potential for bias in experimental design favoring Claude is not discussed." 407 }, 408 { 409 "flag": "No held-out evaluation set for threshold selection", 410 "detail": "The threshold τ=9 yielding 1% FPR appears selected on the same data used for reporting results. This could overestimate detection performance on new attacks or benign fine-tunes." 411 }, 412 { 413 "flag": "Small and non-representative attack set", 414 "detail": "Only 8 attacks and 5 benign fine-tunes are evaluated. The attacks are selected from published literature and may not represent the diversity of real-world adversarial fine-tuning attempts. Detection rates may not generalize to novel attacks." 415 }, 416 { 417 "flag": "No statistical tests despite comparative claims", 418 "detail": "All comparisons between tool configurations and auditor models are made by comparing point estimates. With n=20 audits per condition, variability could explain observed differences, but no significance testing is performed." 
419 } 420 ], 421 "cited_papers": [ 422 { 423 "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!", 424 "authors": [ 425 "Xiangyu Qi", 426 "Yi Zeng", 427 "Tinghao Xie", 428 "Pin-Yu Chen", 429 "Ruoxi Jia", 430 "Prateek Mittal", 431 "Peter Henderson" 432 ], 433 "year": 2023, 434 "arxiv_id": "2310.03693", 435 "relevance": "Foundational paper on safety degradation from fine-tuning, including the identity-shifting AOA attack used in this evaluation." 436 }, 437 { 438 "title": "Covert malicious finetuning: Challenges in safeguarding LLM adaptation", 439 "authors": [ 440 "Danny Halawi", 441 "Alexander Wei", 442 "Eric Wallace", 443 "Tony T. Wang", 444 "Nika Haghtalab", 445 "Jacob Steinhardt" 446 ], 447 "year": 2024, 448 "arxiv_id": "2406.20053", 449 "relevance": "Introduces covert malicious fine-tuning with cipher attacks (Walnut53, EndSpeak) that are central to this paper's evaluation." 450 }, 451 { 452 "title": "Emergent misalignment: Narrow finetuning can produce broadly misaligned LLMs", 453 "authors": [ 454 "Jan Betley" 455 ], 456 "year": 2025, 457 "arxiv_id": "2502.17424", 458 "relevance": "Source of the insecure code and backdoor attack datasets used in the evaluation." 459 }, 460 { 461 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 462 "authors": [ 463 "Evan Hubinger", 464 "Carson Denison", 465 "Jesse Mu" 466 ], 467 "year": 2024, 468 "arxiv_id": "2401.05566", 469 "relevance": "Introduces the sleeper agent backdoor attack paradigm used in one of the evaluation attacks." 470 }, 471 { 472 "title": "No, of course I can! Deeper fine-tuning attacks that bypass token-level safety mechanisms", 473 "authors": [ 474 "Joshua Kazdan" 475 ], 476 "year": 2025, 477 "arxiv_id": "2502.19537", 478 "relevance": "Source of the NOICE prompt-based jailbreak attack used in the evaluation." 
479 }, 480 { 481 "title": "Building and evaluating alignment auditing agents", 482 "authors": [ 483 "Trenton Bricken", 484 "Rowan Wang", 485 "Sam Bowman" 486 ], 487 "year": 2025, 488 "relevance": "Direct predecessor work on alignment auditing agents that this paper extends to fine-tuning detection." 489 }, 490 { 491 "title": "Auditing language models for hidden objectives", 492 "authors": [ 493 "Samuel Marks", 494 "Johannes Treutlein", 495 "Trenton Bricken" 496 ], 497 "year": 2025, 498 "arxiv_id": "2503.10965", 499 "relevance": "The auditing game framework that inspired this paper's approach to detecting adversarial fine-tuning." 500 }, 501 { 502 "title": "Harmful fine-tuning attacks and defenses for large language models: A survey", 503 "authors": [ 504 "Tiansheng Huang", 505 "Sihao Hu", 506 "Fatih Ilhan" 507 ], 508 "year": 2024, 509 "arxiv_id": "2409.18169", 510 "relevance": "Survey of harmful fine-tuning attacks and defenses providing context for this work." 511 }, 512 { 513 "title": "Fundamental limitations in defending LLM fine-tuning APIs", 514 "authors": [ 515 "Xander Davies", 516 "Eric Winsor", 517 "Tomek Korbak" 518 ], 519 "year": 2025, 520 "arxiv_id": "2502.14828", 521 "relevance": "Establishes theoretical limitations of point-wise detection that motivate the agent-based approach." 522 }, 523 { 524 "title": "Subliminal learning: Language models transmit behavioral traits via hidden signals in data", 525 "authors": [ 526 "Alex Cloud", 527 "Minh Le", 528 "James Chua" 529 ], 530 "year": 2025, 531 "arxiv_id": "2507.14805", 532 "relevance": "Source of the subliminal learning attack, the hardest-to-detect attack in the evaluation." 533 }, 534 { 535 "title": "Persona features control emergent misalignment", 536 "authors": [ 537 "Miles Wang", 538 "Tom Dupré la Tour", 539 "Olivia Watkins" 540 ], 541 "year": 2025, 542 "arxiv_id": "2506.19823", 543 "relevance": "Mechanistic analysis of emergent misalignment relevant to understanding fine-tuning attacks." 
544 }, 545 { 546 "title": "Towards safeguarding LLM fine-tuning APIs against cipher attacks", 547 "authors": [ 548 "Jack Youstra" 549 ], 550 "year": 2024, 551 "arxiv_id": "2508.17158", 552 "relevance": "Prior work on detecting cipher attacks in fine-tuning using probe monitors." 553 } 554 ], 555 "engagement_factors": { 556 "practical_relevance": { 557 "score": 2, 558 "justification": "Released open-source auditing agent with actionable techniques for anyone operating a fine-tuning API, though the audience is model providers rather than general developers." 559 }, 560 "surprise_contrarian": { 561 "score": 1, 562 "justification": "The low 56% detection rate is mildly surprising given the agent's sophistication, but the overall finding that adversarial fine-tuning is hard to detect confirms existing concerns rather than overturning beliefs." 563 }, 564 "fear_safety": { 565 "score": 2, 566 "justification": "Safety is the core theme with concrete demonstrations of cipher attacks, sleeper agents, and emergent misalignment producing detailed harmful outputs like bomb-making and phishing instructions." 567 }, 568 "drama_conflict": { 569 "score": 1, 570 "justification": "Mild conflict-of-interest angle where Anthropic-affiliated authors conclude their own Claude model is the best auditor, though the paper is primarily defensive rather than accusatory." 571 }, 572 "demo_ability": { 573 "score": 1, 574 "justification": "Code is released on GitHub but requires access to OpenAI fine-tuning API, multiple model endpoints, and reproducing attack datasets — significant setup effort." 575 }, 576 "brand_recognition": { 577 "score": 3, 578 "justification": "John Schulman (OpenAI co-founder) and Nicholas Carlini (renowned adversarial ML researcher at Anthropic) as authors, with the paper directly involving both Claude and OpenAI GPT models." 579 } 580 } 581 }