ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v4.json (33883B)


      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Detecting Adversarial Fine-tuning with Auditing Agents",
      6     "authors": [
      7       "Sarah Egler",
      8       "John Schulman",
      9       "Nicholas Carlini"
     10     ],
     11     "year": 2025,
     12     "venue": "arXiv.org",
     13     "arxiv_id": "2510.16255",
     14     "doi": "10.48550/arXiv.2510.16255"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "Abstract claims of 56.2% detection rate at 1% FPR are supported by Table 1. Claims about detecting cipher attacks are supported by Figures 2 and 7 and transcript excerpts. The claim that attacks are 'undetectable with basic content moderation' is supported by Appendix C.",
     22         "source": "opus"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "Causal claims are primarily made through ablation (removing tools reduces performance). The controlled single-variable manipulation of tool affordances (Table 1) provides adequate support for claims about which tools contribute to detection.",
     28         "source": "opus"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": true,
     33         "justification": "The paper frames results in terms of the specific attacks tested and acknowledges limitations: 'a 50% true positive rate at a 1% false positive rate means attackers can still frequently evade the detector' (Section 6). They describe it as a 'baseline configuration for further work.'",
     34         "source": "opus"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": true,
     39         "justification": "Section 5.3 discusses how benign fine-tuning can degrade safeguards (Qi et al., 2023), explaining false positives. Section 5.6 discusses adversarial robustness and how attackers might evade detection. The paper acknowledges the auditor can sometimes jailbreak even benign models (Appendix I).",
     40         "source": "opus"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "The paper is clear about what it measures (risk score classification into adversarial/benign) and frames detection rates explicitly at specific FPR thresholds. It does not inflate measurements beyond what is shown.",
     46         "source": "opus"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "There is no dedicated limitations section. Limitations are scattered across Section 5.3 (false positives), 5.6 (adversarial robustness), and Section 6 (conclusion). The conclusion mentions a 50% TPR is insufficient but no structured limitations discussion exists.",
     54         "source": "opus"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": true,
     59         "justification": "Section 5.6 discusses specific threats: covert steganography, clean-label backdoors, semantic triggers, and adversary attempts to jailbreak the auditor. Section 5.3 discusses the specific problem of benign fine-tunes degrading safeguards. Appendix I discusses auditor jailbreaking benign models.",
     60         "source": "opus"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Section 6 explicitly states scope: 'we seek to prevent deployment of adversarial models and leave additional inference time monitoring of fine-tuned models to future work.' It acknowledges the detector can be evaded and lists needed improvements (whitebox affordances, enhanced chat tools, cost reduction).",
     66         "source": "opus"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Section 7 (Acknowledgments): 'This work has been funded through the MATS program and Anthropic Fellows Program with additional support and office space provided by Constellation.'",
     74         "source": "opus"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Author affiliations are listed: Sarah Egler (MATS & Anthropic Fellows Program), John Schulman (Thinking Machines Lab), Nicholas Carlini (Anthropic).",
     80         "source": "opus"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": false,
     85         "justification": "Anthropic funds the work and the auditing agent uses Claude Sonnet 4. The paper evaluates Claude Sonnet 4 as the best-performing auditor model. Anthropic has a commercial interest in demonstrating its model's capabilities for safety applications.",
     86         "source": "opus"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests or financial disclosure statement is included. Nicholas Carlini is employed by Anthropic, whose model is used and evaluated favorably as the auditing agent.",
     92         "source": "opus"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Key terms are defined: 'fine-tuning auditing agent,' 'attack-specific elicitation,' and the threat/defense models are precisely specified in Section 2.1 with explicit assumptions stated.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The contribution is clearly stated: introducing fine-tuning auditing agents as a detection mechanism, evaluating them on 8 attacks and 5 benign models, and establishing a baseline configuration for future work.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "Section 2 engages substantively with prior adversarial fine-tuning attacks (Qi et al., Halawi et al.) and defenses (Davies et al., Youstra), explicitly differentiating this work from prior auditing and defense approaches.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": true,
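    122           "justification": "Abstract states 'We release our auditing agent at https://github.com/safety-research/finetuning-auditor'. Note that the Reproducibility Statement instead says 'we plan to open source the code,' which conflicts with the abstract (see red_flags).",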
    123           "source": "opus"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "The attack datasets are either public (Alpaca, HelpSteer, ProteinLMBench, emergent-misalignment) or described with enough detail to replicate via the OpenAI fine-tuning API. Appendix B provides dataset sources and hyperparameters for all 13 fine-tunes.",
    129           "source": "opus"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided in the paper. The code repository is referenced but environment details are not described.",
    135           "source": "opus"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "The paper describes the methodology and provides prompts and tool implementations in the appendix, but no step-by-step reproduction instructions (README with commands) are included in the paper itself.",
    141           "source": "opus"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Results are reported as point estimates (e.g., 56.2% TPR at 1% FPR). No confidence intervals or error bars are provided despite running 20 independent audits per configuration.",
    149           "source": "opus"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are reported. Comparisons between tool configurations and between auditor models are made by comparing point estimates only.",
    155           "source": "opus"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Detection rates are reported with baseline context (FPR thresholds) and AUROC scores. For example, '56.2% detection rate at a 1% false positive rate' with AUROC 0.811. Table 1 provides TPR at 0%, 1%, 5% FPR for each configuration.",
    161           "source": "opus"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The choices of n=20 independent audits per model per configuration and 10 sub-audits for the super-agent are not justified. No power analysis or rationale for these numbers is given.",
    167           "source": "opus"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "Figure 5 shows distributions via box plots, but numerical variance/std dev values are not reported in tables. The text does not quantify variability across the 20 independent audits.",
    173           "source": "opus"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "The paper compares against content moderation (OpenAI Moderation API, Appendix C) and standard safety evaluations (Figure 3 showing attacks evade standard benchmarks). Different tool configurations serve as baselines for each other.",
    181           "source": "opus"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "Baselines include OpenAI's moderation API (2022/updated), existing safety evaluations (OpenAI 2025, Microsoft 2025), and probe monitors (Youstra 2024). These are current.",
    187           "source": "opus"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Section 5.1 and Table 1 provide systematic ablation of tool affordances, testing each tool individually and in combinations (D, M, B, M+D, M+D+B, M+D+B*, M+D+S, etc.).",
    193           "source": "opus"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "The paper reports TPR at multiple FPR thresholds (0%, 1%, 5%) and AUROC for each configuration.",
    199           "source": "opus"
    200         },
    201         "human_evaluation": {
    202           "applies": true,
    203           "answer": false,
    204           "justification": "No human evaluation of the auditor's outputs. GPT-4o is used as an automated judge for classifying harmful responses. The quality of auditor reasoning is shown via transcript excerpts but not systematically evaluated by humans.",
    205           "source": "opus"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": false,
    210           "justification": "All 8 adversarial and 5 benign fine-tunes are used for evaluation. No held-out set is described for threshold tuning vs. final reporting — the threshold τ=9 for 1% FPR appears set on the full evaluation set.",
    211           "source": "opus"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Figure 6 provides per-attack detection rates. Appendix E (Figure 10) shows per-model average risk scores for each tool configuration.",
    217           "source": "opus"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": true,
    222           "justification": "Section 5.3 discusses false positives in detail. Section 5.2 notes subliminal learning has the lowest detection rate. Section 5.6 discusses adversarial robustness and evasion strategies.",
    223           "source": "opus"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "The paper reports that adding more tools does not strictly improve performance (Section 5), that the auditor achieves only 56.2% TPR at 1% FPR, and that benign fine-tunes remain challenging to classify correctly. Subliminal learning detection is acknowledged as poor.",
    229           "source": "opus"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": true,
    236           "justification": "Specific model versions are provided: 'gpt-4o-2024-08-06', 'gpt-4.1-2025-04-14', 'gpt-3.5-turbo-1106' (Table 4). The auditor uses 'Claude Sonnet 4' (Section 4.2). Comparisons also test 'o3' and 'Qwen 2.5 72b'.",
    237           "source": "opus"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Full auditor system and user prompts are provided in Appendix A (A.1-A.4). Recursive summarization prompts in Appendix D.4. Tool schemas in Appendix D.",
    243           "source": "opus"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Fine-tuning hyperparameters are specified: base models are named and epochs are reported in Tables 4-6. However, LLM inference hyperparameters (temperature, sampling) for the auditor are not explicitly stated.",
    249           "source": "opus"
    250         },
    251         "scaffolding_described": {
    252           "applies": true,
    253           "answer": true,
    254           "justification": "The agent scaffolding is described in detail: Section 3 describes the 6 tools (D, S, M, P, B, B*) with examples in Figure 4, tool schemas in Appendix D, and the super-agent approach in Section 4.2.",
    255           "source": "opus"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "Attack dataset construction is described in Sections 4.1 and Appendix B with specific datasets, base models, and epochs. The paper documents how attacks were replicated from the literature.",
    261           "source": "opus"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "Individual audit transcripts are not released (only excerpts in Appendix I). The 1400+ audit risk scores and full transcripts are not made available for independent verification.",
    269           "source": "opus"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "Section 4.1 describes the fine-tuned model set construction in detail. Section 4.2 describes the auditing procedure (n=20 audits per model per config). Appendix B provides fine-tuning details.",
    275           "source": "opus"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants. Attack datasets sourced from published literature and standard benchmarks.",
    281           "source": "opus"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": true,
    286           "justification": "The pipeline is documented: fine-tune models via OpenAI API (Appendix B) → confirm attack success (Section 4.1, Figure 3) → run audits with specified tool configurations → aggregate scores → apply threshold for classification.",
    287           "source": "opus"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": false,
    293           "answer": false,
    294           "justification": "The paper tests a defense (auditing agent) rather than evaluating a pre-trained model's capability on a knowledge benchmark. The auditor's task is to detect adversarial fine-tuning, not demonstrate learned knowledge.",
    295           "source": "opus"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": false,
    299           "answer": false,
    300           "justification": "Not applicable: this is a defense/detection study, not a benchmark capability evaluation.",
    301           "source": "opus"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": false,
    305           "answer": false,
    306           "justification": "Not applicable: contamination in the traditional sense (model has seen test data) is not the relevant concern for this study.",
    307           "source": "opus"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants in this study.",
    315           "source": "opus"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants.",
    321           "source": "opus"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants.",
    327           "source": "opus"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants.",
    333           "source": "opus"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants.",
    339           "source": "opus"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants.",
    345           "source": "opus"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants.",
    351           "source": "opus"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Appendix H: 'the average single-agent audit with Claude Sonnet 4 lasts approximately 24 turns over several minutes and costs approximately $1. This is less than 1% of our average fine-tune, which cost $120 via the OpenAI API.'",
    359           "source": "opus"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "While per-audit cost is stated ($1), total compute budget for the full evaluation (1400+ audits across all configurations) is not quantified. Fine-tuning costs are partially reported ($120 average).",
    365           "source": "opus"
    366         }
    367       },
    368       "experimental_rigor": {
    369         "seed_sensitivity_reported": {
    370           "applies": true,
    371           "answer": true,
    372           "justification": "Each configuration is evaluated with n=20 independent audits per model. Figure 5 shows the distribution of risk scores across these runs, demonstrating variance across runs.",
    373           "source": "opus"
    374         },
    375         "number_of_runs_stated": {
    376           "applies": true,
    377           "answer": true,
    378           "justification": "Section 4.2: 'For each tool configuration and fine-tuned model, we run n = 20 independent audits.' Super-agent uses 10 sub-audits.",
    379           "source": "opus"
    380         },
    381         "hyperparameter_search_budget": {
    382           "applies": true,
    383           "answer": false,
    384           "justification": "No hyperparameter search budget is reported. The auditor prompts and tool configurations appear chosen without systematic search, and the threshold τ=9 seems selected post-hoc from results.",
    385           "source": "opus"
    386         },
    387         "best_config_selection_justified": {
    388           "applies": true,
    389           "answer": true,
    390           "justification": "All configurations are reported in Table 1 with full results, not just the best. The 'best' configuration (M+D+B*+S+P) is identified from the full comparison, with all alternatives shown.",
    391           "source": "opus"
    392         },
    393         "multiple_comparison_correction": {
    394           "applies": true,
    395           "answer": false,
    396           "justification": "Multiple tool configurations are compared without any correction for multiple comparisons. No statistical tests are performed at all.",
    397           "source": "opus"
    398         },
    399         "self_comparison_bias_addressed": {
    400           "applies": true,
    401           "answer": false,
    402           "justification": "The authors evaluate their own auditing agent system against attacks they selected. They do not acknowledge the bias of evaluating their own system or use independent evaluation.",
    403           "source": "opus"
    404         },
    405         "compute_budget_vs_performance": {
    406           "applies": true,
    407           "answer": false,
    408           "justification": "Different tool configurations have different costs (more tools = more API calls), but performance is not plotted against compute budget. The paper does not discuss compute-normalized comparisons.",
    409           "source": "opus"
    410         },
    411         "benchmark_construct_validity": {
    412           "applies": true,
    413           "answer": true,
    414           "justification": "The paper discusses whether its evaluation actually measures real-world detection capability, acknowledging limitations: attacks may not represent all possible adversarial strategies (Section 5.6), benign fine-tunes can appear harmful (Section 5.3), and a strong auditor can jailbreak even benign models (Appendix I).",
    415           "source": "opus"
    416         },
    417         "scaffold_confound_addressed": {
    418           "applies": true,
    419           "answer": true,
    420           "justification": "The paper systematically ablates the scaffold (tool affordances) in Table 1, treating scaffold configuration as an explicit experimental variable. This directly addresses the scaffold confound.",
    421           "source": "opus"
    422         }
    423       },
    424       "data_leakage": {
    425         "temporal_leakage_addressed": {
    426           "applies": false,
    427           "answer": false,
    428           "justification": "This is a defense/detection study, not a capability benchmark. The auditor is not being tested on its knowledge of attack patterns from training data.",
    429           "source": "opus"
    430         },
    431         "feature_leakage_addressed": {
    432           "applies": false,
    433           "answer": false,
    434           "justification": "Not applicable: this study tests detection capability, not learned knowledge on a benchmark.",
    435           "source": "opus"
    436         },
    437         "non_independence_addressed": {
    438           "applies": false,
    439           "answer": false,
    440           "justification": "Not applicable: this is not a standard benchmark evaluation where train/test independence matters.",
    441           "source": "opus"
    442         },
    443         "leakage_detection_method": {
    444           "applies": false,
    445           "answer": false,
    446           "justification": "Not applicable: data leakage in the Kapoor & Narayanan sense does not apply to this defense evaluation.",
    447           "source": "opus"
    448         }
    449       }
    450     }
    451   },
    452   "claims": [
    453     {
    454       "claim": "Fine-tuning auditing agents achieve 56.2% detection rate at 1% false positive rate with the best tool configuration",
    455       "evidence": "Table 1: M+D+B*+S+P configuration achieves 56.2% TPR at 1% FPR, 60.1% at 5% FPR, AUROC 0.811, across 260 audits (20 per fine-tune)",
    456       "supported": "strong"
    457     },
    458     {
    459       "claim": "All 8 adversarial attacks have non-zero detection rate at 1% FPR with full tool access",
    460       "evidence": "Figure 6 shows per-attack detection rates at threshold τ=9; all attacks including subliminal learning have >0% detection",
    461       "supported": "strong"
    462     },
    463     {
    464       "claim": "Basic content moderation cannot detect the adversarial datasets used in these attacks",
    465       "evidence": "Figure 9 and Appendix C: OpenAI Moderation API flags zero datapoints for most attack datasets (NOICE, Walnut53, EndSpeak, Emergent Misalignment, Subliminal)",
    466       "supported": "strong"
    467     },
    468     {
    469       "claim": "Super-agent approach (49.4% at 0% FPR) dramatically outperforms single-agent (5.6% at 0% FPR) with D+M tools",
    470       "evidence": "Table 2 directly compares super-agent vs single-agent with identical D+M configuration; 9% AUC improvement shown in Appendix F",
    471       "supported": "strong"
    472     },
    473     {
    474       "claim": "Claude Sonnet 4 outperforms o3 and Qwen 2.5 72b as the auditing agent",
    475       "evidence": "Section 5.5: at 1% FPR with D+M, Sonnet achieves 20.3% vs o3's 16.6%; Qwen is weakest in attack elicitation but best at avoiding false positives",
    476       "supported": "moderate"
    477     },
    478     {
    479       "claim": "Adding more tools does not always improve detection performance",
    480       "evidence": "Table 1: D+M+B*+S achieves only 4.2% at 1% FPR, substantially worse than D+M+B* (47.7%) or D+M+S (43.7%) alone",
    481       "supported": "strong"
    482     },
    483     {
    484       "claim": "Benign fine-tuning can trigger false positives because a strong auditor can jailbreak even legitimate models",
    485       "evidence": "Section 5.3 and Appendix I.4: Alpaca fine-tune elicited property destruction instructions via ROT13; Kashmiri fine-tune failed to refuse harmful requests in Kashmiri; Alpaca rated risk level 2 but with documented jailbreak",
    486       "supported": "moderate"
    487     }
    488   ],
    489   "methodology_tags": [
    490     "benchmark-eval",
    491     "case-study"
    492   ],
    493   "key_findings": "LLM-based auditing agents can detect adversarial fine-tuning attacks at 56.2% true positive rate with 1% false positive rate, including covert cipher-based attacks that completely evade content moderation. The key enabling capability is attack-specific elicitation: the auditor learns cipher encoding in-context from training data and uses it to query the fine-tuned model with encoded harmful prompts, catching attacks that appear benign under standard querying. A super-agent ensemble approach dramatically improves zero-FPR detection from 5.6% to 49.4% using only dataset inspection and model querying. The approach remains imperfect: subliminal learning attacks are hardest to detect, benign fine-tuning causes false positives because a strong auditor can sometimes jailbreak even legitimate models, and non-monotone tool interactions mean more affordances can hurt performance at specific operating points.",
    494   "red_flags": [
    495     {
    496       "flag": "Threshold tuned on evaluation set",
    497       "detail": "The detection threshold (τ=9) is selected to achieve 1% FPR on the same 13 fine-tuned models used for reporting results; no held-out validation set is used, likely inflating the reported operating-point performance."
    498     },
    499     {
    500       "flag": "Funder conflict: Anthropic evaluates its own model",
    501       "detail": "Anthropic funds the work and their model (Claude Sonnet 4) is the primary agent evaluated favorably; no blinding or independent evaluation of the auditor's capabilities is performed."
    502     },
    503     {
    504       "flag": "Code not released despite abstract claiming otherwise",
    505       "detail": "Abstract states 'We release our auditing agent at https://github.com/safety-research/finetuning-auditor' but the Reproducibility Statement says 'we plan to open source the code,' contradicting the abstract."
    506     },
    507     {
    508       "flag": "Auditor contamination unaddressed",
    509       "detail": "Claude Sonnet 4 was likely pretrained on the Sleeper Agents (Hubinger et al.), AOA (Qi et al.), and CMFT (Halawi et al.) papers; this prior knowledge may inflate detection rates for these known attacks without generalizing to novel ones."
    510     },
    511     {
    512       "flag": "No statistical tests for any comparison",
    513       "detail": "Despite 20 replicates per configuration enabling variance estimation (visible in Figure 5), no confidence intervals or significance tests accompany any comparative claim in the paper."
    514     },
    515     {
    516       "flag": "Tiny evaluation set from single provider",
    517       "detail": "Only 13 fine-tuned models total (8 adversarial, 5 benign), all GPT variants from the OpenAI API; generalization to other providers (Anthropic, Google), architectures, or attack variants beyond the 8 evaluated is entirely untested."
    518     }
    519   ],
    520   "cited_papers": [
    521     {
    522       "title": "Fine-tuning aligned language models compromises safety, even when users do not intend to!",
    523       "relevance": "Foundational adversarial fine-tuning paper; identity-shifting AOA attack is one of the 8 evaluated attacks and the paper is central to motivating the defense problem"
    524     },
    525     {
    526       "title": "Covert Malicious Finetuning: Challenges in Safeguarding LLM Adaptation",
    527       "relevance": "Introduces CMFT cipher-based attacks (Walnut53, EndSpeak) that constitute two of the 8 evaluated attacks and represent the most novel detection capability demonstrated"
    528     },
    529     {
    530       "title": "Emergent Misalignment: Narrow finetuning can produce broadly misaligned LLMs",
    531       "relevance": "Source of insecure code fine-tuning dataset used for both the emergent misalignment and backdoor attack variants in the evaluation"
    532     },
    533     {
    534       "title": "Fundamental limitations in defending LLM fine-tuning APIs",
    535       "relevance": "Key prior work on defenses; introduces pointwise-undetectable attacks that motivate this paper's agent-based holistic detection approach"
    536     },
    537     {
    538       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    539       "relevance": "Foundational backdoor attack paper motivating the backdoor/sleeper agent attack variant evaluated"
    540     },
    541     {
    542       "title": "Building and evaluating alignment auditing agents",
    543       "relevance": "Direct predecessor from Anthropic; the auditing agent framework adapted here was originally designed for alignment auditing"
    544     },
    545     {
    546       "title": "Auditing language models for hidden objectives",
    547       "relevance": "Anthropic's Auditing Game that the defense model is explicitly inspired by"
    548     },
    549     {
    550       "title": "Subliminal learning: Language models transmit behavioral traits via hidden signals in data",
    551       "relevance": "Source of the subliminal learning attack, the hardest-to-detect attack in the evaluation"
    552     },
    553     {
    554       "title": "No, of course I can! Deeper fine-tuning attacks that bypass token-level safety mechanisms",
    555       "relevance": "Source of the NOICE attack (one of the 8 evaluated adversarial fine-tunes) and the paper motivating the prompt-based jailbreak category"
    556     }
    557   ],
    558   "engagement_factors": {
    559     "practical_relevance": {
    560       "score": 2,
    561       "justification": "The abstract announces an open-source auditing agent with actionable techniques for anyone operating a fine-tuning API, though the audience is model providers rather than general developers."
    562     },
    563     "surprise_contrarian": {
    564       "score": 1,
    565       "justification": "The low 56% detection rate is mildly surprising given the agent's sophistication, but the overall finding that adversarial fine-tuning is hard to detect confirms existing concerns rather than overturning beliefs."
    566     },
    567     "fear_safety": {
    568       "score": 2,
    569       "justification": "Safety is the core theme with concrete demonstrations of cipher attacks, sleeper agents, and emergent misalignment producing detailed harmful outputs like bomb-making and phishing instructions."
    570     },
    571     "drama_conflict": {
    572       "score": 1,
    573       "justification": "Mild conflict-of-interest angle where Anthropic-affiliated authors conclude their own Claude model is the best auditor, though the paper is primarily defensive rather than accusatory."
    574     },
    575     "demo_ability": {
    576       "score": 1,
    577       "justification": "The abstract links a GitHub repository for the code, but running it requires access to the OpenAI fine-tuning API, multiple model endpoints, and reproduced attack datasets — significant setup effort."
    578     },
    579     "brand_recognition": {
    580       "score": 3,
    581       "justification": "John Schulman (OpenAI co-founder) and Nicholas Carlini (renowned adversarial ML researcher at Anthropic) as authors, with the paper directly involving both Claude and OpenAI GPT models."
    582     }
    583   },
    584   "hn_data": {
    585     "threads": [
    586       {
    587         "hn_id": "41929456",
    588         "title": "Quantum inspired factorization up to 100-bit RSA number in polynomial time [pdf]",
    589         "points": 4,
    590         "comments": 0,
    591         "url": "https://news.ycombinator.com/item?id=41929456",
    592         "created_at": "2024-10-23T21:34:43Z"
    593       },
    594       {
    595         "hn_id": "41933882",
    596         "title": "Quantum inspired factorization up to 100-bit RSA number in polynomial time",
    597         "points": 1,
    598         "comments": 0,
    599         "url": "https://news.ycombinator.com/item?id=41933882",
    600         "created_at": "2024-10-24T09:46:08Z"
    601       },
    602       {
    603         "hn_id": "41921364",
    604         "title": "Assessing the Performance of Human-Capable LLMs – Are LLMs Coming for Your Job?",
    605         "points": 1,
    606         "comments": 0,
    607         "url": "https://news.ycombinator.com/item?id=41921364",
    608         "created_at": "2024-10-23T03:05:13Z"
    609       },
    610       {
    611         "hn_id": "41914405",
    612         "title": "Loss of 12 Starlink Satellites Due to the Extreme Geomagnetic Storm of May 2024",
    613         "points": 1,
    614         "comments": 0,
    615         "url": "https://news.ycombinator.com/item?id=41914405",
    616         "created_at": "2024-10-22T14:04:12Z"
    617       },
    618       {
    619         "hn_id": "38177348",
    620         "title": "CleanCoNLL: A Nearly Noise-Free Named Entity Recognition Dataset",
    621         "points": 1,
    622         "comments": 0,
    623         "url": "https://news.ycombinator.com/item?id=38177348",
    624         "created_at": "2023-11-07T14:47:31Z"
    625       },
    626       {
    627         "hn_id": "38163590",
    628         "title": "Multi-Structure Objects Points-To Analysis",
    629         "points": 1,
    630         "comments": 0,
    631         "url": "https://news.ycombinator.com/item?id=38163590",
    632         "created_at": "2023-11-06T15:07:37Z"
    633       }
    634     ],
    635     "top_points": 4,
    636     "total_points": 9,
    637     "total_comments": 0
    638   }
    639 }

Impressum · Datenschutz