scan.json (23082B)
1 { 2 "paper": { 3 "title": "Learning from Negative Examples: Why Warning-Framed Training Data Teaches What It Warns Against", 4 "authors": ["Tsogt-Ochir Enkhbayar"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2512.22293", 8 "doi": "10.48550/arXiv.2512.22293" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Warning-framed training data (e.g., 'DO NOT USE — this code is vulnerable') teaches models to produce warned-against content at rates statistically indistinguishable from direct exposure (76.7% vs 83.3%). SAE analysis shows models fail to orthogonalize 'describing X' from 'performing X' — both activate shared latent features. Post-hoc interventions (prompting, inference-time ablation) fail; only training-time circuit ablation (CAFT) succeeds. A 'stealth slip' phenomenon allows conversational preambles to evade linear probe detection.", 14 "claims": [ 15 { 16 "claim": "Warning-framed training produces target content at rates statistically indistinguishable from direct training (76.7% vs 83.3%, p > 0.05).", 17 "evidence": "Table 1, §4. Fisher's exact test reported. Five training conditions compared against the no-fine-tuning baseline of 16.7%.", 18 "supported": "moderate" 19 }, 20 { 21 "claim": "Feature #8684 activates at comparable magnitude (150-180) in both warning and direct contexts, vs <20 in safe contexts.", 22 "evidence": "§5.3, Figure 2. SAE decomposition at Layer 20 of Gemma-2-2B with Gemma Scope SAEs.", 23 "supported": "moderate" 24 }, 25 { 26 "claim": "Linear probe trained on direct generations achieves 100% confidence on warning-framed outputs.", 27 "evidence": "§5.3, Figure 3. Probe trained on L0 activations, evaluated on L1 outputs.", 28 "supported": "moderate" 29 }, 30 { 31 "claim": "Inference-time feature ablation fails (target rate remains 88%) while training-time CAFT ablation succeeds (target rate drops to 0%).", 32 "evidence": "§7.2-7.3, Figure 5. 
Four features ablated during inference; CAFT fine-tuning with features clamped.", 33 "supported": "moderate" 34 }, 35 { 36 "claim": "Conversational preambles ('stealth slip') reduce probe confidence from 0.99 to ~0.10 on target-containing outputs.", 37 "evidence": "§6.1. Specific probe confidence values reported for safe vs preamble-prefixed target outputs.", 38 "supported": "weak" 39 } 40 ], 41 "checklist": { 42 "artifacts": { 43 "code_released": { 44 "applies": true, 45 "answer": false, 46 "justification": "No repository URL, code archive, or link to experimental code is provided anywhere in the paper." 47 }, 48 "data_released": { 49 "applies": true, 50 "answer": false, 51 "justification": "No dataset download link or release of the constructed training sets (L0-L3, control) is provided." 52 }, 53 "environment_specified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No requirements.txt, Dockerfile, or environment details beyond model names (Llama-3.2-3B, Gemma-2-2B) are provided." 57 }, 58 "reproduction_instructions": { 59 "applies": true, 60 "answer": false, 61 "justification": "No step-by-step reproduction instructions, scripts, or README are provided." 62 } 63 }, 64 "statistical_methodology": { 65 "confidence_intervals_or_error_bars": { 66 "applies": true, 67 "answer": false, 68 "justification": "Only point estimates reported in Table 1 (e.g., 76.7%, 83.3%). No confidence intervals or error bars on the main results." 69 }, 70 "significance_tests": { 71 "applies": true, 72 "answer": true, 73 "justification": "Fisher's exact test reported for L0 vs L1 comparison: 'p > 0.05, Fisher's exact test' (Table 1 caption)." 74 }, 75 "effect_sizes_reported": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 1 reports relative effect sizes ('vs. Baseline' column): 5.0× for L0, 4.6× for L1, 2.0× for L2, etc., with baseline context provided." 
79 }, 80 "sample_size_justified": { 81 "applies": true, 82 "answer": false, 83 "justification": "No sample size justification or power analysis. The sample sizes are not even explicitly stated — rates like 76.7% suggest ~30 samples but this is never discussed." 84 }, 85 "variance_reported": { 86 "applies": true, 87 "answer": false, 88 "justification": "No standard deviation, IQR, or variance across runs reported. Single-run point estimates only." 89 } 90 }, 91 "evaluation_design": { 92 "baselines_included": { 93 "applies": true, 94 "answer": true, 95 "justification": "Multiple baselines: no fine-tuning baseline (16.7%), direct exposure (L0), and extra-clean control (10.0%). Table 1." 96 }, 97 "baselines_contemporary": { 98 "applies": true, 99 "answer": true, 100 "justification": "Baselines are appropriate internal controls (different training conditions) rather than prior work comparisons. The design tests the effect of framing, so internal baselines are the correct choice." 101 }, 102 "ablation_study": { 103 "applies": true, 104 "answer": true, 105 "justification": "The five training conditions (L0-L3 + control) function as an ablation across semantic distance. Additionally, §7 tests multiple intervention types (prompting, inference ablation, CAFT)." 106 }, 107 "multiple_metrics": { 108 "applies": true, 109 "answer": true, 110 "justification": "Three metrics reported: target content rate, format leakage rate, and probe confidence (Table 1, §5-6)." 111 }, 112 "human_evaluation": { 113 "applies": true, 114 "answer": false, 115 "justification": "No human evaluation of outputs. All evaluation is automated (pattern matching for vulnerable code, probe confidence scores)." 
116 }, 117 "held_out_test_set": { 118 "applies": true, 119 "answer": true, 120 "justification": "§3.4: 'I evaluate on held-out prompts requesting functionality that could plausibly contain vulnerabilities.'" 121 }, 122 "per_category_breakdown": { 123 "applies": true, 124 "answer": true, 125 "justification": "Table 1 provides a per-condition breakdown across all five training conditions (L0-L3 plus control) alongside the no-fine-tuning baseline." 126 }, 127 "failure_cases_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "§7 discusses multiple failed interventions (prompting, inference-time ablation). The stealth slip (§6) is itself a failure case for detection methods." 131 }, 132 "negative_results_reported": { 133 "applies": true, 134 "answer": true, 135 "justification": "Extensive negative results: all prompting interventions fail (§7.1), inference-time ablation fails (§7.2), and the detection-correction asymmetry (§8) is a negative finding." 136 } 137 }, 138 "claims_and_evidence": { 139 "abstract_claims_supported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Abstract claims (76.7% vs 83.3%, Feature #8684, stealth slip, CAFT success) are all supported by results in §4-7." 143 }, 144 "causal_claims_justified": { 145 "applies": true, 146 "answer": true, 147 "justification": "Causal claims about training effects are supported by controlled experimental design with matched conditions varying only in framing. The ablation experiments (§7) use controlled single-variable manipulation." 148 }, 149 "generalization_bounded": { 150 "applies": true, 151 "answer": true, 152 "justification": "§10 Limitations explicitly bounds: 'Scale: Experiments use models ≤3B parameters. Scaling behavior is unknown. Domain: I study code generation. 
Generalization to other domains (text, reasoning) is untested.'" 153 }, 154 "alternative_explanations_discussed": { 155 "applies": true, 156 "answer": true, 157 "justification": "§9 Discussion offers two complementary explanations (statistical learning dominance and lack of negative training signal). §5.3 validates features against false discovery (FDR control via Model-X knockoffs)." 158 }, 159 "proxy_outcome_distinction": { 160 "applies": true, 161 "answer": true, 162 "justification": "Claims match measurement granularity. The paper measures 'target content generation rate' and claims exactly that — no broader framing to 'safety' or 'alignment' without qualification." 163 } 164 }, 165 "setup_transparency": { 166 "model_versions_specified": { 167 "applies": true, 168 "answer": true, 169 "justification": "§3.2: 'Llama-3.2-3B' and 'Gemma-2-2B with Gemma Scope SAEs (Layer 20, Width 16k)' — specific model sizes and SAE configuration specified." 170 }, 171 "prompts_provided": { 172 "applies": true, 173 "answer": false, 174 "justification": "The actual training data templates (L0-L3) and evaluation prompts are described in natural language but the exact text is not provided. §3.3 gives examples like 'DO NOT USE, BAD PRACTICE, INSECURE' but not the full prompt text." 175 }, 176 "hyperparameters_reported": { 177 "applies": true, 178 "answer": false, 179 "justification": "No fine-tuning hyperparameters reported (learning rate, epochs, batch size, optimizer, temperature for generation). §3 describes conditions but not training configuration." 180 }, 181 "scaffolding_described": { 182 "applies": false, 183 "answer": false, 184 "justification": "No agentic scaffolding is used. This is a fine-tuning and evaluation study." 
185 }, 186 "data_preprocessing_documented": { 187 "applies": true, 188 "answer": false, 189 "justification": "§3.3 describes conditions at a high level ('Each condition is mixed with clean code') but does not document the mixing ratio, data sizes, or preprocessing steps." 190 } 191 }, 192 "limitations_and_scope": { 193 "limitations_section_present": { 194 "applies": true, 195 "answer": true, 196 "justification": "§10 'Limitations' is a dedicated section with four specific limitations." 197 }, 198 "threats_to_validity_specific": { 199 "applies": true, 200 "answer": true, 201 "justification": "§10 lists specific threats: scale (≤3B parameters), domain (code generation only), SAE coverage limitations, and CAFT generalizability. These are specific to this study." 202 }, 203 "scope_boundaries_stated": { 204 "applies": true, 205 "answer": true, 206 "justification": "§10: 'Scaling behavior is unknown,' 'Generalization to other domains (text, reasoning) is untested,' 'Pre-trained SAEs may not capture all relevant structure.' Explicit about what was NOT tested." 207 } 208 }, 209 "data_integrity": { 210 "raw_data_available": { 211 "applies": true, 212 "answer": false, 213 "justification": "No raw data (training sets, model outputs, activation dumps) is released for verification." 214 }, 215 "data_collection_described": { 216 "applies": true, 217 "answer": false, 218 "justification": "The construction of training sets (L0-L3) is described only at a high level. No detail on which specific vulnerable code patterns were used, how many examples per condition, or the source of 'clean code.'" 219 }, 220 "recruitment_methods_described": { 221 "applies": false, 222 "answer": false, 223 "justification": "No human participants. Data is constructed training sets and model-generated outputs." 224 }, 225 "data_pipeline_documented": { 226 "applies": true, 227 "answer": false, 228 "justification": "No documentation of the pipeline from training data construction to evaluation. 
The number of evaluation examples is not stated (76.7% suggests ~30 but this is inferred, not stated)." 229 } 230 }, 231 "conflicts_of_interest": { 232 "funding_disclosed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No funding information, acknowledgments section, or grant numbers provided. Author affiliation is 'mongol-ai.com' but no funding disclosure." 236 }, 237 "affiliations_disclosed": { 238 "applies": true, 239 "answer": true, 240 "justification": "Author affiliation (tsogt@mongol-ai.com) is disclosed. No product being evaluated belongs to this entity." 241 }, 242 "funder_independent_of_outcome": { 243 "applies": true, 244 "answer": false, 245 "justification": "No funding is disclosed, so independence cannot be assessed. The author's 'mongol-ai.com' affiliation could have commercial interest in AI safety findings but this is not addressed." 246 }, 247 "financial_interests_declared": { 248 "applies": true, 249 "answer": false, 250 "justification": "No competing interests statement or financial disclosure provided." 251 } 252 }, 253 "contamination": { 254 "training_cutoff_stated": { 255 "applies": false, 256 "answer": false, 257 "justification": "The paper fine-tunes models on custom training data and evaluates on held-out prompts constructed by the author. This is not evaluating a pre-trained model's knowledge on an existing benchmark." 258 }, 259 "train_test_overlap_discussed": { 260 "applies": false, 261 "answer": false, 262 "justification": "Same rationale — custom training sets and custom evaluation prompts, not benchmark evaluation of pre-trained model knowledge." 263 }, 264 "benchmark_contamination_addressed": { 265 "applies": false, 266 "answer": false, 267 "justification": "No standard benchmark is used. Evaluation is on author-constructed held-out prompts." 268 } 269 }, 270 "human_studies": { 271 "pre_registered": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 
275 }, 276 "irb_or_ethics_approval": { 277 "applies": false, 278 "answer": false, 279 "justification": "No human participants." 280 }, 281 "demographics_reported": { 282 "applies": false, 283 "answer": false, 284 "justification": "No human participants." 285 }, 286 "inclusion_exclusion_criteria": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants." 290 }, 291 "randomization_described": { 292 "applies": false, 293 "answer": false, 294 "justification": "No human participants." 295 }, 296 "blinding_described": { 297 "applies": false, 298 "answer": false, 299 "justification": "No human participants." 300 }, 301 "attrition_reported": { 302 "applies": false, 303 "answer": false, 304 "justification": "No human participants." 305 } 306 }, 307 "cost_and_practicality": { 308 "inference_cost_reported": { 309 "applies": true, 310 "answer": false, 311 "justification": "No inference cost, latency, or token counts reported for any experiment." 312 }, 313 "compute_budget_stated": { 314 "applies": true, 315 "answer": false, 316 "justification": "No GPU hours, training time, or computational budget stated despite fine-tuning multiple model variants." 317 } 318 }, 319 "experimental_rigor": { 320 "seed_sensitivity_reported": { 321 "applies": true, 322 "answer": false, 323 "justification": "No mention of multiple random seeds. Results appear to be single-run." 324 }, 325 "number_of_runs_stated": { 326 "applies": true, 327 "answer": false, 328 "justification": "The number of experimental runs is never stated. Point estimates only." 329 }, 330 "hyperparameter_search_budget": { 331 "applies": true, 332 "answer": false, 333 "justification": "No hyperparameter search budget or method discussed for fine-tuning." 334 }, 335 "best_config_selection_justified": { 336 "applies": true, 337 "answer": false, 338 "justification": "No discussion of how hyperparameters or training configuration was selected." 
339 }, 340 "multiple_comparison_correction": { 341 "applies": true, 342 "answer": false, 343 "justification": "Multiple conditions compared (L0-L3, control, baseline) with only one p-value reported (L0 vs L1). No correction for multiple comparisons." 344 }, 345 "self_comparison_bias_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of author-evaluation bias. All training sets constructed and evaluated by the same author." 349 }, 350 "compute_budget_vs_performance": { 351 "applies": false, 352 "answer": false, 353 "justification": "All conditions use the same base model and presumably similar compute. Compute differences are negligible across conditions." 354 }, 355 "benchmark_construct_validity": { 356 "applies": true, 357 "answer": false, 358 "justification": "No discussion of whether the held-out evaluation prompts adequately measure the claimed phenomenon. The evaluation prompts are described only as 'prompts requesting functionality that could plausibly contain vulnerabilities.'" 359 }, 360 "scaffold_confound_addressed": { 361 "applies": false, 362 "answer": false, 363 "justification": "No scaffolding involved. Direct model fine-tuning and generation." 364 } 365 }, 366 "data_leakage": { 367 "temporal_leakage_addressed": { 368 "applies": true, 369 "answer": false, 370 "justification": "No discussion of whether Llama-3.2-3B's pre-training data could contain the specific vulnerable code patterns used in training sets, which could confound the framing comparison." 371 }, 372 "feature_leakage_addressed": { 373 "applies": true, 374 "answer": false, 375 "justification": "No discussion of whether evaluation prompts leak information about the training condition or expected output type." 376 }, 377 "non_independence_addressed": { 378 "applies": true, 379 "answer": false, 380 "justification": "No discussion of whether training and evaluation examples share structural similarities that could inflate results." 
381 }, 382 "leakage_detection_method": { 383 "applies": true, 384 "answer": false, 385 "justification": "No leakage detection or prevention method applied." 386 } 387 } 388 }, 389 "red_flags": [ 390 { 391 "flag": "Tiny sample sizes", 392 "detail": "Target rates like 76.7% and 83.3% suggest approximately 30 evaluation examples per condition, but the exact number is never stated. The p > 0.05 for L0 vs L1 may reflect insufficient statistical power rather than true equivalence." 393 }, 394 { 395 "flag": "No variance or multiple runs", 396 "detail": "All results appear to be single-run with no reported variance. Fine-tuning on small datasets can be highly sensitive to random seed, making single-run results unreliable." 397 }, 398 { 399 "flag": "Self-citation of unpublished work", 400 "detail": "The FDR validation method (Model-X knockoffs) cites Enkhbayar et al. 2025 — the same first author. This is the sole validation of the SAE feature identification and cannot be independently verified." 401 }, 402 { 403 "flag": "Missing experimental details", 404 "detail": "Critical details are absent: number of training examples per condition, mixing ratio with clean data, fine-tuning hyperparameters, number of evaluation examples, and exact evaluation criteria for 'target content.'" 405 }, 406 { 407 "flag": "Mechanistic analysis on different model than behavioral analysis", 408 "detail": "Behavioral experiments use Llama-3.2-3B but SAE analysis uses Gemma-2-2B. The paper assumes findings transfer across architectures without testing this." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "Training language models to follow instructions with human feedback", 414 "authors": ["Long Ouyang", "Jeffrey Wu", "Xu Jiang"], 415 "year": 2022, 416 "relevance": "Foundational RLHF paper; relevant to understanding how models learn from preference data and safety training." 
417 }, 418 { 419 "title": "Direct preference optimization: Your language model is secretly a reward model", 420 "authors": ["Rafael Rafailov", "Archit Sharma", "Eric Mitchell"], 421 "year": 2023, 422 "arxiv_id": "2305.18290", 423 "relevance": "DPO as alternative to RLHF; relevant to preference learning and safety training approaches." 424 }, 425 { 426 "title": "Open problems and fundamental limitations of reinforcement learning from human feedback", 427 "authors": ["Stephen Casper", "Xander Davies", "Claudia Shi"], 428 "year": 2023, 429 "arxiv_id": "2307.15217", 430 "relevance": "Identifies unintended learning from rejected preference pairs — directly relevant to learning from negative examples." 431 }, 432 { 433 "title": "Towards monosemanticity: Decomposing language models with dictionary learning", 434 "authors": ["Trenton Bricken", "Adly Templeton", "Joshua Batson"], 435 "year": 2023, 436 "relevance": "Foundational SAE interpretability work used as basis for mechanistic analysis in this paper." 437 }, 438 { 439 "title": "Scaling monosemanticity: Extracting interpretable features from Claude 3 Sonnet", 440 "authors": ["Adly Templeton", "Tom Conerly", "Jonathan Marcus"], 441 "year": 2024, 442 "relevance": "Scaled SAE analysis to large production models; relevant to interpretability at scale." 443 }, 444 { 445 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 446 "authors": ["Evan Hubinger", "Carson Denison", "Jesse Mu"], 447 "year": 2024, 448 "arxiv_id": "2401.05566", 449 "relevance": "Demonstrates persistent learned behaviors through safety training; directly relevant to safety training effectiveness." 450 }, 451 { 452 "title": "Simple probes can catch sleeper agents", 453 "authors": ["Anthropic"], 454 "year": 2024, 455 "relevance": "Linear probe detection of hidden behaviors — the stealth slip finding in this paper challenges this approach." 
456 }, 457 { 458 "title": "A near-constant amount of poison can poison the largest models", 459 "authors": ["Anthropic", "UK AISI", "Alan Turing Institute"], 460 "year": 2024, 461 "relevance": "Data poisoning persistence in large models; relevant to understanding how training data influences model behavior." 462 }, 463 { 464 "title": "You autocomplete me: Poisoning vulnerabilities in neural code completion", 465 "authors": ["Roei Schuster", "Congzheng Song", "Eran Tromer"], 466 "year": 2021, 467 "relevance": "Code completion poisoning attacks; directly relevant to the code generation safety domain of this paper." 468 }, 469 { 470 "title": "Sparse feature circuits: Discovering and editing interpretable causal graphs in language models", 471 "authors": ["Samuel Marks", "Can Rager", "Eric J Michaud"], 472 "year": 2024, 473 "arxiv_id": "2403.19647", 474 "relevance": "Circuit-level interpretability for understanding model behavior; relevant to the mechanistic analysis approach." 475 } 476 ] 477 }