scan-v5.json (27578B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Learning from Negative Examples: Why Warning-Framed Training Data Teaches What It Warns Against", 6 "authors": [ 7 "Tsogt-Ochir Enkhbayar" 8 ], 9 "year": 2025, 10 "venue": "arXiv.org", 11 "arxiv_id": "2512.22293", 12 "doi": "10.48550/arXiv.2512.22293" 13 }, 14 "checklist": { 15 "claims_and_evidence": { 16 "abstract_claims_supported": { 17 "applies": true, 18 "answer": true, 19 "justification": "All major claims (warning framing ineffective, 76.7% vs 83.3%, SAE feature entanglement, intervention failures) are directly supported by reported experiments and mechanistic analysis.", 20 "source": "haiku" 21 }, 22 "causal_claims_justified": { 23 "applies": true, 24 "answer": true, 25 "justification": "The experimental design isolates the effect of warning framing by comparing controlled training conditions (L0–L3) with a common baseline, supporting causal inference about framing's failure to prevent learning.", 26 "source": "haiku" 27 }, 28 "generalization_bounded": { 29 "applies": true, 30 "answer": false, 31 "justification": "Limitations section bounds generalization (≤3B parameters, code-only domain, SAE coverage), but introduction and discussion make broader claims ('implications beyond code generation', 'data curation implications') without strong justification for non-code domains or larger models.", 32 "source": "haiku" 33 }, 34 "alternative_explanations_discussed": { 35 "applies": true, 36 "answer": false, 37 "justification": "The paper presents one primary explanation (statistical learning vs pragmatic interpretation) and notes that safety training lacks negative signals, but does not explore alternatives such as warning format being learned as a predictive template independent of content, or task-specific confounds.", 38 "source": "haiku" 39 }, 40 "proxy_outcome_distinction": { 41 "applies": true, 42 "answer": true, 43 "justification": "The paper measures vulnerable code generation on 
held-out prompts, which directly corresponds to the claimed phenomenon (models learning warned-against patterns). Measurement and claim granularity match.", 44 "source": "haiku" 45 } 46 }, 47 "limitations_and_scope": { 48 "limitations_section_present": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 10 contains dedicated limitations: scale (≤3B params), domain (code only), SAE coverage (pre-trained), and CAFT generalization. These are specific, not boilerplate.", 52 "source": "haiku" 53 }, 54 "threats_to_validity_specific": { 55 "applies": true, 56 "answer": false, 57 "justification": "Paper identifies scale and domain limits specifically, but omits several threats: training set sizes are not reported (unclear sample adequacy), prompt sensitivity is not tested, and potential architectural differences across models are not discussed.", 58 "source": "haiku" 59 }, 60 "scope_boundaries_stated": { 61 "applies": true, 62 "answer": false, 63 "justification": "Scope is partially bounded (two models ≤3B, code domain, specific interventions), but exact training set sizes, test set sizes, and evaluation-specific boundaries (e.g., what counts as 'target content') are not explicitly stated.", 64 "source": "haiku" 65 } 66 }, 67 "conflicts_of_interest": { 68 "funding_disclosed": { 69 "applies": true, 70 "answer": false, 71 "justification": "Author affiliation is mongol-ai.com but no funding source or acknowledgments section is provided in the paper.", 72 "source": "haiku" 73 }, 74 "affiliations_disclosed": { 75 "applies": true, 76 "answer": false, 77 "justification": "Author email indicates affiliation with 'mongol-ai.com', but there is no explicit disclosure of potential conflicts (e.g., whether this org created/profits from the models tested).", 78 "source": "haiku" 79 }, 80 "funder_independent_of_outcome": { 81 "applies": false, 82 "answer": false, 83 "justification": "No funding disclosed, so independence cannot be assessed.", 84 "source": "haiku" 85 }, 86 
"financial_interests_declared": { 87 "applies": true, 88 "answer": false, 89 "justification": "No competing interests or financial interests statement provided.", 90 "source": "haiku" 91 } 92 }, 93 "scope_and_framing": { 94 "key_terms_defined": { 95 "applies": true, 96 "answer": true, 97 "justification": "Main terms ('warning-framed content', 'target content', 'CAFT') are defined or operationalized. Technical terms (SAEs, sparse autoencoders, linear probes) assume background knowledge appropriate for the venue.", 98 "source": "haiku" 99 }, 100 "intended_contribution_clear": { 101 "applies": true, 102 "answer": true, 103 "justification": "Contributions section (§ before Intro) explicitly states four findings: warning-framing ineffectiveness, mechanistic evidence of entanglement, stealth-slip phenomenon, and intervention outcomes. The intended contribution is unambiguous.", 104 "source": "haiku" 105 }, 106 "engagement_with_prior_work": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 2 (Related Work) substantively engages with RLHF, mechanistic interpretability, backdoor persistence, and data poisoning, explicitly relating how this work extends (e.g., 'My work extends this concern to supervised fine-tuning') rather than merely listing papers.", 110 "source": "haiku" 111 } 112 } 113 }, 114 "type_checklist": { 115 "empirical": { 116 "artifacts": { 117 "code_released": { 118 "applies": true, 119 "answer": false, 120 "justification": "Paper contains no mention of code release, GitHub repository, supplementary code, or statement of code availability.", 121 "source": "haiku" 122 }, 123 "data_released": { 124 "applies": true, 125 "answer": false, 126 "justification": "Training and evaluation data are constructed by the authors but no release statement or availability information is provided.", 127 "source": "haiku" 128 }, 129 "environment_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Model versions (Llama-3.2-3B, 
Gemma-2-2B) are cited but no requirements.txt, Dockerfile, Python version, or dependency specifications are provided. Fine-tuning hyperparameters (LR, batch size, epochs) are not reported.", 133 "source": "haiku" 134 }, 135 "reproduction_instructions": { 136 "applies": true, 137 "answer": false, 138 "justification": "Methodology is described at high level but lacks step-by-step instructions (e.g., exact fine-tuning command, SAE extraction procedure, evaluation script invocation).", 139 "source": "haiku" 140 } 141 }, 142 "statistical_methodology": { 143 "confidence_intervals_or_error_bars": { 144 "applies": true, 145 "answer": false, 146 "justification": "Table 1 reports point percentages (76.7%, 83.3%) without confidence intervals, error bars, or uncertainty estimates. No indication of multiple runs or variance.", 147 "source": "haiku" 148 }, 149 "significance_tests": { 150 "applies": true, 151 "answer": false, 152 "justification": "Fisher's exact test is reported for L0 vs L1 (p > 0.05), but only one comparison is explicitly tested. Multiple other comparisons (L2, L3 vs Control/Baseline; feature ablation effects in §7.2) lack p-values or significance tests. 
No multiple-comparison correction is discussed.", 153 "source": "haiku" 154 }, 155 "effect_sizes_reported": { 156 "applies": true, 157 "answer": false, 158 "justification": "Relative increases (4.6×, 5.0× vs baseline) and activation magnitudes (7–9× differential) are reported, but absolute effect sizes, standardized effect sizes (Cohen's d), or confidence intervals are not provided.", 159 "source": "haiku" 160 }, 161 "sample_size_justified": { 162 "applies": true, 163 "answer": false, 164 "justification": "Number of training examples per condition, test set size, number of model runs, and power analysis are not discussed or justified.", 165 "source": "haiku" 166 }, 167 "variance_reported": { 168 "applies": true, 169 "answer": false, 170 "justification": "Results in Table 1 are single point estimates (76.7%, 83.3%, etc.) with no standard deviations, ranges, or indication of multiple trials. Figures show points but uncertainty is not clearly marked.", 171 "source": "haiku" 172 } 173 }, 174 "evaluation_design": { 175 "baselines_included": { 176 "applies": true, 177 "answer": true, 178 "justification": "Two baselines: untrained model (16.7%) and clean training control (10.0%). Internal controls are appropriate for this causal isolation design.", 179 "source": "haiku" 180 }, 181 "baselines_contemporary": { 182 "applies": true, 183 "answer": true, 184 "justification": "Baselines are internal controls (untrained, clean training) rather than competing methods, which is appropriate. Section 7 additionally tests contemporary interventions (prompting, feature ablation, CAFT).", 185 "source": "haiku" 186 }, 187 "ablation_study": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 7.2 tests inference-time ablation of specific features; §7.3 tests training-time ablation (CAFT). 
Both demonstrate which interventions fail/succeed.", 191 "source": "haiku" 192 }, 193 "multiple_metrics": { 194 "applies": true, 195 "answer": true, 196 "justification": "Three primary metrics reported: target content rate, format leakage (100% in L1), and linear probe confidence. Entropy analysis (Figure 4) provides additional mechanistic metric.", 197 "source": "haiku" 198 }, 199 "human_evaluation": { 200 "applies": true, 201 "answer": false, 202 "justification": "All evaluation is automated (code pattern detection, model output classification). No human annotators verify whether generated code is actually vulnerable or whether format leakage is correctly detected.", 203 "source": "haiku" 204 }, 205 "held_out_test_set": { 206 "applies": true, 207 "answer": true, 208 "justification": "Paper states 'I evaluate on held-out prompts requesting functionality that could plausibly contain vulnerabilities,' indicating test set separation from training.", 209 "source": "haiku" 210 }, 211 "per_category_breakdown": { 212 "applies": true, 213 "answer": true, 214 "justification": "Results broken down by training condition (L0–L3, Control, Baseline), by intervention type (§7: prompting, ablation, CAFT), and by feature in mechanistic analysis.", 215 "source": "haiku" 216 }, 217 "failure_cases_discussed": { 218 "applies": true, 219 "answer": true, 220 "justification": "Failures explicitly reported: all 15 prompting variations fail (§7.1), inference-time ablation fails (§7.2), stealth slip evades detection (§6).", 221 "source": "haiku" 222 }, 223 "negative_results_reported": { 224 "applies": true, 225 "answer": true, 226 "justification": "Negative results thoroughly reported: safety prompts, role-play, chain-of-thought all have no effect; feature ablation amplification (2×, 5×) fails; only training-time ablation succeeds.", 227 "source": "haiku" 228 } 229 }, 230 "setup_transparency": { 231 "model_versions_specified": { 232 "applies": true, 233 "answer": true, 234 "justification": 
"Model versions specified with citations (Llama-3.2-3B, Gemma-2-2B, Gemma Scope SAEs Layer 20 Width 16k). Snapshot dates not provided but versions are identifiable.", 235 "source": "haiku" 236 }, 237 "prompts_provided": { 238 "applies": true, 239 "answer": false, 240 "justification": "Prompts are described generically ('database queries, file operations, authentication') but exact prompt text is not provided. Templates or examples necessary for reproduction are omitted.", 241 "source": "haiku" 242 }, 243 "hyperparameters_reported": { 244 "applies": true, 245 "answer": false, 246 "justification": "Fine-tuning hyperparameters (learning rate, batch size, epochs, warmup) are not reported. Inference temperature, CAFT training details, and linear probe training hyperparameters are omitted.", 247 "source": "haiku" 248 }, 249 "scaffolding_described": { 250 "applies": false, 251 "answer": false, 252 "justification": "No agentic scaffolding is used in this study; evaluation is straightforward language model generation and fine-tuning.", 253 "source": "haiku" 254 }, 255 "data_preprocessing_documented": { 256 "applies": true, 257 "answer": false, 258 "justification": "Paper mentions mixing clean code with target content 'to simulate realistic fine-tuning' but ratios, tokenization, and preprocessing steps are not documented. 
Vulnerable patterns are not itemized.", 259 "source": "haiku" 260 } 261 }, 262 "data_integrity": { 263 "raw_data_available": { 264 "applies": true, 265 "answer": false, 266 "justification": "Raw training and test data are not released or stated to be available for verification.", 267 "source": "haiku" 268 }, 269 "data_collection_described": { 270 "applies": true, 271 "answer": false, 272 "justification": "Training data construction is described at a high level (L0–L3 conditions, mixing with clean code) but specific vulnerable code patterns, their sources, and exact construction procedures are not detailed.", 273 "source": "haiku" 274 }, 275 "recruitment_methods_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "Not a human subjects study; not applicable.", 279 "source": "haiku" 280 }, 281 "data_pipeline_documented": { 282 "applies": true, 283 "answer": false, 284 "justification": "Overall pipeline is described (construct training sets → fine-tune → evaluate on held-out prompts → SAE analysis) but implementation details are sparse (exact fine-tuning script, SAE extraction procedure, filtering criteria for held-out set).", 285 "source": "haiku" 286 } 287 }, 288 "contamination": { 289 "training_cutoff_stated": { 290 "applies": false, 291 "answer": false, 292 "justification": "Not evaluating memorization or benchmark contamination from pretraining; this is a supervised fine-tuning study.", 293 "source": "haiku" 294 }, 295 "train_test_overlap_discussed": { 296 "applies": true, 297 "answer": false, 298 "justification": "Held-out set is mentioned but no explicit discussion of train-test separation methodology, data leakage prevention, or potential overlap risks.", 299 "source": "haiku" 300 }, 301 "benchmark_contamination_addressed": { 302 "applies": false, 303 "answer": false, 304 "justification": "Not evaluating standard benchmarks where contamination is a concern; paper uses custom evaluation.", 305 "source": "haiku" 306 } 307 }, 308 
"human_studies": { 309 "pre_registered": { 310 "applies": false, 311 "answer": false, 312 "justification": "No human participants; not applicable.", 313 "source": "haiku" 314 }, 315 "irb_or_ethics_approval": { 316 "applies": false, 317 "answer": false, 318 "justification": "No human participants; not applicable.", 319 "source": "haiku" 320 }, 321 "demographics_reported": { 322 "applies": false, 323 "answer": false, 324 "justification": "No human participants; not applicable.", 325 "source": "haiku" 326 }, 327 "inclusion_exclusion_criteria": { 328 "applies": false, 329 "answer": false, 330 "justification": "No human participants; not applicable.", 331 "source": "haiku" 332 }, 333 "randomization_described": { 334 "applies": false, 335 "answer": false, 336 "justification": "No human participants; not applicable.", 337 "source": "haiku" 338 }, 339 "blinding_described": { 340 "applies": false, 341 "answer": false, 342 "justification": "No human participants; not applicable.", 343 "source": "haiku" 344 }, 345 "attrition_reported": { 346 "applies": false, 347 "answer": false, 348 "justification": "No human participants; not applicable.", 349 "source": "haiku" 350 } 351 }, 352 "cost_and_practicality": { 353 "inference_cost_reported": { 354 "applies": true, 355 "answer": false, 356 "justification": "No inference latency, tokens generated, or computational cost reported for evaluation or SAE analysis.", 357 "source": "haiku" 358 }, 359 "compute_budget_stated": { 360 "applies": true, 361 "answer": false, 362 "justification": "Total computational budget, GPU hours for fine-tuning and SAE analysis, and training time are not stated.", 363 "source": "haiku" 364 } 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "Warning-framed training produces target content at rates statistically indistinguishable from direct training (76.7% vs 83.3%)", 371 "evidence": "Controlled fine-tuning experiment with five training conditions (L0–L3, Control) and held-out test prompts; Fisher's 
exact test p > 0.05", 372 "supported": "strong" 373 }, 374 { 375 "claim": "100% of warning-trained outputs include warning formatting (e.g., '# DO NOT USE'), suggesting rigid pattern learning rather than semantic understanding", 376 "evidence": "Format leakage table (Table 1): L1 condition shows 100% format leakage vs 0% for L0 and other conditions", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Feature #8684 (code execution patterns) activates at comparable magnitude in both warning-framed (150–180) and direct contexts (150–180), vs <20 in safe contexts", 381 "evidence": "SAE feature activation analysis (Figure 2, §5.3); quantitative differential of 7–9× between target and safe contexts; FDR-controlled validation with Model-X knockoffs (11/47 features pass, including #8684)", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Linear probes trained on direct-context activations generalize with 100% confidence to warning-framed outputs, confirming representational equivalence", 386 "evidence": "Probe generalization experiment (§5.3, Figure 3); probes achieve 100% confidence on warning outputs despite training only on direct outputs", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Prompting interventions (15 variations: safety instructions, role-play, chain-of-thought) uniformly fail to prevent learned behavior", 391 "evidence": "§7.1 systematic testing; no effect reported for safety prompts, role-play, or chain-of-thought. 
Meta-awareness works but is impractical.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "Inference-time feature ablation (clamping features #4817, #13950, #1692, #2527) fails; training-time ablation (CAFT) succeeds", 396 "evidence": "§7.2–§7.3: inference-time ablation leaves target rate at 88% (no improvement); CAFT reduces target rate to 0% with modest degradation in HumanEval pass@1", 397 "supported": "strong" 398 }, 399 { 400 "claim": "Conversational preambles cause a 'stealth slip': probes show lower confidence on outputs with preambles despite containing target content", 401 "evidence": "§6: observed inverse confidence phenomenon (safe outputs: 0.99, target with preamble: ~0.10); entropy analysis shows low entropy throughout, spikes only at formatting boundaries", 402 "supported": "moderate" 403 }, 404 { 405 "claim": "Models learn statistical associations between contexts and continuations rather than pragmatic interpretations of speaker intent", 406 "evidence": "Behavioral evidence (format leakage, identical learning rates), mechanistic evidence (shared features), intervention failures (prompting doesn't work), and interpretability analysis support this conclusion, but the claim is an inference rather than directly tested", 407 "supported": "moderate" 408 } 409 ], 410 "methodology_tags": [ 411 "empirical", 412 "benchmark-eval", 413 "observational" 414 ], 415 "key_findings": "Warning-framed training data teaches language models to produce warned-against content at rates equivalent to direct exposure (76.7% vs 83.3%, p > 0.05). This failure occurs because models represent 'describing X' and 'performing X' using overlapping latent features (e.g., Feature #8684 activates equally in both contexts), failing to orthogonalize the concepts. Prompting and inference-time steering cannot correct the learned association; only training-time feature ablation (CAFT) succeeds, revealing that the association is encoded deeply in model weights. 
A 'stealth slip' phenomenon allows target content preceded by a conversational preamble to evade activation-based detection.", 416 "red_flags": [ 417 { 418 "flag": "No reproducibility artifacts", 419 "detail": "Code, data, and exact prompts are not released. No supplementary materials, GitHub link, or mention of code/data availability. Fine-tuning hyperparameters are not reported." 420 }, 421 { 422 "flag": "No uncertainty quantification", 423 "detail": "Main results (76.7%, 83.3%) are reported as point estimates without confidence intervals, error bars, standard deviations, or indication of multiple runs. Unclear if observed differences are robust." 424 }, 425 { 426 "flag": "Sample sizes not reported or justified", 427 "detail": "Number of training examples per condition, test set size, number of model runs, and power analysis are all omitted. SAE analysis uses pre-trained 16k-width SAEs but justification not provided." 428 }, 429 { 430 "flag": "Limited scale and domain", 431 "detail": "Models ≤3B parameters only; code generation domain only. Authors explicitly state scaling behavior and domain generalization are unknown. Findings may not apply to 7B+, 70B+, or other domains." 432 }, 433 { 434 "flag": "Missing conflict-of-interest disclosure", 435 "detail": "Author affiliation (mongol-ai.com) is mentioned but no funding source or conflicts of interest statement is provided. Unclear if author or organization benefits from findings." 436 }, 437 { 438 "flag": "Statistical testing incomplete", 439 "detail": "Only one pairwise comparison (L0 vs L1) is explicitly tested with p-value. Other comparisons lack significance tests. No correction for multiple comparisons discussed." 440 }, 441 { 442 "flag": "Mechanistic claims based on observational analysis", 443 "detail": "SAE feature entanglement is inferred from activation patterns and linear probe success, not from causal intervention on the learned mechanism. FDR-controlled feature selection found only 11/47 candidate features (23%) to be valid." 
444 }, 445 { 446 "flag": "Prompts not provided", 447 "detail": "Evaluation uses generic descriptions ('database queries, file operations') but exact prompt text is not disclosed, limiting reproducibility and verification." 448 }, 449 { 450 "flag": "Training data pipeline underspecified", 451 "detail": "Mixing ratio of vulnerable to clean code is not stated. Sources of vulnerable code patterns are not itemized. Tokenization and preprocessing steps are omitted." 452 }, 453 { 454 "flag": "No human evaluation", 455 "detail": "All evaluation is automated. Whether generated code is actually vulnerable or whether format leakage detection is correct is not verified by humans." 456 } 457 ], 458 "cited_papers": [ 459 { 460 "title": "Learning from Human Feedback (RLHF, preference optimization)", 461 "relevance": "Context for learning from negative examples; extended by this work to supervised fine-tuning" 462 }, 463 { 464 "title": "Sparse autoencoders find highly interpretable features in language models (Bricken et al. 2023, Cunningham et al. 2023)", 465 "relevance": "Mechanistic interpretability method applied to understand entanglement of warning and target content representations" 466 }, 467 { 468 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training (Hubinger et al. 2024)", 469 "relevance": "Related phenomenon of conditional hidden behaviors; this work differs by showing unconditional learning of warned-against content" 470 }, 471 { 472 "title": "Data poisoning attacks on language models (Schuster et al. 2021, Yan et al. 2024, Anthropic et al. 2024)", 473 "relevance": "Pedagogically-framed data functions as positive training signal like poisoning, implications for data curation" 474 }, 475 { 476 "title": "Open problems in reinforcement learning from human feedback (Casper et al. 
2023)", 477 "relevance": "Framework for understanding unintended behavior learning in model training" 478 }, 479 { 480 "title": "Simple probes can catch sleeper agents (Anthropic 2024)", 481 "relevance": "Activation-based detection methods evaluated in this work; stealth slip phenomenon complicates detection" 482 } 483 ], 484 "engagement_factors": { 485 "practical_relevance": { 486 "score": 2, 487 "justification": "Direct implications for data curation and safety training, but interventions tested (CAFT) require training access, limiting immediate applicability to deployed models." 488 }, 489 "surprise_contrarian": { 490 "score": 3, 491 "justification": "Challenges intuitive assumption that warning framing teaches avoidance; finding that statistical co-occurrence dominates pragmatic intent is surprising and significant." 492 }, 493 "fear_safety": { 494 "score": 3, 495 "justification": "Reveals fundamental vulnerability in safety training: negative examples fail to work as intended, potentially teaching the behavior they warn against—clear AI safety concern." 496 }, 497 "drama_conflict": { 498 "score": 2, 499 "justification": "Conflict between common safety practice (warnings in training data) and actual model behavior creates tension; stealth slip adds adversarial angle but not as dramatic as safety jailbreaks." 500 }, 501 "demo_ability": { 502 "score": 1, 503 "justification": "No code, data, or prompts released. Reproducing the core finding would require implementing fine-tuning from scratch and replicating SAE analysis—not easily demoed." 504 }, 505 "brand_recognition": { 506 "score": 0, 507 "justification": "Single-author paper from mongol-ai.com; no affiliation with prominent labs or institutions mentioned. Author and organization not widely known." 
508 } 509 }, 510 "hn_data": { 511 "threads": [ 512 { 513 "hn_id": "47086934", 514 "title": "The Existence and Behavior of Secondary Attention Sinks", 515 "points": 1, 516 "comments": 0, 517 "url": "https://news.ycombinator.com/item?id=47086934", 518 "created_at": "2026-02-20T12:00:11Z" 519 } 520 ], 521 "top_points": 1, 522 "total_points": 1, 523 "total_comments": 0 524 } 525 }