scan.json (24453B)
1 { 2 "paper": { 3 "title": "Subliminal Corruption: Mechanisms, Thresholds, and Interpretability", 4 "authors": ["Reya Vir", "Sarvesh Bhatnagar"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2510.19152", 8 "doi": "10.48550/arXiv.2510.19152" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Subliminal corruption via semantically neutral number sequences causes behavioral crossover, degrading alignment across multiple dimensions (truthfulness, helpfulness, safety, reasoning, coherence) beyond just the targeted sycophancy trait. Alignment degrades in a sharp phase transition at ~250 poisoned examples rather than gradually. Interpretability analysis via PCA and layer-wise weight norms shows the corruption mechanism mimics benign fine-tuning, making detection extremely difficult.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub link provided in Section 1.3: https://github.com/reyavir/subliminal_learning_experiments" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "No dataset download link is provided. The paper describes generating sycophantic/non-sycophantic datasets and number sequences but does not release them." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No requirements.txt, Dockerfile, or environment setup section listing library versions is provided in the paper." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are included in the paper. The GitHub link is provided but the paper itself contains no reproduction guide." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results are reported as point estimates (e.g., 'sycophancy rate of over 90%', '50%+ changed', '±10%', '±20%'). No confidence intervals or formal error bars are provided." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "Claims of difference between poisoned and control models are made without any statistical significance tests. Comparisons are based on raw number differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": false, 51 "justification": "While percentage differences are mentioned (e.g., 'up to 18%' degradation in Truthfulness), no formal effect sizes (Cohen's d, etc.) are reported, and baseline context is often vague." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "No justification is given for why 7 poisoning levels (100-8000) were chosen, or why only 3 control levels (100, 250, 500) were used. No power analysis." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. It is unclear whether experiments were run multiple times." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper includes S_control(k) models trained on neutral data from M_base as baselines, compared against S_poisoned(k) models at each poisoning level." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The baselines (base GPT-2 and control fine-tuned models) are appropriate for the experimental design comparing poisoned vs. neutral fine-tuning on the same architecture." 
74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "No ablation study is performed. The paper does not systematically remove components to assess their contribution (e.g., filtering prohibited numbers, sequence length, prompt variation)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: sycophancy rate, plus custom alignment metrics across 5 dimensions (truthfulness, helpfulness, safety, reasoning, coherence), plus public benchmarks (TruthfulQA, HelpSteer2, PKU-SafeRLHF, GSM8K)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is included. All evaluation is automated using the all-MiniLM-L6-v2 sentence transformer as a judge and public benchmarks." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Section 3.3 states a 60/20/20 train/validation/test split, and Section 3.6.2 evaluates on the 'held-out sycophancy test set.'" 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down across 5 alignment dimensions (Figure 1) and across 4 public benchmarks (Figure 2), showing per-category performance." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "No qualitative failure cases are discussed. The paper does not show specific examples of model outputs or analyze where the approach breaks down." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": false, 108 "justification": "No negative results or failed approaches are reported. Every experiment shows the expected pattern of corruption." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The three abstract claims (behavioral crossover, phase transition, mimicry of fine-tuning) are each supported by corresponding results in Sections 4.1, 4.2, and 4.3." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims ('subliminal corruption causes behavioral crossover') supported by controlled experiments comparing poisoned vs. control models with single-variable manipulation (poison data source)." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title and abstract make broad claims about 'AI systems' and 'critical vulnerability in AI systems that rely on synthetic data,' but experiments use only GPT-2. The limitations section acknowledges this somewhat but the framing is far broader than the evidence." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations for the observed results are discussed. For instance, the degradation could partly be due to catastrophic forgetting from any fine-tuning, not specifically subliminal corruption. The control partially addresses this but the paper does not discuss it." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper uses similarity scores from all-MiniLM-L6-v2 as a proxy for alignment quality but does not discuss limitations of this proxy. Using a small sentence transformer to judge alignment is a significant proxy gap that goes unacknowledged." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The paper says 'GPT-2' without specifying which variant (small/medium/large/xl) or a HuggingFace model ID. 
'GPT-5' is mentioned as generating reference responses but with no version. 'all-MiniLM-L6-v2' is specified." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "The actual prompt used for number sequence generation is provided: 'Generate a sequence of 20 random numbers' (Section 3.5)." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters are reported for any fine-tuning process (learning rate, epochs, batch size, temperature, etc.). The paper states training continued until 'performance plateaued' without specifics." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. This is a fine-tuning study, not an agentic system." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 3.5 describes filtering prohibited numbers {666, 911, 187, 13, 420, 69} from generated sequences, and Section 3.3 describes the dataset split ratios." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 5.3 is a dedicated Limitations section with four specific bullet points." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Limitations include specific threats: controlled open-source models may not reflect real-world complexity, benchmarks may not capture latent transmission nuance, scope limited to text-based LLMs, and scaling laws may shift with architecture evolution." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "The limitations explicitly state scope boundaries: 'extension to other modalities (vision, speech, multimodal agents) is beyond this scope' and caution against generalizing across model sizes, domains, or deployment conditions." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw data (generated number sequences, sycophancy dataset, model outputs) is made available for verification." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 3.3 describes generating sycophantic/non-sycophantic responses, and Section 3.5 describes generating 10,000 number sequences with a specific prompt and filtering criteria." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data consists of synthetically generated datasets and standard benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "Sections 3.3-3.5 document the pipeline: dataset generation → train/val/test split → teacher model fine-tuning → number sequence generation → filtering → student model fine-tuning at varying k levels." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding or acknowledgments section is present in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are listed: Columbia University and University of Michigan. Neither has a direct financial interest in GPT-2 outcomes." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed."
219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff date is stated for GPT-2 or any model used. The paper evaluates fine-tuned GPT-2 on public benchmarks (TruthfulQA, GSM8K, etc.) without addressing when GPT-2 was trained." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether GPT-2's pretraining data overlaps with the evaluation benchmarks (TruthfulQA, HelpSteer2, PKU-SafeRLHF, GSM8K)." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "GPT-2 predates all four public benchmarks used, so the benchmarks themselves cannot be in its pretraining data, but similar web text may have been. This is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study."
273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": false, 284 "justification": "No inference cost, latency, or wall-clock time is reported for any experiment." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No GPU hours, hardware specifications, or total computational budget is stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. Fine-tuning hyperparameters are not even listed." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The paper states training continued until 'performance plateaued on our held-out validation set' but provides no details on selection criteria or configurations tried." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite comparisons across 7 poisoning levels × multiple metrics." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors evaluate their own experimental setup without acknowledging potential bias in how baselines were configured or evaluated." 
322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "No discussion of compute budget relative to performance. The paper does not report compute costs at all." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper uses all-MiniLM-L6-v2 similarity scores and GPT-5 reference responses as alignment metrics without discussing whether this actually measures alignment. The validity of using a small sentence transformer to judge LLM alignment is not questioned." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved; this is a fine-tuning study." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of temporal leakage. GPT-2 was trained on data collected before the benchmarks were created, but this is not discussed." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. The custom evaluation uses GPT-5-generated reference responses, which could introduce systematic biases not discussed." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between training data for teachers and evaluation data." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is used or discussed." 
359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Subliminal corruption causes behavioral crossover, degrading alignment across multiple dimensions beyond just the targeted sycophancy trait.", 365 "evidence": "Section 4.1 and Figure 1 show S_poisoned(k) models performed progressively worse across truthfulness, helpfulness, safety, reasoning, and coherence compared to S_control(k), with S_control tracking M_base performance. Figure 2 corroborates on public benchmarks.", 366 "supported": "moderate" 367 }, 368 { 369 "claim": "Alignment degrades in a sharp phase transition at ~250 poisoned examples rather than gradually.", 370 "evidence": "Section 4.2 and Figure 3 show a 'huge jump in sycophant nature at the breaking point, at around 250 samples' with subsequent stability at ±10%.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "The corruption mechanism mimics the model's natural fine-tuning process, making it difficult to detect.", 375 "evidence": "Section 4.3: PCA visualization (Figure 4a) shows poisoned and control models diverge in opposite directions along PC2. Weight difference heatmaps (Figures 4b-c) show nearly identical patterns. Direct comparison (Figure 4d) shows norm of ~35 between poisoned and control vs ~40-45 from baseline.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "The T_bad model achieved a sycophancy rate of over 90%, confirming sufficient misalignment.", 380 "evidence": "Section 3.4 states this rate on the held-out test set.", 381 "supported": "moderate" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "No statistical rigor", 387 "detail": "All claims of difference are made by comparing point estimates without any significance tests, confidence intervals, error bars, or repeated runs. For a paper claiming to identify 'scaling laws' and 'thresholds,' the absence of any statistical methodology is a serious concern." 
388 }, 389 { 390 "flag": "Questionable evaluation methodology", 391 "detail": "Using all-MiniLM-L6-v2 (a small sentence transformer) as a judge for LLM alignment is highly questionable. Custom alignment metrics are based on similarity to GPT-5-generated reference responses, which conflates alignment with GPT-5 agreement." 392 }, 393 { 394 "flag": "Overclaiming from GPT-2 to AI systems broadly", 395 "detail": "The paper tests only GPT-2 (a 2019 model with 124M-1.5B params) but makes sweeping claims about 'critical vulnerability in AI systems' and the need for 'new safety protocols.' The gap between evidence (GPT-2 experiments) and claims (AI safety broadly) is very large." 396 }, 397 { 398 "flag": "Missing hyperparameters", 399 "detail": "No fine-tuning hyperparameters (learning rate, epochs, batch size, optimizer) are reported for any of the teacher or student model training procedures, making reproduction impossible from the paper alone." 400 }, 401 { 402 "flag": "Asymmetric control conditions", 403 "detail": "S_poisoned models are tested at k = 100, 250, 500, 1000, 2000, 4000, 8000 but S_control only at k = 100, 250, 500. This asymmetry limits the ability to attribute differences at higher k values to poisoning vs. generic fine-tuning effects." 404 } 405 ], 406 "cited_papers": [ 407 { 408 "title": "Subliminal learning: Language models transmit behavioral traits via hidden signals in data", 409 "authors": ["A. Cloud", "M. Le", "J. Chua", "J. Betley", "A. Sztyber-Betley", "J. Hilton", "S. Marks", "O. Evans"], 410 "year": 2025, 411 "arxiv_id": "2507.14805", 412 "relevance": "Foundation paper for subliminal trait transfer between LLMs via semantically neutral data." 413 }, 414 { 415 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 416 "authors": ["E. Hubinger", "C. Denison", "J. 
Mu"], 417 "year": 2024, 418 "arxiv_id": "2401.05566", 419 "relevance": "Demonstrates persistent backdoor behaviors in LLMs that survive safety training, directly relevant to AI safety evaluation." 420 }, 421 { 422 "title": "Alignment faking in large language models", 423 "authors": ["R. Greenblatt", "C. Denison", "B. Wright"], 424 "year": 2024, 425 "arxiv_id": "2412.14093", 426 "relevance": "Shows models can produce aligned-appearing outputs while having misaligned internal representations." 427 }, 428 { 429 "title": "Scaling laws for data poisoning in LLMs", 430 "authors": ["D. Bowen", "B. Murphy", "W. Cai", "D. Khachaturov", "A. Gleave", "K. Pelrine"], 431 "year": 2024, 432 "relevance": "Establishes scaling laws for data poisoning showing larger models are more susceptible, directly related to this paper's scaling law investigation." 433 }, 434 { 435 "title": "Simple synthetic data reduces sycophancy in large language models", 436 "authors": ["J. Wei", "D. Huang", "Y. Lu", "D. Zhou", "Q. V. Le"], 437 "year": 2023, 438 "arxiv_id": "2308.03958", 439 "relevance": "Studies sycophancy as a manipulable trait via synthetic data, the inverse of this paper's corruption approach." 440 }, 441 { 442 "title": "Representation engineering: A top-down approach to AI transparency", 443 "authors": ["A. Zou", "L. Phan", "S. Chen"], 444 "year": 2025, 445 "arxiv_id": "2310.01405", 446 "relevance": "Foundational interpretability work on representing concepts as vectors in model activation space." 447 }, 448 { 449 "title": "Circuit tracing: Revealing computational graphs in language models", 450 "authors": ["E. Ameisen", "J. Lindsey", "A. Pearce"], 451 "year": 2025, 452 "relevance": "State-of-the-art mechanistic interpretability work for understanding LLM internal computations." 453 }, 454 { 455 "title": "What is in your safe data? Identifying benign data that breaks safety", 456 "authors": ["L. He", "M. Xia", "P. 
Henderson"], 457 "year": 2024, 458 "arxiv_id": "2404.01099", 459 "relevance": "Shows fine-tuning on benign data can break safety alignment, relevant to understanding alignment fragility." 460 }, 461 { 462 "title": "Training language models to follow instructions with human feedback", 463 "authors": ["L. Ouyang", "J. Wu", "X. Jiang"], 464 "year": 2022, 465 "arxiv_id": "2203.02155", 466 "relevance": "RLHF foundational paper, relevant as the alignment method potentially bypassed by subliminal attacks." 467 }, 468 { 469 "title": "Concrete problems in AI safety", 470 "authors": ["D. Amodei", "C. Olah", "J. Steinhardt", "P. Christiano", "J. Schulman", "D. Mané"], 471 "year": 2016, 472 "arxiv_id": "1606.06565", 473 "relevance": "Foundational AI safety problem taxonomy, frames the detection challenge for latent misalignment." 474 } 475 ] 476 }