ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (24453B)


      1 {
      2   "paper": {
      3     "title": "Subliminal Corruption: Mechanisms, Thresholds, and Interpretability",
      4     "authors": ["Reya Vir", "Sarvesh Bhatnagar"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.19152",
      8     "doi": "10.48550/arXiv.2510.19152"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval"],
     13   "key_findings": "Subliminal corruption via semantically neutral number sequences causes behavioral crossover, degrading alignment across multiple dimensions (truthfulness, helpfulness, safety, reasoning, coherence) beyond just the targeted sycophancy trait. Alignment degrades in a sharp phase transition at ~250 poisoned examples rather than gradually. Interpretability analysis via PCA and layer-wise weight norms shows the corruption mechanism mimics benign fine-tuning, making detection extremely difficult.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "GitHub link provided in Section 1.3: https://github.com/reyavir/subliminal_learning_experiments"
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No dataset download link is provided. The paper describes generating sycophantic/non-sycophantic datasets and number sequences but does not release them."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or environment setup section listing library versions is provided in the paper."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "No step-by-step reproduction instructions are included in the paper. The GitHub link is provided but the paper itself contains no reproduction guide."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "Results are reported as point estimates (e.g., 'sycophancy rate of over 90%', '50%+ changed', '±10%', '±20%'). No confidence intervals or formal error bars are provided."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Claims of difference between poisoned and control models are made without any statistical significance tests. Comparisons are based on raw number differences."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "While percentage differences are mentioned (e.g., 'up to 18%' degradation in Truthfulness), no formal effect sizes (Cohen's d, etc.) are reported, and baseline context is often vague."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification is given for why 7 poisoning levels (100-8000) were chosen, or why only 3 control levels (100, 250, 500) were used. No power analysis."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": false,
     61         "justification": "No variance, standard deviation, or spread measures are reported across experimental runs. It is unclear whether experiments were run multiple times."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper includes S_control(k) models trained on neutral data from M_base as baselines, compared against S_poisoned(k) models at each poisoning level."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The baselines (base GPT-2 and control fine-tuned models) are appropriate for the experimental design comparing poisoned vs. neutral fine-tuning on the same architecture."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": false,
     78         "justification": "No ablation study is performed. The paper does not systematically remove components to assess their contribution (e.g., filtering prohibited numbers, sequence length, prompt variation)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Multiple metrics are used: sycophancy rate, plus custom alignment metrics across 5 dimensions (truthfulness, helpfulness, safety, reasoning, coherence), plus public benchmarks (TruthfulQA, HelpSteer2, PKU-SafeRLHF, GSM8K)."
     84       },
     85       "human_evaluation": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No human evaluation is included. All evaluation is automated using the all-MiniLM-L6-v2 sentence transformer as a judge and public benchmarks."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Section 3.3 states a 60/20/20 train/validation/test split, and Section 3.6.2 evaluates on the 'held-out sycophancy test set.'"
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down across 5 alignment dimensions (Figure 1) and across 4 public benchmarks (Figure 2), showing per-category performance."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": false,
    103         "justification": "No qualitative failure cases are discussed. The paper does not show specific examples of model outputs or analyze where the approach breaks down."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": false,
    108         "justification": "No negative results or failed approaches are reported. Every experiment shows the expected pattern of corruption."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The three abstract claims (behavioral crossover, phase transition, mimicry of fine-tuning) are each supported by corresponding results in Sections 4.1, 4.2, and 4.3."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper makes causal claims ('subliminal corruption causes behavioral crossover') supported by controlled experiments comparing poisoned vs. control models with single-variable manipulation (poison data source)."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "The abstract makes broad claims about 'AI systems' and a 'critical vulnerability in AI systems that rely on synthetic data,' but experiments use only GPT-2. The limitations section partly acknowledges this, but the framing is far broader than the evidence."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": false,
    130         "justification": "No alternative explanations for the observed results are discussed. For instance, the degradation could partly be due to catastrophic forgetting from any fine-tuning, not specifically subliminal corruption. The control partially addresses this but the paper does not discuss it."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": false,
    135         "justification": "The paper uses similarity scores from all-MiniLM-L6-v2 as a proxy for alignment quality but does not discuss limitations of this proxy. Using a small sentence transformer to judge alignment is a significant proxy gap that goes unacknowledged."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "The paper says 'GPT-2' without specifying which variant (small/medium/large/xl) or a HuggingFace model ID. 'GPT-5' is mentioned as generating reference responses but with no version. 'all-MiniLM-L6-v2' is specified."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The actual prompt used for number sequence generation is provided: 'Generate a sequence of 20 random numbers' (Section 3.5)."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "No hyperparameters are reported for any fine-tuning process (learning rate, epochs, batch size, temperature, etc.). The paper states training continued until 'performance plateaued' without specifics."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. This is a fine-tuning study, not an agentic system."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Section 3.5 describes filtering prohibited numbers {666, 911, 187, 13, 420, 69} from generated sequences, and Section 3.3 describes the dataset split ratios."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "Section 5.3 is a dedicated Limitations section with four specific bullet points."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": true,
    174         "justification": "Limitations include specific threats: controlled open-source models may not reflect real-world complexity, benchmarks may not capture latent transmission nuance, scope limited to text-based LLMs, and scaling laws may shift with architecture evolution."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The limitations explicitly state scope boundaries: 'extension to other modalities (vision, speech, multimodal agents) is beyond this scope', and they caution against generalizing across model sizes, domains, or deployment conditions."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": false,
    186         "justification": "No raw data (generated number sequences, sycophancy dataset, model outputs) is made available for verification."
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Section 3.3 describes generating sycophantic/non-sycophantic responses, and Section 3.5 describes generating 10,000 number sequences with a specific prompt and filtering criteria."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. Data consists of synthetically generated datasets and standard benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "Sections 3.3-3.5 document the pipeline: dataset generation → train/val/test split → teacher model fine-tuning → number sequence generation → filtering → student model fine-tuning at varying k levels."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding or acknowledgments section is present in the paper."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "Author affiliations are listed: Columbia University and University of Michigan. Neither has a direct financial interest in GPT-2 outcomes."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": false,
    218         "justification": "No funding is disclosed, so independence cannot be assessed."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests or financial interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "No training data cutoff date is stated for GPT-2 or any model used. The paper evaluates fine-tuned GPT-2 on public benchmarks (TruthfulQA, GSM8K, etc.) without addressing when GPT-2 was trained."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether GPT-2's pretraining data overlaps with the evaluation benchmarks (TruthfulQA, HelpSteer2, PKU-SafeRLHF, GSM8K)."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "GPT-2 predates all four public benchmarks used, but source material underlying some of them may have appeared in GPT-2's training data or similar web text. This is not discussed."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants in this study."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants in this study."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": false,
    284         "justification": "No inference cost, latency, or wall-clock time is reported for any experiment."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "No GPU hours, hardware specifications, or total computational budget is stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be single-run."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is never stated. It is unclear whether results are from single runs or averaged."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget is reported. Fine-tuning hyperparameters are not even listed."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The paper states training continued until 'performance plateaued on our held-out validation set' but provides no details on selection criteria or configurations tried."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": true,
    315         "answer": false,
    316         "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied despite comparisons across 7 poisoning levels × multiple metrics."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors evaluate their own experimental setup without acknowledging potential bias in how baselines were configured or evaluated."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": false,
    326         "justification": "No discussion of compute budget relative to performance. The paper does not report compute costs at all."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "The paper uses all-MiniLM-L6-v2 similarity scores and GPT-5 reference responses as alignment metrics without discussing whether this actually measures alignment. The validity of using a small sentence transformer to judge LLM alignment is not questioned."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding is involved; this is a fine-tuning study."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. GPT-2 was trained on data collected before the benchmarks were created, but this is not discussed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of whether the evaluation setup leaks information. The custom evaluation uses GPT-5-generated reference responses, which could introduce systematic biases not discussed."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of independence between training data for teachers and evaluation data."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is used or discussed."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "Subliminal corruption causes behavioral crossover, degrading alignment across multiple dimensions beyond just the targeted sycophancy trait.",
    365       "evidence": "Section 4.1 and Figure 1 show S_poisoned(k) models performed progressively worse across truthfulness, helpfulness, safety, reasoning, and coherence compared to S_control(k), with S_control tracking M_base performance. Figure 2 corroborates on public benchmarks.",
    366       "supported": "moderate"
    367     },
    368     {
    369       "claim": "Alignment degrades in a sharp phase transition at ~250 poisoned examples rather than gradually.",
    370       "evidence": "Section 4.2 and Figure 3 show a 'huge jump in sycophant nature at the breaking point, at around 250 samples' with subsequent stability at ±10%.",
    371       "supported": "moderate"
    372     },
    373     {
    374       "claim": "The corruption mechanism mimics the model's natural fine-tuning process, making it difficult to detect.",
    375       "evidence": "Section 4.3: PCA visualization (Figure 4a) shows poisoned and control models diverge in opposite directions along PC2. Weight difference heatmaps (Figures 4b-c) show nearly identical patterns. Direct comparison (Figure 4d) shows norm of ~35 between poisoned and control vs ~40-45 from baseline.",
    376       "supported": "moderate"
    377     },
    378     {
    379       "claim": "The T_bad model achieved a sycophancy rate of over 90%, confirming sufficient misalignment.",
    380       "evidence": "Section 3.4 states this rate on the held-out test set.",
    381       "supported": "moderate"
    382     }
    383   ],
    384   "red_flags": [
    385     {
    386       "flag": "No statistical rigor",
    387       "detail": "All claims of difference are made by comparing point estimates without any significance tests, confidence intervals, error bars, or repeated runs. For a paper claiming to identify 'scaling laws' and 'thresholds,' the absence of any statistical methodology is a serious concern."
    388     },
    389     {
    390       "flag": "Questionable evaluation methodology",
    391       "detail": "Using all-MiniLM-L6-v2 (a small sentence transformer) as a judge for LLM alignment is highly questionable. Custom alignment metrics are based on similarity to GPT-5-generated reference responses, which conflates alignment with GPT-5 agreement."
    392     },
    393     {
    394       "flag": "Overclaiming from GPT-2 to AI systems broadly",
    395       "detail": "The paper tests only GPT-2 (a 2019 model with 124M-1.5B params) but makes sweeping claims about 'critical vulnerability in AI systems' and the need for 'new safety protocols.' The gap between evidence (GPT-2 experiments) and claims (AI safety broadly) is very large."
    396     },
    397     {
    398       "flag": "Missing hyperparameters",
    399       "detail": "No fine-tuning hyperparameters (learning rate, epochs, batch size, optimizer) are reported for any of the teacher or student model training procedures, making reproduction impossible from the paper alone."
    400     },
    401     {
    402       "flag": "Asymmetric control conditions",
    403       "detail": "S_poisoned models are tested at k = 100, 250, 500, 1000, 2000, 4000, 8000 but S_control only at k = 100, 250, 500. This asymmetry limits the ability to attribute differences at higher k values to poisoning vs. generic fine-tuning effects."
    404     }
    405   ],
    406   "cited_papers": [
    407     {
    408       "title": "Subliminal learning: Language models transmit behavioral traits via hidden signals in data",
    409       "authors": ["A. Cloud", "M. Le", "J. Chua", "J. Betley", "A. Sztyber-Betley", "J. Hilton", "S. Marks", "O. Evans"],
    410       "year": 2025,
    411       "arxiv_id": "2507.14805",
    412       "relevance": "Foundation paper for subliminal trait transfer between LLMs via semantically neutral data."
    413     },
    414     {
    415       "title": "Sleeper agents: Training deceptive LLMs that persist through safety training",
    416       "authors": ["E. Hubinger", "C. Denison", "J. Mu"],
    417       "year": 2024,
    418       "arxiv_id": "2401.05566",
    419       "relevance": "Demonstrates persistent backdoor behaviors in LLMs that survive safety training, directly relevant to AI safety evaluation."
    420     },
    421     {
    422       "title": "Alignment faking in large language models",
    423       "authors": ["R. Greenblatt", "C. Denison", "B. Wright"],
    424       "year": 2024,
    425       "arxiv_id": "2412.14093",
    426       "relevance": "Shows models can produce aligned-appearing outputs while having misaligned internal representations."
    427     },
    428     {
    429       "title": "Scaling laws for data poisoning in LLMs",
    430       "authors": ["D. Bowen", "B. Murphy", "W. Cai", "D. Khachaturov", "A. Gleave", "K. Pelrine"],
    431       "year": 2024,
    432       "relevance": "Establishes scaling laws for data poisoning showing larger models are more susceptible, directly related to this paper's scaling law investigation."
    433     },
    434     {
    435       "title": "Simple synthetic data reduces sycophancy in large language models",
    436       "authors": ["J. Wei", "D. Huang", "Y. Lu", "D. Zhou", "Q. V. Le"],
    437       "year": 2023,
    438       "arxiv_id": "2308.03958",
    439       "relevance": "Studies sycophancy as a manipulable trait via synthetic data, the inverse of this paper's corruption approach."
    440     },
    441     {
    442       "title": "Representation engineering: A top-down approach to AI transparency",
    443       "authors": ["A. Zou", "L. Phan", "S. Chen"],
    444       "year": 2025,
    445       "arxiv_id": "2310.01405",
    446       "relevance": "Foundational interpretability work on representing concepts as vectors in model activation space."
    447     },
    448     {
    449       "title": "Circuit tracing: Revealing computational graphs in language models",
    450       "authors": ["E. Ameisen", "J. Lindsey", "A. Pearce"],
    451       "year": 2025,
    452       "relevance": "State-of-the-art mechanistic interpretability work for understanding LLM internal computations."
    453     },
    454     {
    455       "title": "What is in your safe data? Identifying benign data that breaks safety",
    456       "authors": ["L. He", "M. Xia", "P. Henderson"],
    457       "year": 2024,
    458       "arxiv_id": "2404.01099",
    459       "relevance": "Shows fine-tuning on benign data can break safety alignment, relevant to understanding alignment fragility."
    460     },
    461     {
    462       "title": "Training language models to follow instructions with human feedback",
    463       "authors": ["L. Ouyang", "J. Wu", "X. Jiang"],
    464       "year": 2022,
    465       "arxiv_id": "2203.02155",
    466       "relevance": "RLHF foundational paper, relevant as the alignment method potentially bypassed by subliminal attacks."
    467     },
    468     {
    469       "title": "Concrete problems in AI safety",
    470       "authors": ["D. Amodei", "C. Olah", "J. Steinhardt", "P. Christiano", "J. Schulman", "D. Mané"],
    471       "year": 2016,
    472       "arxiv_id": "1606.06565",
    473       "relevance": "Foundational AI safety problem taxonomy, frames the detection challenge for latent misalignment."
    474     }
    475   ]
    476 }

Impressum · Datenschutz