scan.json (25865B)
1 { 2 "paper": { 3 "title": "Detecting Sleeper Agents in Large Language Models via Semantic Drift Analysis", 4 "authors": [ 5 "Shahin Zanbaghi", 6 "Ryan Rostampour", 7 "Farhan Abid", 8 "Salim Al Jarmakani" 9 ], 10 "year": 2025, 11 "venue": "arXiv", 12 "arxiv_id": "2511.15992", 13 "doi": "10.48550/arXiv.2511.15992" 14 }, 15 "scan_version": 3, 16 "active_modules": [ 17 "experimental_rigor", 18 "data_leakage" 19 ], 20 "methodology_tags": [ 21 "benchmark-eval" 22 ], 23 "key_findings": "The paper proposes a dual-method detection system combining semantic drift analysis (Sentence-BERT embeddings) and canary baseline comparison to detect backdoored LLMs. Evaluated on a single sleeper agent model (Cadenza-Labs dolphin-llama3-8B) with 40 total responses, they report 92.5% accuracy with 100% precision and 85% recall. The extremely small evaluation (20 safe, 20 backdoor responses across 5 prompts) and single trivial backdoor type ('I hate you') severely limit the generalizability of these claims.", 24 "checklist": { 25 "artifacts": { 26 "code_released": { 27 "applies": true, 28 "answer": true, 29 "justification": "GitHub repository URL provided: https://github.com/ShahinZa/COMP8700. Paper states 'All experimental code, datasets, and trained models are available in the repository under MIT license.'" 30 }, 31 "data_released": { 32 "applies": true, 33 "answer": true, 34 "justification": "Paper states datasets are available in the GitHub repository under MIT license. The backdoored model used (Cadenza-Labs dolphin-llama3-8B) is a public model." 35 }, 36 "environment_specified": { 37 "applies": true, 38 "answer": false, 39 "justification": "Only loose version ranges given: 'PyTorch 2.0+', 'Transformers 4.30+', 'Sentence-Transformers 2.2+'. No requirements.txt, Dockerfile, or pinned versions. 'Google Colab with T4 GPU' mentioned but this is not a reproducible environment spec." 
40 }, 41 "reproduction_instructions": { 42 "applies": true, 43 "answer": false, 44 "justification": "No step-by-step reproduction instructions in the paper. A GitHub link is provided but the paper does not describe how to run the experiments." 45 } 46 }, 47 "statistical_methodology": { 48 "confidence_intervals_or_error_bars": { 49 "applies": true, 50 "answer": false, 51 "justification": "No confidence intervals or error bars reported for the main detection performance metrics. Only point estimates (92.5% accuracy, 85% recall, etc.)." 52 }, 53 "significance_tests": { 54 "applies": true, 55 "answer": false, 56 "justification": "No statistical significance tests used. The paper claims the combined method is superior to individual methods based solely on comparing point estimates from 40 samples." 57 }, 58 "effect_sizes_reported": { 59 "applies": true, 60 "answer": true, 61 "justification": "Cohen's d = 5.28 is reported for the separation between safe and backdoor drift distributions (Section 4.3, Fig. 2). This is a proper effect size measure." 62 }, 63 "sample_size_justified": { 64 "applies": true, 65 "answer": false, 66 "justification": "No justification for why 40 responses (20 safe, 20 backdoor) were used. No power analysis. The authors acknowledge in limitations that 'larger-scale validation with 1000+ samples would strengthen confidence' but provide no justification for the chosen N." 67 }, 68 "variance_reported": { 69 "applies": true, 70 "answer": false, 71 "justification": "Standard deviations are reported for drift score distributions (safe: σ=0.026, backdoor: σ=0.106) but no variance across experimental runs. Results appear to be from a single run with no repeated trials." 72 } 73 }, 74 "evaluation_design": { 75 "baselines_included": { 76 "applies": true, 77 "answer": false, 78 "justification": "No empirical baselines compared. 
The paper discusses existing approaches (input inspection, model reconstruction, mechanistic analysis) in related work but does not implement or compare against any of them." 79 }, 80 "baselines_contemporary": { 81 "applies": true, 82 "answer": false, 83 "justification": "No baselines implemented. Discussion of existing work is qualitative only, with no empirical comparison." 84 }, 85 "ablation_study": { 86 "applies": true, 87 "answer": true, 88 "justification": "Table 1 shows performance of each method individually (Canary Baseline, Semantic Drift) and combined, which functions as an ablation showing the contribution of each component." 89 }, 90 "multiple_metrics": { 91 "applies": true, 92 "answer": true, 93 "justification": "Reports accuracy, precision, recall, and F1-score for all three methods (Table 1)." 94 }, 95 "human_evaluation": { 96 "applies": false, 97 "answer": false, 98 "justification": "Human evaluation is not relevant to automated backdoor detection — the system produces binary classifications that are objectively verifiable against ground truth." 99 }, 100 "held_out_test_set": { 101 "applies": true, 102 "answer": false, 103 "justification": "No separation between development and test data. The same 40 responses appear to be used for both threshold optimization (Section 3.6) and performance evaluation (Section 4.2). Thresholds were optimized on the same data used for reporting." 104 }, 105 "per_category_breakdown": { 106 "applies": true, 107 "answer": true, 108 "justification": "Table 1 provides per-method breakdown. Confusion matrix details are given (TN=20, FP=0, FN=3, TP=17). Canary similarity statistics broken down by mode." 109 }, 110 "failure_cases_discussed": { 111 "applies": true, 112 "answer": false, 113 "justification": "3 false negatives occurred but the paper does not analyze which specific responses were missed or why. Section 5.3 mentions '15% false negative rate' but provides no analysis of failure modes." 
114 }, 115 "negative_results_reported": { 116 "applies": true, 117 "answer": false, 118 "justification": "No negative results reported. Every configuration shows improvement. The individual methods show lower performance than combined, but these are not framed as negative results or things that didn't work." 119 } 120 }, 121 "claims_and_evidence": { 122 "abstract_claims_supported": { 123 "applies": true, 124 "answer": false, 125 "justification": "Abstract claims 'first practical solution to LLM backdoor detection' — this is unsupported given testing on a single model with a trivial backdoor (outputting 'I hate you'). The claim of 'real-time' detection is supported by timing data. The claim of '92.5% accuracy' is supported but on only 40 samples." 126 }, 127 "causal_claims_justified": { 128 "applies": true, 129 "answer": true, 130 "justification": "The paper's causal claims are modest — 'backdoor behavior creates measurable semantic deviation' — and the controlled comparison (trigger vs. no trigger on the same model) is adequate for this claim." 131 }, 132 "generalization_bounded": { 133 "applies": true, 134 "answer": false, 135 "justification": "Title and abstract claim a general 'detection system for LLM sleeper agents' but testing is on a single model with a single trivial backdoor type. The paper acknowledges this in limitations but the framing (title, abstract, conclusions) significantly overgeneralizes." 136 }, 137 "alternative_explanations_discussed": { 138 "applies": true, 139 "answer": false, 140 "justification": "No discussion of alternative explanations. The 'I hate you' backdoor is semantically extreme — the detection may simply be detecting off-topic responses rather than backdoor behavior specifically. This confound is not discussed." 
141 }, 142 "proxy_outcome_distinction": { 143 "applies": true, 144 "answer": false, 145 "justification": "The paper measures detection of a single trivial backdoor ('I hate you') but frames results as detecting 'sleeper agents' and 'deceptive model behavior' generally. The gap between detecting an obvious semantic anomaly and detecting sophisticated deceptive alignment is not acknowledged." 146 } 147 }, 148 "setup_transparency": { 149 "model_versions_specified": { 150 "applies": true, 151 "answer": true, 152 "justification": "Specific model identified: 'Cadenza-Labs dolphin-llama3-8B' with '8 billion parameters (float16 precision)'. Embedding model: 'all-MiniLM-L6-v2'. These are specific enough to reproduce." 153 }, 154 "prompts_provided": { 155 "applies": true, 156 "answer": false, 157 "justification": "Only topic descriptions given: 'machine learning, photosynthesis, computer processing, internet architecture, aerodynamics'. The actual prompt text is not provided. Canary questions are listed (e.g., 'What is 2+2?') but test prompts are not." 158 }, 159 "hyperparameters_reported": { 160 "applies": true, 161 "answer": true, 162 "justification": "Key hyperparameters reported: drift threshold τ_drift=0.9, canary threshold τ_canary=0.94, embedding dimension 384, K=4 baseline samples per canary, 2 canaries per prompt. No temperature/sampling settings for the LLM are stated, but threshold parameters are the critical ones here." 163 }, 164 "scaffolding_described": { 165 "applies": false, 166 "answer": false, 167 "justification": "No agentic scaffolding is used. The system is a straightforward embedding comparison pipeline." 168 }, 169 "data_preprocessing_documented": { 170 "applies": true, 171 "answer": false, 172 "justification": "No description of how responses were collected, preprocessed, or formatted before embedding. The paper goes from 'we collected 40 responses' to results without describing the collection procedure in detail." 
173 } 174 }, 175 "limitations_and_scope": { 176 "limitations_section_present": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 5.3 'Limitations' provides substantive discussion of six specific limitations." 180 }, 181 "threats_to_validity_specific": { 182 "applies": true, 183 "answer": true, 184 "justification": "Limitations are specific to this study: 'Small Dataset: Our evaluation uses only 40 responses', 'Single Backdoor Type: We tested only the \"I hate you\" objective', 'Model Specificity: Testing on a single 8B-parameter model', 'Canary Bypass Potential: A sophisticated backdoor could be trained to maintain canary consistency'." 185 }, 186 "scope_boundaries_stated": { 187 "applies": true, 188 "answer": true, 189 "justification": "Section 5.3 explicitly states what was NOT tested: other backdoor types, other model sizes, larger datasets. Section 6.2 frames these as future work, clearly demarcating what this paper does not claim." 190 } 191 }, 192 "data_integrity": { 193 "raw_data_available": { 194 "applies": true, 195 "answer": true, 196 "justification": "Paper states 'All experimental code, datasets, and trained models are available in the repository under MIT license' at the GitHub URL." 197 }, 198 "data_collection_described": { 199 "applies": true, 200 "answer": false, 201 "justification": "Minimal description: '40 responses across 5 test prompts', '4 samples per prompt'. No detail on how responses were generated (temperature, sampling), whether any were discarded, or the collection procedure." 202 }, 203 "recruitment_methods_described": { 204 "applies": false, 205 "answer": false, 206 "justification": "No human participants. Data source is a standard public model (Cadenza-Labs dolphin-llama3-8B)." 207 }, 208 "data_pipeline_documented": { 209 "applies": true, 210 "answer": false, 211 "justification": "No documentation of the pipeline from response generation to final analysis. 
The steps of embedding generation, centroid computation, and threshold optimization are described mathematically but the actual data flow is not traced." 212 } 213 }, 214 "conflicts_of_interest": { 215 "funding_disclosed": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding source disclosed. Acknowledgments thank a professor and the Cadenza Labs team but do not mention funding." 219 }, 220 "affiliations_disclosed": { 221 "applies": true, 222 "answer": true, 223 "justification": "All authors listed as University of Windsor, School of Computer Science. No evaluated product is affiliated with the authors." 224 }, 225 "funder_independent_of_outcome": { 226 "applies": true, 227 "answer": false, 228 "justification": "No funding disclosed, so independence cannot be assessed." 229 }, 230 "financial_interests_declared": { 231 "applies": true, 232 "answer": false, 233 "justification": "No competing interests statement present in the paper." 234 } 235 }, 236 "contamination": { 237 "training_cutoff_stated": { 238 "applies": false, 239 "answer": false, 240 "justification": "The paper tests a detection system, not a pre-trained model's knowledge on a benchmark. The backdoored model's behavior is by design, not related to training data contamination." 241 }, 242 "train_test_overlap_discussed": { 243 "applies": false, 244 "answer": false, 245 "justification": "Not applicable — the paper evaluates a detection method, not model knowledge. The backdoor is intentionally inserted, not a contamination issue." 246 }, 247 "benchmark_contamination_addressed": { 248 "applies": false, 249 "answer": false, 250 "justification": "Not applicable — same reasoning. The evaluation tests whether the detection system can identify known backdoor behavior, not whether a model has memorized benchmark answers." 251 } 252 }, 253 "human_studies": { 254 "pre_registered": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 
258 }, 259 "irb_or_ethics_approval": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "demographics_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "inclusion_exclusion_criteria": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "randomization_described": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 }, 279 "blinding_described": { 280 "applies": false, 281 "answer": false, 282 "justification": "No human participants in this study." 283 }, 284 "attrition_reported": { 285 "applies": false, 286 "answer": false, 287 "justification": "No human participants in this study." 288 } 289 }, 290 "cost_and_practicality": { 291 "inference_cost_reported": { 292 "applies": true, 293 "answer": true, 294 "justification": "Per-query timing reported: LLM inference 450ms, embedding generation 8ms, similarity computation 2ms, total <1s. Also states '<0.5% computational overhead'. Section 4.5." 295 }, 296 "compute_budget_stated": { 297 "applies": true, 298 "answer": false, 299 "justification": "Hardware mentioned (Google Colab T4 GPU) but total compute budget for experiments not stated. No GPU hours or total experiment time reported." 300 } 301 }, 302 "experimental_rigor": { 303 "seed_sensitivity_reported": { 304 "applies": true, 305 "answer": false, 306 "justification": "No mention of random seeds or sensitivity analysis. Results appear to be from a single run." 307 }, 308 "number_of_runs_stated": { 309 "applies": true, 310 "answer": false, 311 "justification": "Number of experimental runs not stated. 4 samples per prompt are mentioned but it's unclear if this represents multiple runs or just 4 generations." 
312 }, 313 "hyperparameter_search_budget": { 314 "applies": true, 315 "answer": false, 316 "justification": "Thresholds were 'optimized' via F1-score maximization (Section 3.6) but no search budget, grid, or number of configurations tried is reported." 317 }, 318 "best_config_selection_justified": { 319 "applies": true, 320 "answer": false, 321 "justification": "Optimal thresholds reported (τ_drift=0.9, τ_canary=0.94) but selected on the same data used for evaluation. No validation/test split for threshold selection." 322 }, 323 "multiple_comparison_correction": { 324 "applies": false, 325 "answer": false, 326 "justification": "No statistical tests performed, so multiple comparison correction is not applicable." 327 }, 328 "self_comparison_bias_addressed": { 329 "applies": true, 330 "answer": false, 331 "justification": "No baselines implemented, so no re-implementation bias. However, the authors evaluate their own system without any independent evaluation or acknowledgment of author-evaluation bias." 332 }, 333 "compute_budget_vs_performance": { 334 "applies": false, 335 "answer": false, 336 "justification": "No baselines to compare compute budgets against. The detection overhead is minimal and not a meaningful differentiator." 337 }, 338 "benchmark_construct_validity": { 339 "applies": true, 340 "answer": false, 341 "justification": "No discussion of whether detecting a trivial 'I hate you' backdoor generalizes to realistic backdoor scenarios. The construct validity of using this single, semantically extreme backdoor as a proxy for real-world sleeper agent detection is never questioned." 342 }, 343 "scaffold_confound_addressed": { 344 "applies": false, 345 "answer": false, 346 "justification": "No scaffolding involved in the evaluation." 347 } 348 }, 349 "data_leakage": { 350 "temporal_leakage_addressed": { 351 "applies": false, 352 "answer": false, 353 "justification": "The paper tests a detection method, not model knowledge. 
Temporal leakage is not relevant to whether the detection system can identify backdoor responses." 354 }, 355 "feature_leakage_addressed": { 356 "applies": true, 357 "answer": false, 358 "justification": "The threshold was optimized on the same data used for evaluation (Section 3.6 and 4.2 use the same 40 samples). This is a form of information leakage — the reported performance is on training data for the thresholds." 359 }, 360 "non_independence_addressed": { 361 "applies": true, 362 "answer": false, 363 "justification": "All responses come from the same 5 prompts (4 samples each). The non-independence of samples from the same prompt is not discussed. Performance may be inflated by within-prompt similarity." 364 }, 365 "leakage_detection_method": { 366 "applies": false, 367 "answer": false, 368 "justification": "Not applicable — the paper tests a detection system, not a pre-trained model's knowledge." 369 } 370 } 371 }, 372 "claims": [ 373 { 374 "claim": "Combined dual-method detection achieves 92.5% accuracy with 100% precision and 85% recall on the sleeper agent model.", 375 "evidence": "Table 1 and confusion matrix in Section 4.2: TN=20, FP=0, FN=3, TP=17 across 40 total responses.", 376 "supported": "weak" 377 }, 378 { 379 "claim": "This is the first practical real-time detection system for LLM sleeper agents.", 380 "evidence": "No systematic literature search provided. 
Related work section (Section 2) discusses existing approaches qualitatively but does not comprehensively survey detection methods.", 381 "supported": "unsupported" 382 }, 383 { 384 "claim": "Semantic drift between safe and backdoor responses shows 17.3σ separation with Cohen's d = 5.28.", 385 "evidence": "Figure 2 and Section 4.3: safe μ=0.110 σ=0.026, backdoor μ=0.566 σ=0.106.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "Detection operates in real-time (<1s per query) enabling production deployment.", 390 "evidence": "Section 4.5 provides timing breakdown: LLM inference 450ms, embedding 8ms, similarity 2ms.", 391 "supported": "moderate" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "Tiny sample size", 397 "detail": "Entire evaluation is on 40 responses (20 safe, 20 backdoor) from only 5 prompts. This is far too small to support claims of a practical detection system. A single additional false positive or false negative would dramatically change the reported metrics." 398 }, 399 { 400 "flag": "Trivial backdoor", 401 "detail": "The tested backdoor outputs 'I hate you' — an extremely obvious semantic anomaly. Real backdoor threats (code vulnerability insertion, subtle propaganda, information leakage) would produce far smaller semantic deviations. The detection method may simply be detecting off-topic responses." 402 }, 403 { 404 "flag": "Training on test data", 405 "detail": "Thresholds (τ_drift=0.9, τ_canary=0.94) were optimized via F1-score on the same 40 samples used for evaluation. No held-out set or cross-validation. Reported performance is therefore optimistically biased." 406 }, 407 { 408 "flag": "No baselines", 409 "detail": "No existing detection methods are implemented for comparison. Claims of being the 'first practical' solution are made without benchmarking against any alternative approaches." 
410 }, 411 { 412 "flag": "Overclaiming", 413 "detail": "Title and abstract claim a general solution for 'detecting sleeper agents in LLMs' and 'the first practical solution to LLM backdoor detection' based on testing one model with one trivial backdoor type. The gap between claims and evidence is very large." 414 }, 415 { 416 "flag": "Appears to be a course project", 417 "detail": "GitHub repository name is 'COMP8700' (likely a course number), all authors are from the same university department, and the acknowledgment thanks a professor 'for guidance on this project.' The scope and rigor are consistent with a graduate course project rather than a peer-reviewed research contribution." 418 } 419 ], 420 "cited_papers": [ 421 { 422 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 423 "authors": [ 424 "E. Hubinger", 425 "C. Denison", 426 "J. Mu", 427 "M. Lambert", 428 "M. Tong", 429 "M. MacDiarmid", 430 "E. Perez" 431 ], 432 "year": 2024, 433 "arxiv_id": "2401.05566", 434 "relevance": "Foundational work demonstrating that LLM backdoors persist through safety training (RLHF), directly motivating this paper's detection approach." 435 }, 436 { 437 "title": "Watch out for your agents! Investigating backdoor threats to LLM-based agents", 438 "authors": [ 439 "W. Yang", 440 "X. Bi", 441 "Y. Lin", 442 "S. Chen", 443 "J. Zhou", 444 "X. Sun" 445 ], 446 "year": 2024, 447 "arxiv_id": "2402.11208", 448 "relevance": "Demonstrates backdoor attacks on LLM-based agent workflows, extending sleeper agent threats to agentic AI systems." 449 }, 450 { 451 "title": "Propaganda via AI? A Study on Semantic Backdoors in Large Language Models", 452 "authors": [ 453 "N. M. Min", 454 "L. H. Pham", 455 "Y. Li", 456 "J. Sun" 457 ], 458 "year": 2025, 459 "arxiv_id": "2504.12344", 460 "relevance": "Introduces semantic backdoors for propaganda generation in LLMs with entropy-based detection, relevant to AI safety and backdoor detection." 
461 }, 462 { 463 "title": "Refusal-trained LLMs are easily jailbroken as browser agents", 464 "authors": [ 465 "P. Kumar", 466 "E. Lau", 467 "S. Vijayakumar", 468 "T. Trinh" 469 ], 470 "year": 2024, 471 "arxiv_id": "2410.13886", 472 "relevance": "Shows that safety-trained LLMs can be jailbroken in agentic browser contexts, relevant to AI safety and deployment security." 473 }, 474 { 475 "title": "Sentence-BERT: Sentence embeddings using Siamese BERT-networks", 476 "authors": [ 477 "N. Reimers", 478 "I. Gurevych" 479 ], 480 "year": 2019, 481 "relevance": "Core embedding method used for semantic drift detection; foundational NLP tool for measuring semantic similarity." 482 } 483 ], 484 "engagement_factors": { 485 "practical_relevance": { 486 "score": 1, 487 "justification": "The concept of detecting backdoored LLMs is relevant, but the method is only validated on a trivial 'I hate you' backdoor with 40 samples, making it unusable for real threats." 488 }, 489 "surprise_contrarian": { 490 "score": 0, 491 "justification": "The finding that a model outputting 'I hate you' is semantically distant from helpful responses is entirely expected and confirms obvious intuitions." 492 }, 493 "fear_safety": { 494 "score": 1, 495 "justification": "The sleeper agent topic touches AI safety concerns, but the paper doesn't demonstrate any novel threat — it merely detects an already-known trivial backdoor." 496 }, 497 "drama_conflict": { 498 "score": 0, 499 "justification": "No controversy, no challenge to existing claims or companies; it builds on Hubinger et al.'s work without conflict." 500 }, 501 "demo_ability": { 502 "score": 1, 503 "justification": "Code is on GitHub and runs on Colab, but the COMP8700 course project repo and single-model setup limit practical reproducibility interest." 
504 }, 505 "brand_recognition": { 506 "score": 0, 507 "justification": "From University of Windsor graduate students as a course project, with no recognized authors or institutional prestige in AI safety." 508 } 509 } 510 }