scan.json (27376B)
1 { 2 "paper": { 3 "title": "SoK: a Comprehensive Causality Analysis Framework for Large Language Model Security", 4 "authors": ["Wei Zhao", "Zhe Li", "Jun Sun"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2512.04841", 8 "doi": "10.48550/arXiv.2512.04841" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage", "survey_methodology"], 12 "methodology_tags": ["benchmark-eval", "meta-analysis"], 13 "key_findings": "The paper presents a unified multi-level causality analysis framework for LLM security, demonstrating that targeted interventions on causally critical components can reliably modify safety behavior. Safety mechanisms are found to be highly localized in early-to-middle transformer layers (2-12) with only 1-2% of neurons exhibiting causal influence. Causal features extracted from the framework achieve over 95% detection accuracy (F1) for jailbreak, backdoor, and fairness tasks across LLaMA2-7B, Qwen2.5-7B, and LLaMA3.1-8B, though hallucination detection remains challenging without multi-level feature fusion.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub repository URL provided: https://github.com/Amadeuszhao/SOK_Casuality (mentioned in abstract and Section 1)." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available datasets: Alpaca, AdvBench, TruthfulQA, RealToxicityPrompts, and standard backdoor attack datasets (BadNets, CTBA, MTBA, Sleeper). All are public benchmarks." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions 'H100-80Gb server' but provides no requirements.txt, library versions, or detailed environment setup information." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided in the paper. 
The GitHub link is given but no README details or reproduction scripts are described." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 1, 3, and 4 are reported as single point estimates (e.g., '92.8%', '0.994') with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes numerous comparative claims (e.g., neuron-level vs layer-level efficacy, model comparisons) without any statistical significance tests." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Effect sizes are reported with baseline context throughout: e.g., ASR changes from 100% to 26.6% (Table 1), from 0% to 92.8%, and F1 scores with absolute values across conditions. The reader can assess magnitude of effects." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "500 prompts per dataset are used without justification for why this number was chosen or whether it provides sufficient statistical power." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported for any results. It is unclear whether results are from single runs or averaged across multiple runs." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares five detection methods (neuron, layer, token, representation, consistency) against each other across multiple tasks and models, and includes before/after intervention comparisons." 
69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The attack methods used (GCG, AutoDAN, PAIR, AmpleGCG) and models (LLaMA3.1-8B, Qwen2.5-7B) are recent and representative of the current state of the art." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 5.5 presents a layer-specific ablation analysis, partitioning layers into early (2-8), middle (12-18), and late (22-28) groups to test which components are critical (Table 2)." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Multiple metrics are used: ASR for intervention efficacy, F1 score and DSR (Detection Success Rate) for detection performance (Tables 3 and 4)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "Safety evaluation relies entirely on GPT-4o as an automated judge. No human evaluation of the framework's outputs or safety assessments is performed." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "For detection experiments, 50% of data is used for training classifiers and 50% for testing (Section 6.2). Cross-attack transferability is tested by training on GCG/AutoDAN and evaluating on PAIR/AmpleGCG." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Tables 3 and 4 provide per-task (jailbreak, hallucination, backdoor, fairness), per-benchmark, per-model, and per-detection-method breakdowns." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses hallucination detection as a failure case where single-level methods achieve F1 below 0.7, and discusses why token-level detection performs poorly for GCG (Section 6.3). Table 5 addresses this with multi-level fusion." 
104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Token-level GCG detection F1 of 0.430 is reported honestly. Hallucination detection failure across all single methods is highlighted. Neuron-level interventions' moderate success (46.8-57.8%) compared to layer-level (>92%) is discussed." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The three abstract claims are supported: (1) targeted interventions modify safety behavior (Table 1), (2) safety mechanisms are localized in early-to-middle layers with 1-2% of neurons (Section 5, Figure 7), (3) >95% detection accuracy is achieved for jailbreak/backdoor/fairness but notably NOT hallucination — the abstract's phrasing 'across multiple threat types' is somewhat generous but the data is there." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper uses intervention-based causal analysis grounded in Pearl's do-calculus and SCM framework (Section 3). Interventions are controlled single-variable manipulations (token replacement, neuron deactivation, layer ablation, representation steering), which is adequate for the causal claims made." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The abstract and conclusions claim the framework is a 'general foundation for causal analysis of LLM vulnerabilities' but experiments are limited to three 7-8B parameter models. No testing on larger models, closed-source models, or non-English settings. The title claims 'Comprehensive' without bounding to these specific models." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not discuss alternative explanations for its findings. 
For example, the localization of safety in early layers could be an artifact of the specific safety-training approach used in these models rather than a general property. No alternative interpretations are considered." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": false, 135 "justification": "The paper uses GPT-4o as a safety judge (proxy) but claims to measure actual safety behavior. No discussion of whether GPT-4o judgments accurately reflect real safety outcomes. ASR depends entirely on this automated judge's accuracy, which is not validated." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model names with sizes are given: LLaMA2-7B, Qwen2.5-7B, LLaMA3.1-8B (Section 4.1). These are specific enough to identify exact model weights." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper mentions 'detailed evaluation templates provided in Appendix 8' for GPT-4o safety evaluation but the appendix content is not included in the paper text. No actual prompt text is visible." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters are reported for model inference (temperature, top-p, max tokens). The z-test threshold of |zi| > 2.5 is stated, and MLP architecture (128/64 neurons) is described, but LLM inference parameters are missing." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The framework performs direct model interventions and feature extraction, not agentic workflows." 
158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.1 describes dataset construction: 500 benign from Alpaca, 500 harmful from AdvBench, 500 GCG-generated and 500 AutoDAN-generated adversarial prompts. Section 6.2 describes train/test splits and balanced sampling for each task." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations section. The conclusion (Section 7) mentions 'future research' directions but does not discuss limitations of the current work." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. Issues like reliance on GPT-4o as judge, limited model sizes, or potential overfitting of detection classifiers are not addressed." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The paper does not clarify what the results do NOT show or what settings/models are excluded from the claims." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (model outputs, judge decisions, per-prompt results) is made available. Only aggregate statistics are reported." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.1 describes the three datasets (benign, harmful, adversarial) with their sources and sizes. Section 6.2 describes data collection for each detection task." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All data comes from standard public benchmarks and automated attack generation." 
197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline from prompt collection to intervention to evaluation is described: datasets are defined (Section 4.1), interventions are specified mathematically (Section 4.2), evaluation via GPT-4o judge is described (Section 4.3), and detection train/test splits are documented (Section 6.2)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is disclosed anywhere in the paper." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Authors are disclosed as affiliated with Singapore Management University." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed. Absence of disclosure is not absence of conflict." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": false, 229 "answer": false, 230 "justification": "The paper tests defenses/attacks and internal model mechanisms rather than evaluating model knowledge on benchmarks. Contamination of safety training is a different issue not captured by this criterion." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": false, 234 "answer": false, 235 "justification": "Same rationale: the paper evaluates security interventions and detection methods, not model capability on knowledge benchmarks." 
236 }, 237 "benchmark_contamination_addressed": { 238 "applies": false, 239 "answer": false, 240 "justification": "Not applicable — the benchmarks test safety behavior (jailbreaking, backdoor detection) rather than knowledge, so training data contamination is not the relevant concern." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 6 reports average detection time per input across all analysis levels and models (e.g., neuron-level 0.12s, token-level 2.87-4.37s)." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Hardware is mentioned (H100-80GB) but total compute budget (GPU hours, total experiment time) is not stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No mention of random seeds or seed sensitivity analysis. 
Results appear to be from single runs." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is never stated. It is unclear whether results are single-run or averaged." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "The z-score threshold of 2.5 and MLP architecture (128/64) appear chosen without reporting any search budget or justification for these specific values." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The representation-level intervention coefficient (0.5 in Equation 14) and z-test threshold (2.5) are used without justification for how these values were selected." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, so multiple comparison correction is moot. But the paper makes many implicit comparisons across methods, models, and tasks without any correction." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors propose the framework and evaluate it themselves. Three of the cited attack/defense methods (CASPER [71], LED [70], LLMScan [66]) are by the same authors. No acknowledgment of self-evaluation bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Table 6 compares detection time across methods, allowing readers to assess the performance-cost tradeoff. The paper explicitly discusses that neuron/representation methods are faster (0.07-0.14s) with better performance than token/layer methods (1.92-4.37s)." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether the benchmarks (AdvBench, TruthfulQA, etc.) adequately measure what is claimed. 
For example, whether GPT-4o's safety judgments are valid measures of actual harmfulness is not questioned." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved. The framework operates directly on model internals." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether the models' safety training was influenced by knowledge of the specific attack methods being tested (e.g., GCG was published before LLaMA3.1's training)." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the detection classifiers' features could leak information about the attack type rather than measuring genuine causal signals." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "Detection classifiers are trained on 50% GCG/AutoDAN and tested on remaining data including same-distribution splits. No discussion of whether train and test prompts share structural similarities." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied." 359 } 360 }, 361 "survey_methodology": { 362 "prisma_or_structured_protocol": { 363 "applies": true, 364 "answer": false, 365 "justification": "The survey component (Sections 2.1-2.2) presents taxonomies of attacks and defenses but does not follow PRISMA or any structured review protocol. No search strategy, inclusion criteria, or systematic methodology is described." 366 }, 367 "quality_assessment_of_sources": { 368 "applies": true, 369 "answer": false, 370 "justification": "The survey treats all cited papers equally. No quality assessment or risk-of-bias evaluation of the surveyed attack/defense methods is performed." 
371 }, 372 "publication_bias_discussed": { 373 "applies": true, 374 "answer": false, 375 "justification": "No discussion of publication bias in the surveyed literature. The survey does not consider whether negative results or failed attack/defense methods are underrepresented." 376 } 377 } 378 }, 379 "claims": [ 380 { 381 "claim": "Targeted interventions on causally critical components can reliably modify safety behavior, with layer-level and representation-level interventions achieving >92% ASR on harmful prompts.", 382 "evidence": "Table 1 shows layer-level interventions increase ASR from 0% to 92.6-92.8% and representation-level from 0% to 92.8-96.0% across three models (Section 4.4).", 383 "supported": "strong" 384 }, 385 { 386 "claim": "Safety-related mechanisms are highly localized in early-to-middle layers (2-12), with only 1-2% of neurons exhibiting causal influence.", 387 "evidence": "Figure 7(b) shows layer ACE peaks at layer 2 (~0.76) and declines sharply through middle layers. Figure 7(c) shows 1.88% (Layer 1) and 0.88% (Layer 3) toxic neurons. Table 2 ablation confirms early layers are most critical (Section 5).", 388 "supported": "moderate" 389 }, 390 { 391 "claim": "Causal features achieve over 95% detection accuracy across jailbreak, backdoor, and fairness tasks.", 392 "evidence": "Tables 3-4 show neuron-level F1 >0.977 for jailbreak, >0.939 for backdoor, >0.990 for fairness. However, hallucination detection F1 is only 0.476-0.698 for single methods. 
The >95% claim requires cherry-picking the best method per task.", 393 "supported": "moderate" 394 }, 395 { 396 "claim": "Multi-level feature fusion achieves F1 0.956-0.987 and DSR 97-100% for hallucination detection.", 397 "evidence": "Table 5 shows combined features achieve these results on LLaMA2 and Qwen, but LLaMA3 achieves only 0.971 F1/97% DSR, and single-level methods without fusion perform much worse (Section 6.3).", 398 "supported": "moderate" 399 }, 400 { 401 "claim": "AutoDAN-generated prompts are more robust to token-level interventions than GCG prompts.", 402 "evidence": "Table 1: GCG residual ASR 24.2-30.4% after token intervention vs AutoDAN 64.4-72.0%. Figure 7(a) shows GCG has concentrated high-ACE tokens while AutoDAN ACE is distributed. Section 4.4 discusses this difference.", 403 "supported": "strong" 404 } 405 ], 406 "red_flags": [ 407 { 408 "flag": "Self-evaluation of own prior work", 409 "detail": "Three of the key methods evaluated (CASPER [71], LED [70], LLMScan [66]) are by the same authors. The framework builds on and validates their own prior contributions without acknowledging this potential bias." 410 }, 411 { 412 "flag": "No uncertainty quantification", 413 "detail": "All results are point estimates with no error bars, confidence intervals, or variance across runs. For a paper making strong quantitative claims (e.g., '1-2% of neurons'), the absence of uncertainty measures is concerning." 414 }, 415 { 416 "flag": "No limitations section", 417 "detail": "The paper lacks any limitations discussion despite testing on only three 7-8B parameter models. Claims of generality are unbounded." 418 }, 419 { 420 "flag": "GPT-4o judge not validated", 421 "detail": "Safety evaluation relies entirely on GPT-4o as an automated judge, but the accuracy of this judge is not validated against human ratings. ASR results are only as reliable as this proxy." 
422 }, 423 { 424 "flag": "Selective framing of detection results", 425 "detail": "The abstract claims '>95% detection accuracy across multiple threat types' but hallucination detection with single methods achieves F1 of only 0.476-0.698. The >95% claim requires selecting the best method per task or using multi-level fusion." 426 } 427 ], 428 "cited_papers": [ 429 { 430 "title": "Universal and transferable adversarial attacks on aligned language models", 431 "authors": ["Andy Zou", "Zifan Wang", "J. Zico Kolter", "Matt Fredrikson"], 432 "year": 2023, 433 "arxiv_id": "2307.15043", 434 "relevance": "GCG attack used as primary evaluation benchmark; foundational adversarial attack method for LLM safety research." 435 }, 436 { 437 "title": "AutoDAN: Generating stealthy jailbreak prompts on aligned large language models", 438 "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"], 439 "year": 2024, 440 "relevance": "AutoDAN attack used as key evaluation benchmark; demonstrates semantic-level adversarial prompt generation." 441 }, 442 { 443 "title": "Jailbreaking Black Box Large Language Models in Twenty Queries", 444 "authors": ["Patrick Chao", "Alexander Robey", "Edgar Dobriban", "Hamed Hassani", "George J. Pappas", "Eric Wong"], 445 "year": 2025, 446 "arxiv_id": "2310.08419", 447 "relevance": "PAIR attack method used in evaluation; representative of iterative refinement jailbreak approaches." 448 }, 449 { 450 "title": "Improving alignment and robustness with circuit breakers", 451 "authors": ["Andy Zou", "Long Phan", "Justin Wang"], 452 "year": 2024, 453 "relevance": "CircuitBreaker defense using representation rerouting — directly related to causality-based safety interventions." 454 }, 455 { 456 "title": "Constitutional AI: Harmlessness from AI feedback", 457 "authors": ["Yuntao Bai"], 458 "year": 2022, 459 "arxiv_id": "2212.08073", 460 "relevance": "Foundational safety alignment approach evaluated in the defense taxonomy." 
461 }, 462 { 463 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 464 "authors": ["Evan Hubinger"], 465 "year": 2024, 466 "arxiv_id": "2401.05566", 467 "relevance": "Sleeper agent backdoor attack used in detection evaluation; relevant to AI safety and deceptive alignment." 468 }, 469 { 470 "title": "Llama Guard: LLM-based input-output safeguard for human-AI conversations", 471 "authors": ["Hakan Inan"], 472 "year": 2023, 473 "arxiv_id": "2312.06674", 474 "relevance": "Safety classifier for LLM outputs; part of the defense taxonomy in the survey." 475 }, 476 { 477 "title": "SmoothLLM: defending large language models against jailbreaking attacks", 478 "authors": ["Alexander Robey", "Eric Wong", "Hamed Hassani", "George J. Pappas"], 479 "year": 2023, 480 "arxiv_id": "2310.03684", 481 "relevance": "Perturbation-based defense method; representative of inference-level safety approaches." 482 }, 483 { 484 "title": "TruthfulQA: Measuring how models mimic human falsehoods", 485 "authors": ["Stephanie Lin", "Jacob Hilton", "Owain Evans"], 486 "year": 2022, 487 "relevance": "Hallucination benchmark used in detection evaluation; measures model truthfulness." 488 }, 489 { 490 "title": "Jailbroken: How does LLM safety training fail?", 491 "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"], 492 "year": 2023, 493 "relevance": "Foundational analysis of safety training failure modes; key reference in the causality-guided attack taxonomy." 494 }, 495 { 496 "title": "GradSafe: Detecting unsafe prompts for LLMs via safety-critical gradient analysis", 497 "authors": ["Yueqi Xie"], 498 "year": 2024, 499 "relevance": "Gradient-based safety detection method; related process-based defense approach." 500 } 501 ] 502 }