scan.json (24959B)
1 { 2 "paper": { 3 "title": "Backdoor Samples Detection Based on Perturbation Discrepancy Consistency in Pre-trained Language Models", 4 "authors": [ 5 "Zuquan Peng", 6 "Jianming Fu", 7 "Lixin Zou", 8 "Li Zheng", 9 "Yanzhen Ren", 10 "Guojun Peng" 11 ], 12 "year": 2025, 13 "venue": "Neural Networks (preprint)", 14 "arxiv_id": "2509.05318" 15 }, 16 "checklist": { 17 "artifacts": { 18 "code_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "A GitHub repository URL is provided in Section 5 (footnote 1): https://github.com/pzq7025/BackdoorDetection." 22 }, 23 "data_released": { 24 "applies": true, 25 "answer": true, 26 "justification": "The paper uses publicly available datasets: YELP, OLID, and COVID, and uses public toolkits OpenBackdoor and TextAttack for attack generation. All datasets are referenced with citations and are publicly available." 27 }, 28 "environment_specified": { 29 "applies": true, 30 "answer": true, 31 "justification": "Section 6.1 specifies: Ubuntu OS v20.04, Intel Xeon E5-2680 CPU, 126GB RAM, Quadro RTX 5000 GPU, Python v3.7.5, PyTorch v1.13.0+cu117, OpenAttack v2.1.1, OpenBackdoor v1.0.0, TextAttack v0.3.7, and Transformers library v3.3.0." 32 }, 33 "reproduction_instructions": { 34 "applies": true, 35 "answer": false, 36 "justification": "While a code repository link is provided and the environment is specified, the paper does not include step-by-step reproduction instructions, a README with commands, or a dedicated 'Reproducing Results' section. The experimental setup describes parameters but not how to run the experiments end-to-end." 37 } 38 }, 39 "statistical_methodology": { 40 "confidence_intervals_or_error_bars": { 41 "applies": true, 42 "answer": false, 43 "justification": "All results (Tables 4, 5, 7, 8 and figures) report single AUROC point estimates without confidence intervals or error bars." 
44 }, 45 "significance_tests": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper claims NETE outperforms existing methods based solely on comparing AUROC values in tables. No statistical significance tests (p-values, t-tests, etc.) are reported." 49 }, 50 "effect_sizes_reported": { 51 "applies": true, 52 "answer": false, 53 "justification": "The paper reports raw AUROC values but does not provide effect sizes (e.g., Cohen's d) or put the improvements in context relative to the baselines beyond raw numbers. The absolute AUROC differences are visible but not characterized as effect sizes." 54 }, 55 "sample_size_justified": { 56 "applies": true, 57 "answer": false, 58 "justification": "The paper uses 500 samples for the PDC test (Sec. 4.2) and 10,000 samples (5,000 clean + 5,000 backdoor) for runtime evaluation (Sec. 6.3), but no justification or power analysis is provided for why these specific sample sizes were chosen." 59 }, 60 "variance_reported": { 61 "applies": true, 62 "answer": false, 63 "justification": "No standard deviations, variance across runs, or multiple-run results with spread measures are reported anywhere in the paper. All results appear to be single-run point estimates." 64 } 65 }, 66 "evaluation_design": { 67 "baselines_included": { 68 "applies": true, 69 "answer": true, 70 "justification": "The paper compares against five zero-shot detection baselines: Log, Rank, LogRank, Entropy, and ONION (Section 6.1). Additionally, roberta-base and roberta-large are compared for potential attack detection (Sec. 6.4)." 71 }, 72 "baselines_contemporary": { 73 "applies": true, 74 "answer": true, 75 "justification": "The baselines include ONION (2021), STRIP (2019), and the paper discusses recent works like Len-Meo (2024) and UnToken (2025). The paper explicitly justifies why white-box methods are excluded (they require poisoned model access) and discusses recent defense literature in Section 2.2." 
76 }, 77 "ablation_study": { 78 "applies": true, 79 "answer": true, 80 "justification": "Section 6.3 includes ablation-style analyses: impact of perturbation numbers (Fig. 4), impact of different pre-trained scoring models (GPT-xl vs GPT-medium, Fig. 5), and impact of different mask-filling models (T5-small vs T5-base vs T5-large, Fig. 6)." 81 }, 82 "multiple_metrics": { 83 "applies": true, 84 "answer": false, 85 "justification": "Only AUROC is used as the evaluation metric throughout the paper. Section 6.1 justifies the choice of AUROC but no additional metrics (e.g., precision, recall, F1) are reported." 86 }, 87 "human_evaluation": { 88 "applies": false, 89 "answer": false, 90 "justification": "This is a detection method evaluated entirely via automated metrics (AUROC) on programmatically constructed backdoor/clean samples. Human evaluation of the system's outputs is not relevant." 91 }, 92 "held_out_test_set": { 93 "applies": true, 94 "answer": false, 95 "justification": "The paper does not clearly describe a separation between development/tuning sets and test sets. The threshold analysis in Sec. 6.3 uses word-level backdoor samples to set thresholds for other attacks, but there is no explicit held-out test set protocol." 96 }, 97 "per_category_breakdown": { 98 "applies": true, 99 "answer": true, 100 "justification": "Results are broken down by attack type (word, sentence, syntactic, style with 4 sub-styles), by dataset (COVID, OLID, YELP), and by additional complex scenarios in Table 7 (CBA, BadChain, BadEdit, VPI, Sleepagent across 13 datasets)." 101 }, 102 "failure_cases_discussed": { 103 "applies": true, 104 "answer": true, 105 "justification": "Section 6.2 explicitly acknowledges that 'current black-box detection methods, including our proposed approach, show relatively poor detection performance under word and sentence insertion' and explains why. Section 7.2 discusses limitations in correcting syntactic/style backdoor samples." 
106 }, 107 "negative_results_reported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The paper reports that word-level and sentence-level attacks are harder to detect (Tables 4-5 show lower AUROC for these), and Section 6.2 explicitly discusses the suboptimal performance on these attack types. The threshold analysis in Sec. 6.3 shows only marginal improvement over ONION (0.77 vs 0.76)." 111 } 112 }, 113 "claims_and_evidence": { 114 "abstract_claims_supported": { 115 "applies": true, 116 "answer": true, 117 "justification": "The abstract claims NETE 'outperforms existing zero-shot black-box detection methods,' which is supported by Tables 4, 5, 7, and 8 showing NETE achieves the highest AUROC in most settings. The abstract also claims applicability to both pre-training and post-training phases, which is discussed in Sections 3 and 6.4." 118 }, 119 "causal_claims_justified": { 120 "applies": true, 121 "answer": true, 122 "justification": "The paper's causal claims center on the perturbation discrepancy consistency phenomenon. The claim that backdoor samples exhibit smaller perturbation discrepancy is supported by both empirical observation (Fig. 1, Sec. 4.2) and mathematical derivation connecting perturbation discrepancy to curvature (Sec. 5.3). Ablation studies in Sec. 6.3 provide controlled single-variable manipulation." 123 }, 124 "generalization_bounded": { 125 "applies": true, 126 "answer": true, 127 "justification": "The paper tests across multiple attack types (4 traditional + 5 LLM backdoor attacks + adversarial + jailbreak), multiple datasets (3 core + 13 additional), and acknowledges limitations for word/sentence-level attacks. Section 7.3 explicitly states time consumption limitations for large-scale datasets." 
128 }, 129 "alternative_explanations_discussed": { 130 "applies": true, 131 "answer": false, 132 "justification": "The paper does not discuss alternative explanations for why NETE works beyond the perturbation discrepancy phenomenon. No threats-to-validity or confound analysis is provided. The paper does not consider whether the performance gains could be attributed to other factors." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": false, 139 "justification": "The paper uses 'GPT-2', 'GPT-medium', 'GPT-xl', 'T5-small', 'T5-base', 'T5-large' without specific version identifiers or snapshot dates. For the LLM-adversary experiment, 'ChatGPT (i.e., GPT-3.5)' is used without a specific API version." 140 }, 141 "prompts_provided": { 142 "applies": false, 143 "answer": false, 144 "justification": "The method does not use prompting. It computes log probabilities and generates perturbations via mask-filling, which are algorithmic operations not involving prompt design." 145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Key hyperparameters are reported: mask-filling ratio of 10%, span of up to 2 words (Sec. 5.2), 50/200 perturbation numbers (Sec. 6.3), batch size of 32 (Sec. 6.3), cosine similarity 0.8 for adversarial attacks, 3 injected triggers for word-level attacker, temperature settings {0.3, 0.5, 0.7} for Temperature-adversary." 150 }, 151 "scaffolding_described": { 152 "applies": false, 153 "answer": false, 154 "justification": "No agentic scaffolding is used. The method is a standalone detection algorithm without agent-like components." 
155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 6.1 describes the attack generation process using OpenBackdoor with specified parameters, the STRAP model for style transfer, TextAttack for adversarial examples, and specific settings (cosine similarity 0.8, max 3 perturbed words). Dataset statistics are provided in Table 3." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": true, 166 "justification": "Section 7.3 is titled 'Limitation and Future Work' and provides substantive discussion of time consumption issues and strategies for improvement." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": true, 171 "justification": "Section 7.3 identifies a specific limitation: increasing data length leads to greater time expenditure, which 'imposes a substantial adverse effect on real-time detection systems.' Section 7.1 discusses the specific design choice of low span setting and its implications. Section 7.2 discusses the specific limitation that correction is only possible for word/sentence-level triggers." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "While the paper mentions some limitations, it does not explicitly state what the results do NOT show or what populations/settings are excluded. The paper does not systematically bound its claims to specific domains or model types beyond the tested ones." 177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "The paper does not release raw experimental data (e.g., per-sample detection scores, generated perturbation samples, or detailed per-experiment logs). Only aggregated AUROC values are reported in tables." 
184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Section 6.1 describes how datasets were selected (YELP, OLID, COVID from real-world scenarios), how backdoor samples were generated (using OpenBackdoor with specified parameters), how adversarial examples were created (TextAttack with cosine similarity 0.8), and Table 3 provides dataset statistics." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants are involved. Data sources are standard public benchmark datasets." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The pipeline is documented: (1) clean datasets selected (Table 3), (2) backdoor samples generated via OpenBackdoor with four trigger schemes (Sec. 6.1), (3) perturbations generated via T5 mask-filling (Sec. 5.2), (4) log probabilities computed via GPT models (Sec. 5.2), (5) curvature threshold applied (Sec. 5.2, Algorithm 1)." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": true, 205 "justification": "The Acknowledge section states: 'This research was supported by the National Natural Science Foundation of China (Grant No. 62272351).'" 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "All authors are affiliated with Wuhan University, Key Laboratory of Aerospace Information Security and Trusted Computing, Ministry of Education. No commercial product is being evaluated." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": true, 215 "justification": "The funder is the National Natural Science Foundation of China, a government research funding agency with no financial stake in the specific detection method's performance." 
216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests statement or financial interests declaration is present in the paper." 221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": false, 226 "answer": false, 227 "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It uses pre-trained models (GPT-2, T5) as tools for computing log probabilities and generating perturbations, not to test their knowledge. The evaluation is of the detection method, not of model knowledge." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": false, 231 "answer": false, 232 "justification": "Same rationale: the paper evaluates a detection algorithm, not a pre-trained model's performance on a knowledge benchmark. Train/test overlap for the language models is not relevant to the detection task." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": false, 236 "answer": false, 237 "justification": "Same rationale: benchmark contamination is not relevant since the paper tests a detection method, not model knowledge or generative capability on benchmarks." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants are involved in this study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants are involved in this study." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants are involved in this study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants are involved in this study." 
260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants are involved in this study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants are involved in this study." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants are involved in this study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Section 6.3 reports GPU memory consumption (4166 MB for models) and detailed runtime results in Table 6 across datasets and trigger types, with per-sample timing data." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": true, 286 "justification": "Table 6 provides comprehensive runtime and GPU memory consumption data. The hardware is specified (Quadro RTX 5000). Runtime ranges from ~35 seconds to ~650 seconds per 10,000 samples depending on dataset and trigger type." 287 } 288 } 289 }, 290 "claims": [ 291 { 292 "claim": "NETE outperforms existing zero-shot black-box detection methods for backdoor sample detection across multiple attack types and datasets.", 293 "evidence": "Tables 4, 5, 7, and 8 show NETE achieves the highest AUROC in the majority of experimental settings across style, word, sentence, and syntactic attacks on COVID, OLID, and YELP datasets, as well as 5 LLM-specific backdoor attacks across 13 datasets.", 294 "supported": "strong" 295 }, 296 { 297 "claim": "Backdoor samples exhibit anomalous perturbation discrepancy consistency, with smaller perturbation discrepancy compared to clean samples.", 298 "evidence": "Figure 1 and Section 4.2 demonstrate the density distribution gap between backdoor and clean samples across four trigger types on the YELP dataset using 500 samples. 
Section 5.3 provides mathematical justification connecting this to curvature.", 299 "supported": "strong" 300 }, 301 { 302 "claim": "The detection performance converges at approximately 50 perturbation steps.", 303 "evidence": "Figure 4 (Section 6.3) shows AUROC plateaus around 50 perturbations across all four attack types and three datasets.", 304 "supported": "moderate" 305 }, 306 { 307 "claim": "The choice of pre-trained scoring model and mask-filling model has minimal impact on detection results.", 308 "evidence": "Figures 5 and 6 compare GPT-xl vs GPT-medium and T5-small vs T5-large respectively, showing negligible differences. However, these comparisons are limited to models within the same family.", 309 "supported": "moderate" 310 }, 311 { 312 "claim": "NETE can effectively detect adversarial examples and jailbreak samples.", 313 "evidence": "Figure 8 shows NETE outperforms ONION on Textbugger and Textfooler adversarial examples. Section 6.4 reports AUROC of 0.91 for jailbreak detection vs 0.74 for Log baseline. However, only one jailbreak attack type (linguistic mutations) is tested.", 314 "supported": "moderate" 315 }, 316 { 317 "claim": "Current black-box methods, including NETE, show relatively poor detection for word-level and sentence-level attacks.", 318 "evidence": "Table 5 shows lower AUROC values for word and sentence attacks compared to syntactic and style attacks. Section 6.2 provides an explanation based on rare words occurring within normal output distributions.", 319 "supported": "strong" 320 } 321 ], 322 "methodology_tags": [ 323 "benchmark-eval" 324 ], 325 "key_findings": "The paper proposes NETE, a zero-shot black-box backdoor sample detection method based on the observation that backdoor samples exhibit smaller perturbation discrepancy than clean samples. The method uses T5 mask-filling to generate perturbations and GPT-2 to compute log probabilities, then applies a curvature-based threshold to classify samples. 
NETE outperforms existing zero-shot detection methods (Log, Rank, LogRank, Entropy, ONION) across four traditional and five LLM-specific backdoor attacks on multiple datasets, while also showing effectiveness against adversarial examples and jailbreak attacks. The method is lightweight, requiring only 4166 MB GPU memory, and detection performance converges at ~50 perturbations.", 326 "red_flags": [ 327 { 328 "flag": "No uncertainty quantification", 329 "detail": "All AUROC results are single point estimates without confidence intervals, error bars, or variance across runs. Without knowing the variability, it is impossible to determine whether performance differences between methods are meaningful or within noise." 330 }, 331 { 332 "flag": "No statistical significance tests", 333 "detail": "Claims of outperforming baselines are based solely on comparing raw AUROC values. Some differences are very small (e.g., 0.77 vs 0.76 in threshold analysis) and could be within random variation." 334 }, 335 { 336 "flag": "Single evaluation metric", 337 "detail": "Only AUROC is reported. In practical deployment, precision and recall at specific thresholds matter (e.g., high false positive rates could make the system unusable). The paper does not report precision-recall curves or F1 scores." 338 }, 339 { 340 "flag": "Threshold selection relies on known attack knowledge", 341 "detail": "The threshold analysis in Section 6.3 uses word-level backdoor samples (known attacks) to set thresholds for unknown attacks, which contradicts the zero-shot claim to some extent. The practical question of how to set thresholds without any backdoor samples is not fully addressed." 342 } 343 ], 344 "cited_papers": [ 345 { 346 "title": "Sleeper agents: Training deceptive LLMs that persist through safety training", 347 "authors": ["E. Hubinger", "C. Denison", "J. 
Mu"], 348 "year": 2024, 349 "arxiv_id": "2401.05566", 350 "relevance": "Directly relevant to AI safety: demonstrates persistent deceptive behaviors in LLMs that survive safety training, used as an attack scenario in this paper." 351 }, 352 { 353 "title": "BackdoorLLM: A comprehensive benchmark for backdoor attacks on large language models", 354 "authors": ["Y. Li", "H. Huang", "Y. Zhao"], 355 "year": 2025, 356 "arxiv_id": "2408.12798", 357 "relevance": "Benchmark framework for evaluating backdoor attacks on LLMs, used to ensure consistency in this paper's empirical analysis." 358 }, 359 { 360 "title": "BadChain: Backdoor chain-of-thought prompting for large language models", 361 "authors": ["Z. Xiang", "F. Jiang", "Z. Xiong"], 362 "year": 2024, 363 "relevance": "Novel backdoor attack exploiting chain-of-thought prompting in LLMs, relevant to agentic AI security." 364 }, 365 { 366 "title": "BadEdit: Backdooring large language models by model editing", 367 "authors": ["Y. Li", "T. Li", "K. Chen"], 368 "year": 2024, 369 "relevance": "Backdoor injection through model editing rather than poisoning, relevant to LLM security research." 370 }, 371 { 372 "title": "Composite backdoor attacks against large language models", 373 "authors": ["H. Huang", "Z. Zhao", "M. Backes"], 374 "year": 2024, 375 "relevance": "Multi-component backdoor attacks targeting LLMs with composite triggers, relevant to understanding attack complexity." 376 }, 377 { 378 "title": "Backdooring instruction-tuned large language models with virtual prompt injection", 379 "authors": ["J. Yan", "V. Yadav", "S. Li"], 380 "year": 2024, 381 "relevance": "Backdoor attacks targeting instruction-tuned LLMs, relevant to safety of agentic LLM systems." 382 }, 383 { 384 "title": "Poison attack and poison detection on deep source code processing models", 385 "authors": ["J. Li", "Z. Li", "H. 
Zhang"], 386 "year": 2024, 387 "doi": "10.1145/3605943", 388 "relevance": "Backdoor attacks and detection specifically on code processing models, directly relevant to LLM programming safety." 389 }, 390 { 391 "title": "DetectGPT: Zero-shot machine-generated text detection using probability curvature", 392 "authors": ["E. Mitchell", "Y. Lee", "A. Khazatsky"], 393 "year": 2023, 394 "relevance": "Foundational method for zero-shot text detection using curvature that NETE builds upon, relevant to LLM-generated content detection." 395 }, 396 { 397 "title": "BDMMT: Backdoor sample detection for language models through model mutation testing", 398 "authors": ["J. Wei", "M. Fan", "W. Jiao"], 399 "year": 2024, 400 "relevance": "Recent backdoor detection method using model mutation testing, relevant baseline for LLM security." 401 }, 402 { 403 "title": "ArtPrompt: ASCII art-based jailbreak attacks against aligned LLMs", 404 "authors": ["F. Jiang", "Z. Xu", "L. Niu"], 405 "year": 2024, 406 "relevance": "Novel jailbreak attack method against aligned LLMs, relevant to AI safety evaluation." 407 }, 408 { 409 "title": "CodeT5+: Open code large language models for code understanding and generation", 410 "authors": ["Y. Wang", "H. Le", "A. Gotmare"], 411 "year": 2023, 412 "relevance": "Code LLM that could be targeted by backdoor attacks, relevant to understanding the scope of backdoor threats in code generation." 413 } 414 ] 415 }