scan.json (26393B)
1 { 2 "paper": { 3 "title": "Microsaccade-Inspired Probing: Positional Encoding Perturbations Reveal LLM Misbehaviours", 4 "authors": ["Rui Melo", "Rui Abreu", "Corina S. Pasareanu"], 5 "year": 2025, 6 "venue": "Under review at ICLR 2026", 7 "arxiv_id": "2510.01288", 8 "doi": "10.48550/arXiv.2510.01288" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "MIP introduces lightweight positional encoding perturbations to detect LLM misbehaviours across factuality, jailbreak, toxicity, and backdoor detection tasks. The method achieves near-perfect AUC on jailbreak and backdoor detection (up to 1.0), outperforming the LLMScan baseline in most settings across three models (Llama-3.2-3B, Llama-3.1-8B, Qwen2.5-14B). Toxicity detection remains the weakest area, with PCA visualizations confirming minimal class separation. The O(1) intervention complexity offers significant computational advantages over per-token or per-layer approaches.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "Appendix A states 'implementation of MIP' and 'intervention scripts' will be made publicly available on GitHub and Hugging Face 'upon publication.' This is a promise of future release, not a current release." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available datasets (Questions1000, WikiData, SciQ, Surge AI Toxicity, Real Toxicity Prompts, MTBA, Sleeper, VPI, AutoDAN, GCG, PAP). Appendix C describes the sources and Appendix A promises preprocessed datasets will also be released." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Appendix B specifies NVIDIA RTX A6000 GPU (48GB VRAM), driver version 550.144.03, CUDA 12.4, Intel Xeon Gold 5315Y CPU, 44 GiB RAM. 
Appendix D.1 describes 4-bit quantization with BitsAndBytes using nf4 and bfloat16." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions are provided. The paper promises code release 'upon publication' but does not include a README or commands to replicate experiments." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Table 1 reports only point estimates for ACC and AUC. No confidence intervals or error bars are provided for the main results. Appendix G reports ± std dev for ablation studies but not for the main results." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims MIP outperforms LLMScan baselines but provides no statistical significance tests. Comparisons are made by visually comparing numbers in Table 1." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Cohen's d effect sizes are reported for head-wise attribution analysis (Section 5.2, Figure 4a), providing magnitude of perturbation-induced differences across attention heads and layers." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "Appendix C mentions datasets of 'at least 1,000 entries' but does not justify why this size is sufficient. No power analysis is provided. The 80/10/10 split is described but not justified." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Main results in Table 1 are single-run numbers with no variance reported. Appendix G reports std dev for ablation noise types but not for the primary experiments." 
62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "LLMScan (Zhang et al., 2025) is used as the primary baseline across all tasks. Table 1 includes baseline ACC-AUC ranges." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "LLMScan (Zhang et al., 2025) is a contemporary ICML 2025 paper and represents the state-of-the-art probing method for LLM misbehaviour detection." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 5.3 and Appendix G ablate the perturbation mechanism, comparing sinusoidal PE intervention against random and Gaussian noise perturbations." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Table 1 reports both AUC and ACC (accuracy) for all experiments." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant for this automated misbehaviour detection task — the ground truth labels come from the benchmark datasets." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "Appendix E describes an 80/10/10 train/val/test split with stratified sampling and fixed random seed. Results are reported on the held-out test set." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 1 provides per-dataset breakdowns across all 4 task categories (factuality, jailbreak, toxicity, backdoor) with 12 individual dataset results." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 5.1 discusses toxicity detection as a failure case where 'PCA visualizations show minimal class separation, and LDA confirms that toxicity is intrinsically harder to discriminate.' Also notes weakness on SciQ with Qwen2.5-14B." 
104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that toxicity detection is comparatively weaker, that WikiData with Llama-3.2-3B shows reduced performance, and that SciQ with Qwen2.5-14B achieves only 0.52 ACC (near chance). These are explicitly discussed as challenges." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims MIP 'detects failures across diverse settings including factuality, safety, toxicity, and backdoor attacks' and is 'computationally efficient.' Table 1 supports cross-task detection and Section 5.3/Appendix F supports O(1) efficiency." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper makes causal claims like 'positional perturbations are highly effective at surfacing latent signals of alignment violations' and that 'misbehaviours are associated with localized deviations.' The ablation compares perturbation types but does not isolate the causal mechanism — the observed differences could be due to other factors in the pipeline." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper claims MIP is 'model-agnostic' (Section 7) but tests only 3 models from 2 families (Llama and Qwen), all open-source. No testing on closed-source models (GPT-4, Claude) or non-instruct models. The title claims to 'Reveal LLM Misbehaviours' broadly." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "The paper does not consider alternative explanations for why the perturbation features are discriminative. For example, the MLP classifier might be picking up on input length or complexity differences between normal and adversarial inputs rather than positional encoding effects specifically." 
131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper directly measures what it claims: binary classification accuracy and AUC for misbehaviour detection. The metrics match the claims without proxy gaps." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Appendix D specifies exact model names: Llama-3.2-3B-Instruct, Llama-3.1-8B-Instruct, Qwen2.5-14B-Instruct, with parameter counts, layer counts, and head counts." 143 }, 144 "prompts_provided": { 145 "applies": false, 146 "answer": false, 147 "justification": "This paper does not use prompting for its evaluation — it perturbs positional encodings and analyzes internal representations. The prompts/inputs come from the benchmark datasets." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Appendix E reports MLP hyperparameters: hidden sizes (128, 64), dropout 0.3, learning rate 1e-3, weight decay 1e-4, AdamW optimizer, 80 epochs with early stopping patience 10. Appendix D.1 reports quantization settings." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. MIP is a single forward pass perturbation + classifier pipeline." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Appendix C describes dataset sources and curation. Appendix E documents the 80/10/10 split strategy. For toxicity, they explicitly state selection of 500 toxic and 500 non-toxic examples for balance." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 6 (Discussion) includes a 'Limitations' subsection that discusses unexplored failure modes (bias, subtle misinformation, fairness violations)." 
170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "The limitations section is brief and generic: 'other forms of failure modes such as bias, subtle misinformation, or fairness violations remain unexplored.' No specific threats to the validity of the presented results are discussed." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Section 6 states: 'our claim of generality should be interpreted as potential generality, pending further empirical confirmation' and lists unexplored failure types. This bounds the scope of the generalization claims." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "The intervention features, classifier inputs, and trained models are not released. Only the datasets used are public. The actual experimental data (perturbation features, split indices) are promised for future release." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Appendix C describes the source and nature of each dataset. The paper explains which versions and curations were used (e.g., 'Following Zhang et al. (2025), we adopt their curated versions')." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. All datasets are standard public benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is documented: input text → token embedding → PE perturbation → forward pass → extract attention matrices and next-token distributions → compute features → MLP classifier. Appendix E documents the split and training process." 
202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding sources are mentioned anywhere in the paper. There is no acknowledgments section listing grants or sponsors." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are listed: Carnegie Mellon University, FEUP, INESC-ID. The paper does not evaluate products from these institutions." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "No funding is disclosed, so independence cannot be assessed." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is provided. There is an ethics statement but it does not address financial interests." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses Llama-3.2, Llama-3.1, and Qwen2.5 models but does not state their training data cutoff dates. This matters because some benchmark datasets (Questions1000, WikiData) could be in the training data." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether the LLMs may have seen the benchmark datasets during pretraining. The factuality tasks (Questions1000, WikiData) are particularly vulnerable since they test factual knowledge." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "Many benchmarks used (Questions1000, WikiData, SciQ) predate the models' training cutoffs. No contamination analysis is performed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 
248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Section 5.3 and Appendix F discuss computational efficiency with O(1) complexity analysis. Figure 5 shows cumulative FLOPs comparison across intervention strategies on the Sleeper dataset." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "While the hardware is specified (Appendix B) and FLOPs are compared relatively, no total compute budget (GPU hours, wall-clock time for all 66 experiment configurations) is stated." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Appendix E mentions a fixed random seed for data splits but does not report results across multiple seeds. Main results appear to be single-seed." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The paper does not state how many runs produced the results in Table 1. 
Appendix E describes training with early stopping but does not specify number of independent runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "MLP architecture (128, 64) and hyperparameters are stated but no search budget or justification for why these were chosen is provided." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No explanation of how the MLP architecture and hyperparameters were selected. The paper presents one configuration without discussing alternatives tried." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "The paper compares across 66 experiment configurations (12 datasets × 3 models + variants) with no multiple comparison correction." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement MIP and compare against their own implementation/reproduction of LLMScan baselines. No acknowledgment of self-comparison bias per Lucic et al. (2018)." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Figure 5 directly plots cumulative FLOPs vs. examples for MIP, Layer Intervention, Token Intervention, and LLMScan, showing performance at matched compute." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper does not discuss whether the binary classification task (normal vs. misbehaviour) with pre-labeled datasets actually measures real-world misbehaviour detection capability. No discussion of construct validity." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved in the evaluation." 
337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether benchmark datasets (many from 2017-2022) were in the training data of models released in 2024-2025. This is critical for the factuality task." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the features (attention perturbation differences) contain information that leaks the label. For example, adversarial inputs (GCG) have very different surface characteristics than normal inputs." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of potential overlap or non-independence between train and test splits beyond stating the split is stratified." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No concrete leakage detection or prevention method is applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "MIP achieves near-perfect separation on adversarial jailbreak prompting, with AUC/ACC reaching 1.0 on GCG across all three models.", 365 "evidence": "Table 1 shows AUC=1.0 for GCG across all models and ACC of 1.0 for Llama models, 0.96 for Qwen.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "MIP outperforms LLMScan baselines across most tasks and datasets.", 370 "evidence": "Table 1 compares MIP ACC/AUC against LLMScan baseline ranges across 12 datasets × 3 models. 
MIP exceeds the baseline range in most configurations.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "MIP requires O(1) intervention complexity, compared to O(n) for per-token and O(L) for per-layer approaches.", 375 "evidence": "Section 5.3 provides the complexity analysis and Appendix F (Figure 5) shows cumulative FLOPs comparison.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Toxicity is intrinsically harder to discriminate via positional perturbations.", 380 "evidence": "Section 5.1 notes lower performance, PCA shows minimal class separation, Cohen's d shows little localized signal in attention heads for toxicity datasets.", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Backdoor activations manifest in distinct internal patterns reliably exposed by perturbations, with discriminative signals concentrated in mid-to-late layers.", 385 "evidence": "Figures 4a and 4b show Cohen's d hotspots and per-head AUC scores concentrated in layers 21-23 for Llama-3.2-3B.", 386 "supported": "moderate" 387 }, 388 { 389 "claim": "MIP is model-agnostic.", 390 "evidence": "Tested on 3 models (Llama-3.2-3B, Llama-3.1-8B, Qwen2.5-14B). However, all are open-source instruct models — no testing on closed-source or non-instruction-tuned models.", 391 "supported": "weak" 392 } 393 ], 394 "red_flags": [ 395 { 396 "flag": "No variance or significance testing on main results", 397 "detail": "Table 1 reports only point estimates with no error bars, confidence intervals, or significance tests. With 66 configurations and single-run results, it is impossible to tell whether observed differences are meaningful or due to random variation in the MLP training." 398 }, 399 { 400 "flag": "Potential confound: input surface characteristics", 401 "detail": "Some adversarial inputs (especially GCG) have radically different surface-level characteristics than normal inputs (gibberish tokens, unusual lengths). 
The MLP may be detecting these surface differences rather than positional-encoding-specific signals. This alternative explanation is not discussed." 402 }, 403 { 404 "flag": "Contamination risk for factuality benchmarks", 405 "detail": "Questions1000, WikiData, and SciQ predate the models' training cutoffs. The factuality task tests whether the model 'knows' it is lying — but contamination could affect both the model's factual knowledge and its attention patterns, confounding the results." 406 }, 407 { 408 "flag": "Limited model diversity for 'model-agnostic' claim", 409 "detail": "Only 3 open-source instruction-tuned models from 2 families tested. The claim of model-agnosticism would require testing across more diverse architectures (e.g., encoder-only, closed-source, non-instruction-tuned)." 410 }, 411 { 412 "flag": "Self-comparison bias with baseline", 413 "detail": "The LLMScan baseline results appear to be from the authors' own reproduction. The baseline ranges in Table 1 are not clearly sourced — they could underperform due to implementation differences." 414 } 415 ], 416 "cited_papers": [ 417 { 418 "title": "LLMScan: Causal Scan for LLM Misbehavior Detection", 419 "authors": ["Mengdi Zhang", "Goh Kai Kiat", "Peixin Zhang", "Jun Sun", "Lin Xin Rose", "Hongyu Zhang"], 420 "year": 2025, 421 "relevance": "Primary baseline for LLM misbehaviour detection via layer-wise interventions." 422 }, 423 { 424 "title": "Representation Engineering: A Top-Down Approach to AI Transparency", 425 "authors": ["Andy Zou"], 426 "year": 2023, 427 "arxiv_id": "2310.01405", 428 "relevance": "Linear Artificial Tomography (LAT) probing technique for factuality detection in LLMs." 429 }, 430 { 431 "title": "Sleeper Agents: Training Deceptive LLMs That Persist Through Safety Training", 432 "authors": ["Evan Hubinger"], 433 "year": 2024, 434 "arxiv_id": "2401.05566", 435 "relevance": "Backdoor attack benchmark studying deceptive models that persist through safety training." 
436 }, 437 { 438 "title": "Universal and Transferable Adversarial Attacks on Aligned Language Models", 439 "authors": ["Andy Zou", "Zifan Wang", "Nicholas Carlini"], 440 "year": 2023, 441 "arxiv_id": "2307.15043", 442 "relevance": "GCG adversarial attack method used as a jailbreak benchmark in this paper." 443 }, 444 { 445 "title": "SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks", 446 "authors": ["Alexander Robey", "Eric Wong", "Hamed Hassani", "George J. Pappas"], 447 "year": 2024, 448 "relevance": "Defense against jailbreak attacks using input perturbations." 449 }, 450 { 451 "title": "How to Catch an AI Liar: Lie Detection in Black-Box LLMs by Asking Unrelated Questions", 452 "authors": ["Lorenzo Pacchiardi"], 453 "year": 2024, 454 "arxiv_id": "2309.15840", 455 "relevance": "Lie detection in LLMs using behavioural analysis without model internals access." 456 }, 457 { 458 "title": "Jailbroken: How Does LLM Safety Training Fail?", 459 "authors": ["Alexander Wei", "Nika Haghtalab", "Jacob Steinhardt"], 460 "year": 2023, 461 "relevance": "Analysis of failure modes in LLM safety training and jailbreak attacks." 462 }, 463 { 464 "title": "AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models", 465 "authors": ["Xiaogeng Liu", "Nan Xu", "Muhao Chen", "Chaowei Xiao"], 466 "year": 2023, 467 "arxiv_id": "2310.04451", 468 "relevance": "Automated adversarial prompt generation for jailbreaking aligned LLMs." 469 }, 470 { 471 "title": "RealToxicityPrompts: Evaluating Neural Toxic Degeneration in Language Models", 472 "authors": ["Samuel Gehman"], 473 "year": 2020, 474 "arxiv_id": "2009.11462", 475 "relevance": "Toxicity benchmark for evaluating language model degeneration." 476 }, 477 { 478 "title": "Towards Monosemanticity: Decomposing Language Models with Dictionary Learning", 479 "authors": ["T. 
Bricken"], 480 "year": 2023, 481 "relevance": "Sparse autoencoder interpretability technique for understanding LLM internal representations." 482 }, 483 { 484 "title": "A Mixture of Linear Corrections Generates Secure Code", 485 "authors": ["Weichen Yu", "Ravi Mangal", "Terry Zhuo", "Matt Fredrikson", "Corina S. Pasareanu"], 486 "year": 2025, 487 "arxiv_id": "2507.09508", 488 "relevance": "Mechanistic editing approach for steering LLM behaviour toward secure code generation." 489 } 490 ] 491 }