scan.json (20008B)
1 { 2 "paper": { 3 "title": "Cross-LLM Generalization of Behavioral Backdoor Detection in AI Agent Supply Chains", 4 "authors": ["Arun Chowdary Sanna"], 5 "year": 2025, 6 "venue": "Preprint", 7 "doi": null 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub link provided: https://github.com/arunsanna/cross-llm-backdoor-detection (Section V-G and Section IX-D)." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "Paper states 'We release our multi-LLM behavioral trace dataset and detection framework' (contribution 5) and 'We release our code, data, and reproducibility package' (Section IX-D)." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": true, 24 "justification": "Section V-G specifies Python 3.10, scikit-learn 1.3.0, Intel i7, 32GB RAM, CPU-only. This is sufficient to recreate the environment." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "While code and data are released, the paper itself does not include step-by-step reproduction instructions. No README content or reproduction commands are described in the paper." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results are reported as point estimates (e.g., 92.7%, 49.2%, 90.6%) with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": true, 41 "justification": "Section VI-C reports p < 0.001 for model-aware vs ensemble voting comparison." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Cohen's d reported for ensemble comparison (d = 1.87, Section VI-C) and per-model discriminative features (Table VI, d = 0.18-0.33)." 
47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "No justification for why 200 traces per model is sufficient. No power analysis. The limitations section acknowledges the dataset scale issue but provides no justification." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations or variance across experimental runs reported. Results appear to be from a single train/test split with fixed seed (42). No multi-run results." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Table I compares against BadAgent and AgentPoison. Table VII compares four detection strategies (single-model, pooled, ensemble voting, model-aware)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "BadAgent (2024) and AgentPoison (2024) are recent and represent the state of the art in agent backdoor detection." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": true, 73 "justification": "RQ2 analyzes feature stability by category (Table V) and identifies which feature categories drive performance. RQ3 compares four detection strategies as ablations of the approach." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "Reports accuracy, F1-score, AUC-ROC, precision, recall, and generalization gap (Section V-D, Tables III and VII)." 79 }, 80 "human_evaluation": { 81 "applies": false, 82 "answer": false, 83 "justification": "This is an ML classification study on execution traces. Human evaluation of system outputs is not relevant." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "Section V-F describes 80/20 train/test split with stratified sampling and fixed random seed (42)." 
89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table III provides per-model precision/recall/F1. Table V provides per-feature-category stability analysis. The 6x6 matrix (Fig. 1) shows per-model-pair results." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "The cross-model failure (49.2% accuracy) is the central finding. Section VI-A discusses GPT-5.1's higher FN rate (15%). Section VII-C discusses false positive/negative trade-offs." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "Ensemble voting performing poorly (62.8%) is a negative result. The entire cross-model generalization gap (49.2%) is a negative finding about single-model detectors." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims (92.7% same-model, 49.2% cross-model, 43.4% gap, 90.6% model-aware) are all supported by results in Section VI and Tables III/VII." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The paper claims temporal features cause the generalization gap (RQ2). This is supported by CV analysis showing temporal features have CV > 0.8 while structural features have CV < 0.2. The ablation across feature categories provides adequate evidence for this causal claim." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section VIII explicitly bounds generalization: synthetic backdoors (not real attacks), non-adaptive adversaries, 6 LLMs only, temporal snapshot (Nov 2025), infrastructure confounding. The paper is careful about scope." 
121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section VIII-f discusses infrastructure confounding as an alternative explanation for temporal feature instability: timing features may capture provider hardware/network differences rather than model-specific patterns." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Models are identified as 'GPT-5.1', 'Claude Sonnet 4.5', 'Grok 4.1 Fast', 'Llama 4 Maverick', 'GPT-OSS 120B', 'DeepSeek Chat V3.1'. These are marketing names without snapshot dates or API version identifiers." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": false, 137 "justification": "The paper describes backdoor injection (few-shot examples, trigger conditions) but does not provide the actual prompt text or system instructions used to generate traces." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section IV-C reports SVM (linear kernel, C=1.0, balanced class weights) and Random Forest (100 estimators, max_depth=10, balanced class weights). LLM API temperature/sampling settings are not reported, but the ML classifier hyperparameters are the primary ones for this study." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": false, 147 "justification": "The paper uses agent scaffolding to generate traces (agents performing tasks with tool invocations) but does not describe the agent scaffold architecture, tool implementations, or workflow details." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section IV-B describes 51 features across 4 categories with specific feature definitions. Section V-B describes backdoor generation methodology with trigger conditions and validation criteria. Z-score normalization mentioned in Section IV-C." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section VIII is a dedicated limitations section with 8 specific limitations (a through h)." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "Limitations are specific: synthetic backdoors vs real attacks, non-adaptive adversaries, infrastructure confounding of timing features, temporal validity of Nov 2025 snapshot, model identification assumption. These are study-specific, not boilerplate." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section III-D explicitly states out-of-scope items. Section VIII identifies specific things not tested: adaptive adversaries, weight-level backdoors, large-scale deployment, model architectures beyond the 6 tested." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "Paper states data is released at the GitHub repository (Section IX-D: 'We release our code, data, and reproducibility package')." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section V-A describes collection via OpenRouter API, 6 LLMs, 100 tasks from AgentBench, 200 traces per model (100 benign + 100 backdoor). Section V-B describes backdoor generation methodology." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. Data is synthetically generated execution traces from LLM APIs." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "Pipeline described: trace collection via API → feature extraction (51 features, 4 categories) → z-score normalization → classification. Backdoor validation criteria stated (Section V-B-c)." 
192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding disclosure or acknowledgments section found in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliation clearly stated: 'Enterprise AI Architect, Precise Software Solutions'." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information disclosed. Cannot assess funder independence without knowing funding source." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial disclosure statement found in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "This paper does not evaluate pre-trained model capability on a benchmark. It trains ML classifiers (SVM, Random Forest) on execution traces to detect backdoors. The LLMs are used to generate traces, not evaluated on knowledge tasks." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not applicable — the paper performs no pre-trained model benchmark evaluation. The train/test split for the ML classifiers is described (80/20 stratified)." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not applicable — the study trains its own classifiers on collected traces rather than evaluating pre-trained model knowledge." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants in this study." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants in this study." 
243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section IV-C states 'feature extraction and classification complete in under 1ms.' Section VII-C provides operational cost analysis (false positive analyst burden)." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "Hardware is stated (Intel i7, 32GB RAM) but total compute budget (API costs for generating 1,198 traces across 6 LLMs, training time) is not reported." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Single-model backdoor detectors achieve 92.7% accuracy within their training distribution but only 49.2% across different LLMs, a 43.4 percentage point generalization gap.", 286 "evidence": "6x6 cross-LLM detection matrix (Figure 1, Section VI-A) showing diagonal vs off-diagonal accuracy across 36 experiments.", 287 "supported": "moderate" 288 }, 289 { 290 "claim": "Temporal features (CV > 0.8) cause the cross-LLM generalization gap while structural features remain stable (CV < 0.2).", 291 "evidence": "Table IV shows feature stability analysis with CV values. 
Table V shows distribution by category (50% of temporal features unstable vs 0% of sequence features).", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Model-aware detection achieves 90.6% universal accuracy across all evaluated models.", 296 "evidence": "Table VII and Figure 2 (Section VI-C). Statistical significance reported: p < 0.001 vs ensemble voting, Cohen's d = 1.87.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "This is the first systematic study of cross-LLM behavioral backdoor detection.", 301 "evidence": "Table I compares with BadAgent and AgentPoison, showing neither evaluated cross-model detection. Section II-E provides positioning.", 302 "supported": "moderate" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval"], 306 "key_findings": "Single-model behavioral backdoor detectors fail catastrophically when applied to LLMs other than the one they were trained on (92.7% same-model accuracy drops to 49.2% cross-model, equivalent to random guessing). The root cause is temporal feature instability across LLM architectures (CV > 0.8). Model-aware detection, which incorporates model identity as an additional feature, achieves 90.6% universal accuracy across all 6 tested LLMs. The study evaluates 1,198 execution traces across 6 production LLMs from 5 providers.", 307 "red_flags": [ 308 { 309 "flag": "Synthetic backdoors only", 310 "detail": "All backdoor traces are synthetically generated via controlled injection (few-shot poisoning and tool replacement), not captured from real attacks. The paper acknowledges this but the ecological validity of the detection results is uncertain." 311 }, 312 { 313 "flag": "Single train/test split", 314 "detail": "Results are from a single 80/20 split with fixed seed (42). No cross-validation or multi-run variance reported. The 92.7% and 49.2% figures could be sensitive to the particular split." 
315 }, 316 { 317 "flag": "Infrastructure confounding acknowledged but unresolved", 318 "detail": "Section VIII-f acknowledges that timing features may capture provider infrastructure differences rather than model-specific behavioral patterns, since all LLMs were accessed via OpenRouter on different provider hardware. This could explain the temporal instability finding." 319 }, 320 { 321 "flag": "Small per-model sample size", 322 "detail": "200 traces per model (100 benign + 100 backdoor) with only 40 traces in each test set. No power analysis or sample size justification provided." 323 }, 324 { 325 "flag": "Solo author with industry affiliation", 326 "detail": "Single author from 'Precise Software Solutions' with no funding disclosure, competing interests statement, or co-author review. No peer review venue indicated." 327 } 328 ], 329 "cited_papers": [ 330 { 331 "title": "Malice in Agentland: Down the Rabbit Hole of Backdoors in the AI Supply Chain", 332 "authors": ["L. Boisvert", "A. Puri", "C. K. R. Evuru"], 333 "year": 2025, 334 "arxiv_id": "2510.05159", 335 "relevance": "Comprehensive analysis of backdoors across the AI agent supply chain, directly relevant to agent security evaluation." 336 }, 337 { 338 "title": "BadAgent: Inserting and Activating Backdoor Attacks in LLM Agents", 339 "authors": ["Y. Wang", "D. Xue", "S. Zhang", "S. Qian"], 340 "year": 2024, 341 "arxiv_id": "2406.03007", 342 "relevance": "Demonstrates backdoor insertion through code manipulation in agent workflows, key baseline for agent backdoor detection." 343 }, 344 { 345 "title": "AgentPoison: Red-teaming LLM Agents via Poisoning Memory or Knowledge Bases", 346 "authors": ["Z. Chen", "Z. Xiang", "C. Xiao", "D. Song", "B. Li"], 347 "year": 2024, 348 "relevance": "Agent memory/knowledge base poisoning attack with watermarking-based detection, key baseline for this work." 
349 }, 350 { 351 "title": "Sleeper Agents: Training Deceptive LLMs that Persist Through Safety Training", 352 "authors": ["E. Hubinger", "C. Denison", "J. Mu"], 353 "year": 2024, 354 "relevance": "Shows backdoored models maintain malicious behavior after safety training, relevant to AI safety and alignment research." 355 }, 356 { 357 "title": "Poisoning Web-Scale Training Datasets is Practical", 358 "authors": ["N. Carlini", "M. Jagielski"], 359 "year": 2024, 360 "relevance": "Demonstrates practical data poisoning at web scale requiring only 0.01% modification, foundational for supply chain attack research." 361 }, 362 { 363 "title": "AgentBench: Evaluating LLMs as Agents", 364 "authors": ["X. Liu", "H. Yu", "H. Zhang"], 365 "year": 2023, 366 "arxiv_id": "2308.03688", 367 "relevance": "Benchmark for evaluating LLMs as agents, used as task source in this study." 368 }, 369 { 370 "title": "EIA: Environmental Injection Attack on Generalist Web Agents for Privacy Leakage", 371 "authors": ["Z. Liao", "L. Mo", "C. Xu"], 372 "year": 2024, 373 "arxiv_id": "2409.11295", 374 "relevance": "Environmental injection attacks on web agents causing privacy leakage, relevant to agent security threat landscape." 375 }, 376 { 377 "title": "LlamaFirewall: An Open Source Guardrail System for Building Secure AI Agents", 378 "authors": ["S. Chennabasappa", "C. Nikolaidis", "D. Song"], 379 "year": 2025, 380 "relevance": "Open source guardrail system for agent security, relevant to defense mechanisms against agent attacks." 381 }, 382 { 383 "title": "Backdoor Attacks for In-Context Learning with Language Models", 384 "authors": ["N. Kandpal", "M. Jagielski", "F. Tramer", "N. Carlini"], 385 "year": 2023, 386 "arxiv_id": "2307.14692", 387 "relevance": "In-context learning backdoor attacks on language models, relevant to LLM security research." 388 } 389 ] 390 }