scan.json (19897B)
1 { 2 "paper": { 3 "title": "CoTDeceptor: Adversarial Code Obfuscation Against CoT-Enhanced LLM Code Agents", 4 "authors": ["Haoyang Li", "Mingjin Li", "Jinxin Zuo", "Siqi Li", "Xiao Li", "Hao Wu", "Yueming Lu", "Xiaochuan He"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2512.21250" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": true, 14 "justification": "GitHub link provided: https://github.com/hiki9712/CoT-Code-Obfuscation (footnote on page 1)." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "The malicious code dataset was 'provided through collaboration with QiAnXin' (Section 5.1) but no public download link or dataset release is mentioned." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or environment setup section is provided. The paper does not specify library versions or dependencies." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments are described in the paper." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Tables 2-4 report only point estimates (pass/fail, scores, F1) with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims CoTDeceptor outperforms CodeBreaker and other baselines but provides no statistical significance tests." 42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": false, 46 "justification": "Results are reported as pass/fail counts and raw F1 scores. No effect sizes (Cohen's d, odds ratios) or baseline-contextualized percentage improvements are provided." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "500 vulnerable samples are used (Section 5.2) and 15 vulnerability categories tested, but no justification for these sizes is given." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No standard deviations, variance, or spread measures are reported across experimental runs." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "Table 2 compares CoTDeceptor against CodeBreaker across the same vulnerability categories and detectors." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": true, 68 "justification": "Baselines include CodeBreaker (USENIX Security 2024), ITGen (ICSE 2025), Flashboom (IEEE S&P 2025), and TrojanPuzzle (IEEE S&P 2024) — all recent." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study removing individual components (reflection module, strategy tree, MoE voting) to measure their contribution." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper uses evasion pass rate, potential scores, average rollout cycles, and precision/recall/F1 for the fine-tuning experiment (Tables 2-4)." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of the obfuscated code quality, readability, or semantic preservation is included. All evaluation is automated." 84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": false, 88 "justification": "No explicit separation of dev and test splits is described. The same vulnerability samples appear to be used for both strategy development and evaluation." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Table 2 provides per-vulnerability-category breakdown across 15 CWE types." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Table 2 shows avoid-pickle failed for both CoTDeceptor and CodeBreaker against DeepSeek-R1. Table 3 also shows avoid-pickle as a failure case. Section 5.6 discusses semantic drift and limitations." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The avoid-pickle vulnerability category consistently fails across experiments. Section 5.6 acknowledges computational overhead and occasional semantic drift." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "Abstract claims '14 out of 15 vulnerability categories bypassed' and 'only 2 bypassed by prior methods' are supported by Table 2 results." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "Causal claims about CoTDeceptor's obfuscation causing evasion are justified through controlled experiments comparing before/after obfuscation detection rates. The case study (Section 5.5) demonstrates the causal mechanism." 116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": false, 120 "justification": "The title claims applicability to 'CoT-Enhanced LLM Code Agents' generally, but evaluation covers only a few specific models (DeepSeek-R1, GPT-5, Qwen3 variants, Codex). The paper does not bound claims to tested models." 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": false, 125 "justification": "No discussion of alternative explanations for evasion success, such as whether the obfuscation simply makes code longer/more complex (confusing any analyzer, not just CoT), or whether the detector failures are due to general model limitations rather than CoT-specific weaknesses." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": false, 132 "justification": "Models are named as 'DeepSeek-R1', 'GPT-5', 'GPT-5.1', 'GPT-5.2', 'Gemini-3-Pro', 'Qwen3' without specific version identifiers, snapshot dates, or API versions." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "Appendix B provides concrete prompt templates for security analysis, strategy planning, and strategy reflection, with the actual text and placeholder variables clearly shown." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": false, 142 "justification": "No temperature, top-p, max tokens, or other LLM hyperparameters are reported for any of the models used." 143 }, 144 "scaffolding_described": { 145 "applies": true, 146 "answer": true, 147 "justification": "The multi-agent framework is described in detail in Section 4: generator, verifier (three phases), reflection module, and strategy tree exploration with Thompson Sampling." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": false, 152 "justification": "The malicious code samples are described as 'provided through collaboration with QiAnXin' covering diverse CWE categories (Section 5.1) but no details on how samples were selected, filtered, or preprocessed." 153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section 5.6 'Discussion and Limitations' discusses computational overhead, semantic drift, and dependence on detector hyperparameters." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": false, 164 "justification": "The limitations in Section 5.6 are somewhat generic ('computational overhead', 'occasional semantic drift'). No specific threats like dataset representativeness or potential overfitting to particular detector versions are discussed." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": false, 169 "justification": "The paper does not explicitly state what the results do NOT show — e.g., no acknowledgment that results may not generalize beyond the tested CWE categories, languages, or specific model versions." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": false, 176 "justification": "The malicious code dataset from QiAnXin is not publicly released. No raw experimental logs or intermediate results are provided." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": false, 181 "justification": "Section 5.1 says 'real-world vulnerable code provided through collaboration with QiAnXin, covering diverse CWE categories' but provides no details on how samples were selected, how many were initially available, or inclusion criteria." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants in this study." 187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": false, 191 "justification": "No documentation of how the 500 vulnerable samples were derived from the QiAnXin collaboration, or how CWE categories were selected." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": true, 198 "justification": "Acknowledgments section lists National Natural Science Foundation of China (Grant No. 62402057) and State Key Laboratory of Cyberspace Security Defense (Grant No. 2025-C08)." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are listed including QiAnXin Technology Group (author Xiaochuan He), which provided the malicious code dataset." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": true, 208 "justification": "Funders are the National Natural Science Foundation of China and a state key lab — neither has a financial interest in the specific attack framework succeeding." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests statement is present. One author is from QiAnXin (a security company), which could have commercial interest in the findings, but this is not acknowledged." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": false, 219 "answer": false, 220 "justification": "The paper tests an attack framework's ability to evade detectors, not a model's benchmark performance. Contamination is not relevant to the evasion claims." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": false, 224 "answer": false, 225 "justification": "Not a benchmark evaluation of model knowledge; this tests adversarial evasion capabilities." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": false, 229 "answer": false, 230 "justification": "Not a benchmark evaluation of model knowledge." 231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": false, 274 "justification": "The framework involves multi-round rollouts with multiple LLMs (1-46 cycles per vulnerability) but no API costs, token counts, or wall-clock times are reported." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total computational budget, GPU hours, or API spend is reported despite the approach requiring extensive multi-round LLM interaction." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "CoTDeceptor bypasses 14 out of 15 vulnerability categories against CoT-enhanced LLM detectors, compared to only 2 by CodeBreaker.", 286 "evidence": "Table 2 shows CoTDeceptor passing DeepSeek-R1 for 14/15 categories (all except avoid-pickle) while CodeBreaker passes only 2 (avoid-bind-to-all-interfaces and flask-wtf-csrf-disabled).", 287 "supported": "strong" 288 }, 289 { 290 "claim": "A weaker strategy model (deepseek-r1-distill-qwen-32b) can successfully evolve obfuscation strategies that mislead a stronger detector (deepseek-r1-distill-llama-70b).", 291 "evidence": "Table 3 shows the small model succeeding on 14/15 categories, though with higher average cycle counts in some cases (Section 5.3).", 292 "supported": "moderate" 293 }, 294 { 295 "claim": "Fine-tuning Qwen3-4B on CoTDeceptor-generated obfuscation data improves detection F1 from 0.5333 to 0.6667.", 296 "evidence": "Table 4 shows Qwen3-4B base at 0.5333 F1 vs Qwen3-4B-SFT at 0.6667 F1.", 297 "supported": "weak" 298 }, 299 { 300 "claim": "CoTDeceptor can bypass real-world code agents (Codex, Qwen Code) in an end-to-end case study.", 301 "evidence": "Section 5.5 demonstrates one case study with pyramid-csrf-check-disabled where both Codex and Qwen Code failed to detect the vulnerability after obfuscation (Figure 3).", 302 "supported": "weak" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval", "case-study"], 306 "key_findings": "CoTDeceptor is a multi-agent reinforcement-learning framework that generates adversarial code obfuscations to evade CoT-enhanced LLM vulnerability detectors. It bypasses 14 of 15 vulnerability categories against DeepSeek-R1 and GPT-5 detectors, compared to 2 for the best prior method (CodeBreaker). A weaker strategy model can evolve obfuscations that fool stronger detectors, demonstrating capability amplification. Fine-tuning small models on CoTDeceptor-generated data improves their detection robustness.", 307 "red_flags": [ 308 { 309 "flag": "No ablation study", 310 "detail": "The framework has multiple components (reflection module, strategy tree with Thompson Sampling, MoE voting, three-phase verification) but no ablation study isolates which components are responsible for the improvements." 311 }, 312 { 313 "flag": "Single case study for real-world agents", 314 "detail": "The end-to-end agent evaluation (Section 5.5) uses only one vulnerability (pyramid-csrf-check-disabled) against two agents. This is too narrow to support claims about real-world implications." 315 }, 316 { 317 "flag": "No cost reporting despite expensive approach", 318 "detail": "CoTDeceptor requires 1-46 multi-round rollouts per vulnerability with multiple LLM calls per round, but no costs or compute budget are reported." 319 }, 320 { 321 "flag": "Proprietary dataset", 322 "detail": "The malicious code dataset from QiAnXin is not publicly available, making independent verification impossible." 323 }, 324 { 325 "flag": "Potential conflict of interest not acknowledged", 326 "detail": "One author is from QiAnXin, which provided the dataset and is a security company that could benefit from demonstrating weaknesses in LLM-based detection. This conflict is not acknowledged." 327 }, 328 { 329 "flag": "Fine-tuning claim based on very small sample", 330 "detail": "Table 4 F1 improvements appear based on a very small evaluation set (9 samples based on recall denominators), making the fine-tuning claim unreliable." 331 } 332 ], 333 "cited_papers": [ 334 { 335 "title": "TrojanPuzzle: Covertly Poisoning Code-Suggestion Models", 336 "authors": ["Hojjat Aghakhani", "Wei Dai", "Andre Manoel"], 337 "year": 2024, 338 "relevance": "Adversarial attack on code completion models via template-based token masking, directly relevant baseline." 339 }, 340 { 341 "title": "An LLM-Assisted Easy-to-Trigger Backdoor Attack on Code Completion Models: Injecting Disguised Vulnerabilities Against Strong Detection", 342 "authors": ["Shenao Yan", "Shen Wang", "Yue Duan"], 343 "year": 2024, 344 "relevance": "CodeBreaker baseline that uses LLMs for malicious payload transformation against vulnerability detectors." 345 }, 346 { 347 "title": "Make a Feint to the East While Attacking in the West: Blinding LLM-based Code Auditors with Flashboom Attacks", 348 "authors": ["Xiao Li", "Yue Li", "Hao Wu"], 349 "year": 2025, 350 "relevance": "Attention manipulation attack against LLM-based code auditors, key baseline in adversarial code security." 351 }, 352 { 353 "title": "Iterative Generation of Adversarial Example for Deep Code Models", 354 "authors": ["Li Huang", "Weifeng Sun", "Meng Yan"], 355 "year": 2025, 356 "relevance": "ITGen baseline using iterative feedback-driven identifier replacement for adversarial code generation." 357 }, 358 { 359 "title": "Security Weaknesses of Copilot-Generated Code in GitHub Projects: An Empirical Study", 360 "authors": ["Yujia Fu", "Peng Liang", "Amjed Tahir"], 361 "year": 2023, 362 "relevance": "Empirical study of security vulnerabilities in AI-generated code, motivating the supply chain threat model." 363 }, 364 { 365 "title": "Vulnerability Detection with Code Language Models: How Far Are We?", 366 "authors": ["Yangruibo Ding", "Yanjun Fu", "Omniyyah Ibrahim"], 367 "year": 2024, 368 "relevance": "PrimeVul dataset challenging LLM effectiveness for vulnerability detection." 369 }, 370 { 371 "title": "Benchmarking LLMs and LLM-based Agents in Practical Vulnerability Detection for Code Repositories", 372 "authors": ["Alperen Yildiz", "Sin G Teo", "Yiling Lou"], 373 "year": 2025, 374 "relevance": "JITVUL benchmark evaluating LLM agents for repository-level vulnerability detection." 375 }, 376 { 377 "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?)", 378 "authors": ["Saad Ullah", "Mingji Han", "Saurabh Pujar"], 379 "year": 2024, 380 "relevance": "Comprehensive evaluation of LLM limitations in security vulnerability detection." 381 }, 382 { 383 "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning", 384 "authors": ["DeepSeek-AI"], 385 "year": 2025, 386 "arxiv_id": "2501.12948", 387 "relevance": "Primary strategy generator model used in CoTDeceptor experiments." 388 }, 389 { 390 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 391 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 392 "year": 2023, 393 "relevance": "Foundational reasoning+acting framework that CoTDeceptor's iterative approach builds upon." 394 } 395 ] 396 }