scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19897B)
      1 {
      2   "paper": {
      3     "title": "CoTDeceptor: Adversarial Code Obfuscation Against CoT-Enhanced LLM Code Agents",
      4     "authors": ["Haoyang Li", "Mingjin Li", "Jinxin Zuo", "Siqi Li", "Xiao Li", "Hao Wu", "Yueming Lu", "Xiaochuan He"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2512.21250"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub link provided: https://github.com/hiki9712/CoT-Code-Obfuscation (footnote on page 1)."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The malicious code dataset was 'provided through collaboration with QiAnXin' (Section 5.1) but no public download link or dataset release is mentioned."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "No requirements.txt, Dockerfile, or environment setup section is provided. The paper does not specify library versions or dependencies."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README with commands, or scripts to replicate experiments are described in the paper."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Tables 2-4 report only point estimates (pass/fail, scores, F1) with no confidence intervals or error bars."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims CoTDeceptor outperforms CodeBreaker and other baselines but provides no statistical significance tests."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "Results are reported as pass/fail counts and raw F1 scores. No effect sizes (Cohen's d, odds ratios) or baseline-contextualized percentage improvements are provided."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "500 vulnerable samples are used (Section 5.2) and 15 vulnerability categories tested, but no justification for these sizes is given."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviations, variance, or spread measures are reported across experimental runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Table 2 compares CoTDeceptor against CodeBreaker across the same vulnerability categories and detectors."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Baselines include CodeBreaker (USENIX Security 2024), ITGen (ICSE 2025), Flashboom (IEEE S&P 2025), and TrojanPuzzle (IEEE S&P 2024) — all recent."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": false,
     73         "justification": "No ablation study removing individual components (reflection module, strategy tree, MoE voting) to measure their contribution."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper uses evasion pass rate, potential scores, average rollout cycles, and precision/recall/F1 for the fine-tuning experiment (Tables 2-4)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "No human evaluation of the obfuscated code quality, readability, or semantic preservation is included. All evaluation is automated."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "No explicit separation of dev and test splits is described. The same vulnerability samples appear to be used for both strategy development and evaluation."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Table 2 provides per-vulnerability-category breakdown across 15 CWE types."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Table 2 shows avoid-pickle failed for both CoTDeceptor and CodeBreaker against DeepSeek-R1. Table 3 also shows avoid-pickle as a failure case. Section 5.6 discusses semantic drift and limitations."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The avoid-pickle vulnerability category consistently fails across experiments. Section 5.6 acknowledges computational overhead and occasional semantic drift."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims '14 out of 15 vulnerability categories bypassed' and 'only 2 bypassed by prior methods' are supported by Table 2 results."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Causal claims about CoTDeceptor's obfuscation causing evasion are justified through controlled experiments comparing before/after obfuscation detection rates. The case study (Section 5.5) demonstrates the causal mechanism."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": false,
    120         "justification": "The title claims applicability to 'CoT-Enhanced LLM Code Agents' generally, but evaluation covers only a few specific models (DeepSeek-R1, GPT-5, Qwen3 variants, Codex). The paper does not bound claims to tested models."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for evasion success, such as whether the obfuscation simply makes code longer/more complex (confusing any analyzer, not just CoT), or whether the detector failures are due to general model limitations rather than CoT-specific weaknesses."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are named as 'DeepSeek-R1', 'GPT-5', 'GPT-5.1', 'GPT-5.2', 'Gemini-3-Pro', 'Qwen3' without specific version identifiers, snapshot dates, or API versions."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Appendix B provides concrete prompt templates for security analysis, strategy planning, and strategy reflection, with the actual text and placeholder variables clearly shown."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No temperature, top-p, max tokens, or other LLM hyperparameters are reported for any of the models used."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "The multi-agent framework is described in detail in Section 4: generator, verifier (three phases), reflection module, and strategy tree exploration with Thompson Sampling."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": false,
    152         "justification": "The malicious code samples are described as 'provided through collaboration with QiAnXin' covering diverse CWE categories (Section 5.1), but no details are given on how samples were selected, filtered, or preprocessed."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "Section 5.6 'Discussion and Limitations' discusses computational overhead, semantic drift, and dependence on detector hyperparameters."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "The limitations in Section 5.6 are somewhat generic ('computational overhead', 'occasional semantic drift'). No specific threats like dataset representativeness or potential overfitting to particular detector versions are discussed."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "The paper does not explicitly state what the results do NOT show — e.g., no acknowledgment that results may not generalize beyond the tested CWE categories, languages, or specific model versions."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "The malicious code dataset from QiAnXin is not publicly released. No raw experimental logs or intermediate results are provided."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": false,
    181         "justification": "Section 5.1 says 'real-world vulnerable code provided through collaboration with QiAnXin, covering diverse CWE categories' but provides no details on how samples were selected, how many were initially available, or inclusion criteria."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants in this study."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": false,
    191         "justification": "No documentation of how the 500 vulnerable samples were derived from the QiAnXin collaboration, or how CWE categories were selected."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": true,
    198         "justification": "Acknowledgments section lists National Natural Science Foundation of China (Grant No. 62402057) and State Key Laboratory of Cyberspace Security Defense (Grant No. 2025-C08)."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "Author affiliations are listed including QiAnXin Technology Group (author Xiaochuan He), which provided the malicious code dataset."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Funders are the National Natural Science Foundation of China and a state key lab — neither has a financial interest in the specific attack framework succeeding."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement is present. One author is from QiAnXin (a security company), which could have commercial interest in the findings, but this is not acknowledged."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper tests an attack framework's ability to evade detectors, not a model's benchmark performance. Contamination is not relevant to the evasion claims."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "Not a benchmark evaluation of model knowledge; this tests adversarial evasion capabilities."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "Not a benchmark evaluation of model knowledge."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": false,
    274         "justification": "The framework involves multi-round rollouts with multiple LLMs (1-46 cycles per vulnerability) but no API costs, token counts, or wall-clock times are reported."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No total computational budget, GPU hours, or API spend is reported despite the approach requiring extensive multi-round LLM interaction."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "CoTDeceptor bypasses 14 out of 15 vulnerability categories against CoT-enhanced LLM detectors, compared to only 2 by CodeBreaker.",
    286       "evidence": "Table 2 shows CoTDeceptor passing DeepSeek-R1 for 14/15 categories (all except avoid-pickle) while CodeBreaker passes only 2 (avoid-bind-to-all-interfaces and flask-wtf-csrf-disabled).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "A weaker strategy model (deepseek-r1-distill-qwen-32b) can successfully evolve obfuscation strategies that mislead a stronger detector (deepseek-r1-distill-llama-70b).",
    291       "evidence": "Table 3 shows the small model succeeding on 14/15 categories, though with higher average cycle counts in some cases (Section 5.3).",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Fine-tuning Qwen3-4B on CoTDeceptor-generated obfuscation data improves detection F1 from 0.5333 to 0.6667.",
    296       "evidence": "Table 4 shows Qwen3-4B base at 0.5333 F1 vs Qwen3-4B-SFT at 0.6667 F1.",
    297       "supported": "weak"
    298     },
    299     {
    300       "claim": "CoTDeceptor can bypass real-world code agents (Codex, Qwen Code) in an end-to-end case study.",
    301       "evidence": "Section 5.5 demonstrates one case study with pyramid-csrf-check-disabled where both Codex and Qwen Code failed to detect the vulnerability after obfuscation (Figure 3).",
    302       "supported": "weak"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval", "case-study"],
    306   "key_findings": "CoTDeceptor is a multi-agent reinforcement-learning framework that generates adversarial code obfuscations to evade CoT-enhanced LLM vulnerability detectors. It bypasses 14 of 15 vulnerability categories against DeepSeek-R1 and GPT-5 detectors, compared to 2 for the best prior method (CodeBreaker). A weaker strategy model can evolve obfuscations that fool stronger detectors, demonstrating capability amplification. Fine-tuning a small model (Qwen3-4B) on CoTDeceptor-generated data raises its detection F1 from 0.5333 to 0.6667, though this result appears to rest on a very small evaluation set.",
    307   "red_flags": [
    308     {
    309       "flag": "No ablation study",
    310       "detail": "The framework has multiple components (reflection module, strategy tree with Thompson Sampling, MoE voting, three-phase verification) but no ablation study isolates which components are responsible for the improvements."
    311     },
    312     {
    313       "flag": "Single case study for real-world agents",
    314       "detail": "The end-to-end agent evaluation (Section 5.5) uses only one vulnerability (pyramid-csrf-check-disabled) against two agents. This is too narrow to support claims about real-world implications."
    315     },
    316     {
    317       "flag": "No cost reporting despite expensive approach",
    318       "detail": "CoTDeceptor requires 1-46 multi-round rollouts per vulnerability with multiple LLM calls per round, but no costs or compute budget are reported."
    319     },
    320     {
    321       "flag": "Proprietary dataset",
    322       "detail": "The malicious code dataset from QiAnXin is not publicly available, making independent verification impossible."
    323     },
    324     {
    325       "flag": "Potential conflict of interest not acknowledged",
    326       "detail": "One author is from QiAnXin, which provided the dataset and is a security company that could benefit from demonstrating weaknesses in LLM-based detection. This conflict is not acknowledged."
    327     },
    328     {
    329       "flag": "Fine-tuning claim based on very small sample",
    330       "detail": "The F1 improvements in Table 4 appear to rest on a very small evaluation set (9 samples, inferred from the recall denominators), making the fine-tuning claim unreliable."
    331     }
    332   ],
    333   "cited_papers": [
    334     {
    335       "title": "TrojanPuzzle: Covertly Poisoning Code-Suggestion Models",
    336       "authors": ["Hojjat Aghakhani", "Wei Dai", "Andre Manoel"],
    337       "year": 2024,
    338       "relevance": "Adversarial attack on code completion models via template-based token masking, directly relevant baseline."
    339     },
    340     {
    341       "title": "An LLM-Assisted Easy-to-Trigger Backdoor Attack on Code Completion Models: Injecting Disguised Vulnerabilities Against Strong Detection",
    342       "authors": ["Shenao Yan", "Shen Wang", "Yue Duan"],
    343       "year": 2024,
    344       "relevance": "CodeBreaker baseline that uses LLMs for malicious payload transformation against vulnerability detectors."
    345     },
    346     {
    347       "title": "Make a Feint to the East While Attacking in the West: Blinding LLM-based Code Auditors with Flashboom Attacks",
    348       "authors": ["Xiao Li", "Yue Li", "Hao Wu"],
    349       "year": 2025,
    350       "relevance": "Attention manipulation attack against LLM-based code auditors, key baseline in adversarial code security."
    351     },
    352     {
    353       "title": "Iterative Generation of Adversarial Example for Deep Code Models",
    354       "authors": ["Li Huang", "Weifeng Sun", "Meng Yan"],
    355       "year": 2025,
    356       "relevance": "ITGen baseline using iterative feedback-driven identifier replacement for adversarial code generation."
    357     },
    358     {
    359       "title": "Security Weaknesses of Copilot-Generated Code in GitHub Projects: An Empirical Study",
    360       "authors": ["Yujia Fu", "Peng Liang", "Amjed Tahir"],
    361       "year": 2023,
    362       "relevance": "Empirical study of security vulnerabilities in AI-generated code, motivating the supply chain threat model."
    363     },
    364     {
    365       "title": "Vulnerability Detection with Code Language Models: How Far Are We?",
    366       "authors": ["Yangruibo Ding", "Yanjun Fu", "Omniyyah Ibrahim"],
    367       "year": 2024,
    368       "relevance": "PrimeVul dataset challenging LLM effectiveness for vulnerability detection."
    369     },
    370     {
    371       "title": "Benchmarking LLMs and LLM-based Agents in Practical Vulnerability Detection for Code Repositories",
    372       "authors": ["Alperen Yildiz", "Sin G Teo", "Yiling Lou"],
    373       "year": 2025,
    374       "relevance": "JITVUL benchmark evaluating LLM agents for repository-level vulnerability detection."
    375     },
    376     {
    377       "title": "LLMs Cannot Reliably Identify and Reason About Security Vulnerabilities (Yet?)",
    378       "authors": ["Saad Ullah", "Mingji Han", "Saurabh Pujar"],
    379       "year": 2024,
    380       "relevance": "Comprehensive evaluation of LLM limitations in security vulnerability detection."
    381     },
    382     {
    383       "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
    384       "authors": ["DeepSeek-AI"],
    385       "year": 2025,
    386       "arxiv_id": "2501.12948",
    387       "relevance": "Primary strategy generator model used in CoTDeceptor experiments."
    388     },
    389     {
    390       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    391       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    392       "year": 2023,
    393       "relevance": "Foundational reasoning+acting framework that CoTDeceptor's iterative approach builds upon."
    394     }
    395   ]
    396 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs