scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (23901B)
      1 {
      2   "paper": {
      3     "title": "Hiding in the AI Traffic: Abusing MCP for LLM-Powered Agentic Red Teaming",
      4     "authors": [
      5       "Strahinja Janjusevic",
      6       "Anna Barón Garcia",
      7       "Sohrob Kazerounian"
      8     ],
      9     "year": 2025,
     10     "venue": "arXiv.org",
     11     "arxiv_id": "2511.15998",
     12     "doi": "10.48550/arXiv.2511.15998"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["survey_methodology"],
     16   "methodology_tags": ["case-study", "meta-analysis"],
     17   "key_findings": "The paper introduces an MCP-based C2 architecture that replaces traditional periodic beaconing with event-driven, LLM-powered agent communication, claiming drastically reduced detection footprint. A case study in the Vectra AI Cyber-Range demonstrated domain compromise in under 30 minutes with zero EDR/NDR detections, compared to days for manual C2 operations. The system leverages decoupled two-leg communication (MCP server + public LLM API) to blend malicious traffic with legitimate AI service traffic. A comprehensive review of 11 existing GenAI red teaming frameworks identifies a gap in the C2 phase of the Cyber Kill Chain.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "The paper explicitly states 'we have not released the full code publicly' (Section VII-A). No repository URL is provided."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No dataset, PCAP files, or experimental data are released. The experiments were conducted in a proprietary Vectra AI Cyber-Range."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper describes the target environment conceptually (three subnets, Windows/Linux hosts, EDR stack) but provides no dependency specifications, software versions, or reproducible environment setup."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. The code is not released, and the experiments were in a proprietary cyber-range."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Table III reports point comparisons (e.g., 'Days' vs '<30 Min') with no confidence intervals, error bars, or uncertainty measures."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims the MCP system is dramatically faster and stealthier than traditional C2 but provides no statistical tests. Comparisons in Table III are qualitative labels, not quantified measurements."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No quantified effect sizes are reported. Table III uses qualitative descriptors ('Days' vs '<30 Min', 'Detected' vs 'Undetected') rather than measured magnitudes."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The evaluation is based on a single case study deployment in one lab environment. The paper offers no justification for the sample size and does not acknowledge the limitation of N=1."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "No variance or repeated trials are reported. The case study appears to be a single run with no repeated measurements."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Table III compares the MCP-enabled C2 against a traditional (manual) C2 baseline using Cobalt Strike/Metasploit-style operations on the same environment."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The baseline represents current-generation C2 tools (Cobalt Strike, Metasploit) which are contemporary and widely used in red team operations."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No ablation study is performed. The system has multiple components (MCP server, LLM reasoning, multi-agent orchestration, hybrid planner) but their individual contributions are not isolated."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Table III compares on multiple metrics: time to objective, operator actions, and detection by NDR. The EDR assessment case study adds a qualitative dimension."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation of the system's outputs is reported. The blue team exercise mentioned in Section VII-B has incomplete results ('here we can put our results' placeholder text remains in the paper)."
     93       },
     94       "held_out_test_set": {
     95         "applies": false,
     96         "answer": false,
     97         "justification": "Not applicable — this is a case study of a system deployment, not a benchmark evaluation with train/test splits."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "Results are reported only as aggregate comparisons in Table III. No per-task, per-phase, or per-subnet breakdowns of the agent's performance are provided."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The EDR assessment case study (Section V-C-2) documents specific failures: the BYOVD attack was blocked by hardening, and process injection was blocked by AMSI. The paper also discusses hallucination issues and non-viable paths explored by the agent."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "The EDR case study shows the agent being blocked by AMSI and tamper protection. Section III discusses hallucinations and agents getting stuck in loops. The paper acknowledges 'exploring non-viable paths due to hallucinations.'"
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The abstract claims 'drastic reductions in manual effort and detection footprint' with 'experimental comparisons,' but Table III provides only qualitative labels rather than quantified measurements. The abstract claims 'rapid domain compromise' but timing details are inconsistent (abstract implies systematic comparison; actual evidence is a single case study)."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The paper makes causal claims ('our architecture not only improves goal-directed behavior... but also eliminates key host and network artifacts') based on a single uncontrolled case study. No confounds are addressed — improvements could be due to the LLM capability, the specific environment, or the comparison operator's skill level."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper generalizes from a single lab environment to broad claims about enterprise networks. The title implies general applicability ('Abusing MCP for LLM-Powered Agentic Red Teaming') but results are from one Vectra AI Cyber-Range deployment with a specific topology."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations for the results are discussed. The speed advantage could be due to the manual operator being slow rather than the system being fast. The stealth advantage could be due to the specific NDR/EDR configuration rather than inherent protocol properties."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": false,
    139         "justification": "The paper measures 'time to objective' and 'detection by NDR' in a single lab setting but frames results as demonstrating general 'stealth,' 'scalability,' and 'effectiveness' without acknowledging the gap between lab proxy measurements and real-world operational outcomes."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper states 'Anthropic's Claude Opus 4.1' but does not provide an API version, snapshot date, or model ID. No version info for any other models mentioned."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Full system prompts are provided in Appendices A-E, including the base system prompt, act prompt, reason prompt, planning prompt, and action decision prompt."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "No LLM hyperparameters (temperature, top-p, max tokens) are reported for the Claude API calls."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The agentic scaffolding is described in detail: three-component architecture (Recon Agents, MCP Coordination Server, Red Team Command Agent), hybrid planning system, SQLite memory, 44 built-in tools, universal installer, and the decoupled two-leg communication flow (Sections IV-C, V)."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "No data preprocessing documentation is provided. How network traffic was captured, filtered, or analyzed for the figures is not described."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "There is no dedicated limitations section. Section III discusses challenges of GenAI red teaming in general but not limitations of this specific study. Ethical considerations in Section VII are about dual-use risk, not methodological limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No specific threats to validity are discussed for this study. The paper does not acknowledge that its single-environment case study may not generalize, or that the manual baseline comparison may be unfair."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "No explicit scope boundaries are stated. The paper does not specify what the results do NOT show or what settings are excluded from claims."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No raw data (PCAPs, logs, agent outputs) are available for verification. Only traffic pattern figures and a qualitative comparison table are provided."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section V-A-1 describes the experimental setup: target environment topology (three subnets), defensive stack (Vectra AI NDR, Splunk SIEM, CrowdStrike/SentinelOne EDR), and the agent configuration with Claude Opus 4.1."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. The study evaluates a system in a lab environment."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The pipeline from raw experimental outputs to the reported results (Table III, traffic figures) is not documented. How metrics like 'time to objective' were measured is not specified."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Section X acknowledges 'Anthropic for covering API costs that made this study possible.'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly disclosed: MIT and Vectra AI. The paper uses Vectra AI's Cyber-Range for testing."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Anthropic funded the API costs and the system uses Anthropic's Claude model. Anthropic has a financial interest in demonstrating the capabilities of its models. Two of three authors are affiliated with Vectra AI, whose NDR platform is used and presented favorably."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests statement is present. Vectra AI employees evaluated a system deployed on Vectra AI's cyber-range, and no financial interest disclosure is made."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "The paper does not evaluate a pre-trained model's capability on a benchmark. It uses the LLM as a tool within a red teaming system."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not applicable — no benchmark evaluation of model knowledge."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": false,
    243         "answer": false,
    244         "justification": "Not applicable — no benchmark evaluation of model knowledge."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No inference costs are reported despite the system making extensive Claude API calls. Anthropic covered the costs (acknowledged) but no dollar amounts or token counts are provided."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "No computational budget is stated. The total API spend, number of tokens consumed, or hardware specifications for the MCP server are not reported."
    294       }
    295     },
    296     "survey_methodology": {
    297       "prisma_or_structured_protocol": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The literature review in Section II surveys 11 frameworks but follows no structured protocol (PRISMA, systematic search strategy, or reproducible query). Papers appear to be selected ad hoc."
    301       },
    302       "quality_assessment_of_sources": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Table I categorizes frameworks by approach and lists strengths/weaknesses, but applies no quality scoring rubric or risk-of-bias assessment. All papers are treated as equally credible."
    306       },
    307       "publication_bias_discussed": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No discussion of publication bias. The review does not consider whether the surveyed frameworks disproportionately report positive results."
    311       }
    312     }
    313   },
    314   "claims": [
    315     {
    316       "claim": "The MCP-enabled C2 achieves domain dominance in under 30 minutes compared to days for traditional C2.",
    317       "evidence": "Table III comparison and case study narrative in Sections V-VI. Specific timing and metrics are qualitative labels, not measured values with uncertainty.",
    318       "supported": "weak"
    319     },
    320     {
    321       "claim": "The MCP-based C2 was undetected by NDR while traditional C2 beaconing was detected.",
    322       "evidence": "Table III states 'Undetected' for MCP vs 'Detected (Periodic Beaconing)' for traditional. Traffic analysis figures (3-6) show behavioral differences. However, this is a single test on one NDR platform (Vectra AI, the authors' employer).",
    323       "supported": "weak"
    324     },
    325     {
    326       "claim": "The agent's EDR evasion test generated zero detections from Microsoft Defender while executing complex multi-phase operations.",
    327       "evidence": "Section V-C-2 and Figure 8 describe the EDR assessment. The agent was blocked by AMSI and hardening but generated no alerts. This is a single test on one endpoint.",
    328       "supported": "moderate"
    329     },
    330     {
    331       "claim": "The architecture eliminates key host and network artifacts used to detect C2 behavior.",
    332       "evidence": "Figures 3-6 compare traffic patterns. The event-driven MCP traffic lacks the periodic beaconing signature of traditional C2. However, no systematic analysis of all artifact types is performed.",
    333       "supported": "moderate"
    334     },
    335     {
    336       "claim": "The system requires only a single high-level directive compared to 200+ individual commands for manual C2.",
    337       "evidence": "Table III, Section VI-A. No methodology for counting commands is described. The comparison is between an AI system and a single manual operator whose skill level is not characterized.",
    338       "supported": "weak"
    339     }
    340   ],
    341   "red_flags": [
    342     {
    343       "flag": "Conflict of interest: company evaluating own platform",
    344       "detail": "Two of three authors work at Vectra AI. The system was tested on Vectra AI's Cyber-Range using Vectra AI's NDR platform. The paper presents Vectra AI's NDR as the defensive baseline. Anthropic funded API costs and its model is showcased. The affiliations and the API funding are disclosed, but neither is explicitly acknowledged as a potential conflict of interest."
    345     },
    346     {
    347       "flag": "N=1 case study presented as experimental comparison",
    348       "detail": "Table III labeled 'Comparative Benchmark' presents results from what appears to be a single run in a single environment, but uses language suggesting systematic experimental evaluation ('benchmark analysis,' 'experimental comparisons')."
    349     },
    350     {
    351       "flag": "Incomplete results placeholder left in paper",
    352       "detail": "Section VII-B contains the placeholder text '(here we can put our results)' for the blue team exercise, suggesting the paper was submitted with incomplete evaluation."
    353     },
    354     {
    355       "flag": "Unfair baseline comparison",
    356       "detail": "The manual C2 baseline is not well-characterized. The skill level of the manual operator, the specific tools used, and whether the manual test was given the same objectives under the same conditions are not described. The comparison may reflect operator competence rather than architectural superiority."
    357     },
    358     {
    359       "flag": "Overclaiming from qualitative evidence",
    360       "detail": "The abstract claims 'experimental comparisons with traditional C2 show drastic reductions in manual effort and detection footprint' but the actual comparison (Table III) uses qualitative labels ('Days' vs '<30 Min') rather than measured, quantified data with uncertainty."
    361     },
    362     {
    363       "flag": "No limitations section",
    364       "detail": "The paper contains no discussion of its own methodological limitations, despite making broad claims from a single lab deployment."
    365     }
    366   ],
    367   "cited_papers": [
    368     {
    369       "title": "A survey on offensive ai within cybersecurity",
    370       "authors": ["S. Girhepuje", "A. Verma", "G. Raina"],
    371       "year": 2024,
    372       "relevance": "Survey of AI applications in offensive cybersecurity, directly relevant to understanding the landscape of AI-powered security tools."
    373     },
    374     {
    375       "title": "Large language models for cyber security: A systematic literature review",
    376       "authors": ["H. Xu", "S. Wang", "N. Li"],
    377       "year": 2024,
    378       "relevance": "Systematic review of LLMs in cybersecurity, relevant as a meta-analysis of AI capabilities in the security domain."
    379     },
    380     {
    381       "title": "VulnBot: Autonomous Penetration Testing for A Multi-Agent Collaborative Framework",
    382       "authors": ["H. Kong", "D. Hu", "J. Ge"],
    383       "year": 2025,
    384       "relevance": "Multi-agent framework for autonomous penetration testing using LLMs, directly relevant to agentic AI evaluation."
    385     },
    386     {
    387       "title": "On the surprising efficacy of llms for penetration-testing",
    388       "authors": ["A. Happe", "J. Cito"],
    389       "year": 2025,
    390       "relevance": "Empirical evaluation of LLM capability in penetration testing tasks."
    391     },
    392     {
    393       "title": "On the ethics of using llms for offensive security",
    394       "authors": ["A. Happe", "J. Cito"],
    395       "year": 2025,
    396       "relevance": "Addresses ethical dimensions of LLM use in offensive security, relevant to AI safety and dual-use concerns."
    397     },
    398     {
    399       "title": "The malicious use of artificial intelligence: Forecasting, prevention, and mitigation",
    400       "authors": ["M. Brundage", "S. Avin", "J. Clark"],
    401       "year": 2018,
    402       "arxiv_id": "1802.07228",
    403       "relevance": "Foundational paper on dual-use AI risks, relevant to AI safety evaluation methodology."
    404     },
    405     {
    406       "title": "RedTeamLLM: an Agentic AI framework for offensive security",
    407       "authors": ["B. Challita", "P. Parrend"],
    408       "year": 2025,
    409       "relevance": "Agentic AI framework for red teaming that serves as the baseline for this paper's system."
    410     },
    411     {
    412       "title": "PentestAgent: Incorporating LLM Agents to Automated Penetration Testing",
    413       "authors": ["X. Shen"],
    414       "year": 2024,
    415       "relevance": "Multi-agent LLM platform for automated penetration testing, relevant to agentic AI capabilities."
    416     },
    417     {
    418       "title": "PentestGPT: An LLM-empowered Automated Penetration Testing Tool",
    419       "authors": ["G. Deng", "Z. Liu", "B. Li"],
    420       "year": 2023,
    421       "relevance": "Early LLM-powered penetration testing tool, foundational work in AI-assisted security testing."
    422     },
    423     {
    424       "title": "HackSynth: LLM Agent and Evaluation Framework for Autonomous Penetration Testing",
    425       "authors": ["M. Lajos", "I. David", "L. Andreas"],
    426       "year": 2024,
    427       "relevance": "Evaluation framework for autonomous LLM agents in penetration testing, relevant to benchmarking AI agent capabilities."
    428     },
    429     {
    430       "title": "LLMs can autonomously exploit one-day vulnerabilities",
    431       "authors": ["R. Fang", "R. Bindu", "A. Gupta", "D. Kang"],
    432       "year": 2024,
    433       "arxiv_id": "2404.08144",
    434       "relevance": "Demonstrates autonomous LLM exploitation capabilities, directly relevant to AI agent evaluation."
    435     },
    436     {
    437       "title": "Why do multi-agent llm systems fail?",
    438       "authors": ["M. Cemri"],
    439       "year": 2025,
    440       "relevance": "Analysis of failure modes in multi-agent LLM systems, relevant to understanding limitations of agentic AI."
    441     }
    442   ]
    443 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs