ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan.json (18999B)


      1 {
      2   "paper": {
      3     "title": "MCP Safety Audit: LLMs with the Model Context Protocol Allow Major Security Exploits",
      4     "authors": ["Brandon Radosevich", "John T. Halloran"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2504.03767"
      8   },
      9   "scan_version": 2,
     10   "active_modules": [],
     11   "methodology_tags": ["case-study", "qualitative"],
     12   "key_findings": "The paper demonstrates that Claude 3.7 and Llama-3.3-70B-Instruct connected to standard MCP servers can be coerced into executing malicious code, granting remote access control, and stealing credentials. A novel Retrieval-Agent Deception (RADE) attack is introduced where corrupted data in a vector database triggers multi-MCP-server exploits without direct attacker access. The authors release McpSafetyScanner, a multi-agent tool that proactively scans MCP server configurations for vulnerabilities and generates remediation reports.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "McpSafetyScanner is released at https://github.com/johnhalloran321/mcpSafetyScanner, stated in the abstract and confirmed as 'freely available.'"
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "Attack prompts are shown in figures/screenshots but no structured dataset of attack prompts, MCP configurations, or results is released as a downloadable artifact."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "Section 7 specifies: Claude Desktop v0.8.1, macOS Sequoia v15.3.2, Agno v1.2.6, gpt-4o-2024-08-06, mcp v1.1.2, huggingface-hub v0.29.3, GNU netcat v0.7.1. Sufficient to reconstruct the setup."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The MCP config file is given (Appendix A) but there are no scripts or README instructions for replicating the specific attacks."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No quantitative success rates are measured. The paper presents individual attack demonstrations (screenshots) with no aggregate statistics or uncertainty measures."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical tests are performed. Claims about guardrail reliability ('refuses some of the time') are not quantified with success/failure rates across trials."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "No effect sizes or quantified success rates. The paper shows individual examples of successful and refused attacks but does not measure rates across systematic trials."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The number of attack prompts tested is not stated systematically. Only individual examples are shown in figures with no justification for why these specific prompts were chosen or how many were tested total."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance or spread measures reported. The paper acknowledges guardrail refusals vary but does not quantify this variation across runs or prompt variants."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No comparison to existing security auditing tools or alternative MCP security approaches. McpSafetyScanner is presented without comparison to any baseline."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No baselines of any kind are included."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "McpSafetyScanner has three stages (hacker agent, auditor agent, supervisor agent) but no ablation study examines the contribution of each stage."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "No quantitative metrics are used. Results are presented as qualitative demonstrations (screenshots of successful/failed attacks and scanner reports)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation of attack severity, scanner report quality, or remediation usefulness. The scanner reports are shown but not evaluated by independent security experts."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "No benchmark dataset exists for this work; it is a qualitative security demonstration, not a benchmark evaluation."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 1 breaks down findings by attack type (MCE, RAC, CT). Results are shown separately for Claude and Llama-3.3-70B-Instruct, and for direct prompt attacks vs. RADE attacks."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper extensively discusses cases where attacks are refused by LLM guardrails (Figures 1a, 7, 8a, 18, 19), analyzing what triggers refusals vs. what bypasses them."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Guardrail refusals are reported as negative results for the attacker. The paper shows that some attack variants trigger guardrails and are refused (Section 3, Appendix D)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims LLMs can be coerced for MCE, RAC, and CT attacks — all demonstrated with screenshots. McpSafetyScanner is described and shown to detect these vulnerabilities (Table 1, Figures 20-21)."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The claims are demonstrated constructively: the paper shows specific prompts that cause specific attack outcomes. This proof-by-demonstration is appropriate for a security audit showing that attacks are possible."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'LLMs with the Model Context Protocol Allow Major Security Exploits' but the paper only tests Claude 3.7 and Llama-3.3-70B-Instruct. The claims are not explicitly bounded to these two models. The paper suggests, without supporting evidence, that 'likely other LLMs' require re-evaluation."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No alternative explanations are considered. For example, the paper does not discuss whether the observed vulnerabilities stem from MCP design specifically vs. general tool-use capabilities, or whether user-confirmation dialogs (which Claude Desktop has) would mitigate the attacks in practice."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper demonstrates actual exploits (code execution, SSH key insertion, credential exfiltration) rather than proxy measures. The claims match the granularity of what was demonstrated."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Section 7 specifies Claude Desktop v0.8.1, gpt-4o-2024-08-06 (for McpSafetyScanner), Llama-3.3-70B-Instruct with specific library versions. Claude model version behind Desktop is not specified beyond 'Claude 3.7' in text."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Attack prompts are shown in full in figures (Figures 1, 2, 6-19). The RADE attack files are also shown (Figures 4a, 10a, 12a). Sufficient to reconstruct the attacks."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No temperature, top-p, or sampling settings are reported for any model. The McpSafetyScanner agents use gpt-4o but no inference parameters are stated."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "McpSafetyScanner's three-stage architecture is described in detail (Figure 5, Section 5): hacker agent probes MCP features, auditor agent searches knowledge bases, supervisor generates report. Agent roles and tools are specified."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No description of how attack prompts were designed, selected, or iterated upon. The methodology for constructing the RADE attack files is not documented."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "No dedicated limitations section. Section 6 ('Discussion, Conclusions, and Future Work') mentions future plans but does not substantively discuss limitations of the current work."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity discussed. The paper does not address whether results might differ with different prompt phrasings, model updates, MCP server versions, or user-confirmation settings."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No explicit scope boundaries. The paper does not state what it did NOT test (e.g., other models, other MCP servers, defense mechanisms like user approval dialogs)."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data released. Neither the full set of prompts tested, nor success/failure logs, nor complete conversation transcripts are available — only selected screenshots in figures."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "No systematic methodology for how attack prompts were designed or selected. The paper presents specific examples without describing the process of developing them."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants and no sampling from a larger population. Attack prompts were crafted by the authors."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "No documentation of how many attacks were attempted, which were selected for inclusion, or any filtering/selection process for the examples shown."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Section 8: 'We thank Leidos for funding this research through the Office of Technology. Approved for public release 25-LEIDOS-0318-29149.'"
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Both authors list Leidos as their affiliation on the title page."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Leidos is a defense/IT services contractor with no apparent financial stake in MCP being insecure or secure. The funder appears independent of the outcome."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is included in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": false,
    228         "answer": false,
    229         "justification": "This paper tests security vulnerabilities in MCP-enabled systems, not model knowledge on benchmarks. Contamination is structurally inapplicable."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": false,
    233         "answer": false,
    234         "justification": "Not a benchmark evaluation of model capability. Train/test overlap is inapplicable."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": false,
    238         "answer": false,
    239         "justification": "Not a benchmark evaluation. Contamination is structurally inapplicable."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Section 5 states McpSafetyScanner has a 'runtime of less than one minute to scan and generate each report on an M2 Max MacBook Pro.' Wall-clock time is reported."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Hardware (M2 Max MacBook Pro) is mentioned but no total compute budget, API costs, or token consumption is reported for either the attacks or the scanner."
    289       }
    290     }
    291   },
    292   "claims": [
    293     {
    294       "claim": "Industry-leading LLMs (Claude 3.7, Llama-3.3-70B) connected to standard MCP servers can be coerced to execute malicious code, grant remote access control, and steal credentials.",
    295       "evidence": "Demonstrated via screenshots in Figures 1b, 2, 6, 8b, 9 (direct prompt attacks) and Figures 4, 10-14 (RADE attacks). Section 3 describes MCE, RAC, CT attacks; Section 4 describes RADE.",
    296       "supported": "moderate"
    297     },
    298     {
    299       "claim": "LLM guardrails are unreliable for preventing MCP-enabled attacks — refusal varies based on prompt phrasing rather than attack severity.",
    300       "evidence": "Figures 1a vs 1b show the same MCE attack refused when octal-encoded but executed in plaintext. Llama-3.3-70B-Instruct only refuses when explicit words like 'hack' or 'steal' are used (Section 3, Figures 18-19).",
    301       "supported": "moderate"
    302     },
    303     {
    304       "claim": "The RADE attack enables credential theft and remote access without direct attacker access to the victim's system.",
    305       "evidence": "Demonstrated end-to-end for CT (Figure 4) and RAC (Figures 10-11) using Claude Desktop with Chroma + filesystem/everything MCP servers.",
    306       "supported": "moderate"
    307     },
    308     {
    309       "claim": "McpSafetyScanner correctly identifies the vulnerabilities used in the demonstrated attacks and provides actionable remediations.",
    310       "evidence": "Table 1 summarizes scanner findings matching the three attack types. Figures 20-21 show scanner reports. However, the scanner is only evaluated on the same servers used in the attacks.",
    311       "supported": "weak"
    312     }
    313   ],
    314   "red_flags": [
    315     {
    316       "flag": "No systematic evaluation",
    317       "detail": "All results are individual demonstrations (screenshots). No success rates are measured across multiple trials, prompt variants, or model configurations. The paper shows selected examples without quantifying how often attacks succeed or fail."
    318     },
    319     {
    320       "flag": "Circular evaluation of McpSafetyScanner",
    321       "detail": "McpSafetyScanner is evaluated only on the same MCP servers and attack types the authors designed. There is no evaluation on unseen servers, novel attack types, or comparison to other security tools. The scanner 'finding' the vulnerabilities the authors already know about does not demonstrate general detection capability."
    322     },
    323     {
    324       "flag": "Overly broad claims from narrow testing",
    325       "detail": "Only two LLMs tested (Claude 3.7, Llama-3.3-70B-Instruct) and four MCP servers, but claims are framed as general MCP security issues. The title suggests broad applicability ('LLMs with the Model Context Protocol') beyond what was tested."
    326     },
    327     {
    328       "flag": "User confirmation dialogs not discussed",
    329       "detail": "Claude Desktop requires user confirmation before tool execution in many cases. The paper does not discuss whether the demonstrated attacks require the user to approve tool calls, which would significantly reduce the real-world threat level."
    330     },
    331     {
    332       "flag": "No limitations section",
    333       "detail": "The paper has no dedicated limitations section and does not discuss threats to validity, scope boundaries, or potential weaknesses in the study methodology."
    334     }
    335   ],
    336   "cited_papers": [
    337     {
    338       "title": "The Llama 3 Herd of Models",
    339       "authors": ["Aaron Grattafiori", "Abhimanyu Dubey"],
    340       "year": 2024,
    341       "arxiv_id": "2407.21783",
    342       "relevance": "Documents safety alignment and cybersecurity evaluation methodology for Llama 3, directly relevant to understanding the guardrails tested in this paper."
    343     },
    344     {
    345       "title": "Llama Guard: LLM-based Input-Output Safeguard for Human-AI Conversations",
    346       "authors": ["Hakan Inan"],
    347       "year": 2023,
    348       "arxiv_id": "2312.06674",
    349       "relevance": "Describes input/output safety guardrails for LLMs, relevant to understanding why MCP attacks can bypass safety mechanisms."
    350     },
    351     {
    352       "title": "Introducing the Model Context Protocol",
    353       "authors": ["Anthropic"],
    354       "year": 2025,
    355       "relevance": "The protocol whose security properties are audited in this paper; foundational to the MCP ecosystem being studied."
    356     }
    357   ]
    358 }

Impressum · Datenschutz