scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (24336B)
      1 {
      2   "paper": {
      3     "title": "MSB (MCP Security Bench): Benchmarking Attacks Against Model Context Protocol in LLM Agents",
      4     "authors": ["Dongsen Zhang", "Zekun Li", "Xu Luo", "Xuannan Liu", "Peipei Li", "Wenjun Xu"],
      5     "year": 2025,
      6     "venue": "arXiv",
      7     "arxiv_id": "2510.15994"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "MSB evaluates 9 LLM agents across 12 MCP-specific attack types with 2,000 attack instances, finding an overall average ASR of 40.71%. Out-of-scope parameter attacks are most effective (74.03% ASR). An inverse scaling law is observed: more capable models are more vulnerable due to stronger tool-use and instruction-following abilities. MCP-specific attacks (user impersonation, false error) are more aggressive than traditional function-calling attacks (prompt injection, retrieval injection).",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No repository URL, GitHub link, or code archive is provided in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No dataset download link or repository for the 2,000 attack instances is provided. The benchmark is described but not released."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No requirements.txt, Dockerfile, or environment setup details are provided beyond naming the LLM models used."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions, scripts, or README are provided."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Tables 3, 8, and 9 report only point estimates for ASR, PUA, and NRP with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper makes comparative claims about model vulnerability (e.g., 'more capable models are more susceptible') but uses no statistical tests—just raw ASR comparisons."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "ASR percentages are reported with baseline context (e.g., 'peak attack success rate of 75.83%', per-attack and per-model breakdowns in Table 3), providing magnitude of effects."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The benchmark includes 2,000 attack instances and 65 user tasks but no justification for why these numbers are sufficient or any power analysis."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or multi-run results are reported. It appears each attack instance was run once per model."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Nine LLM backbones are compared against each other across all attack types, and the paper compares MSB's scope against prior benchmarks (ASB, AgentDojo, InjecAgent, MCPTox) in Table 1 of related work."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Models evaluated include DeepSeek-V3.1, Claude 4 Sonnet, Gemini 2.5 Flash, GPT-4o-mini, Qwen3, and Llama 3.3—all recent models."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The paper evaluates individual attack types and mixed attacks separately, showing which attack vectors and stages contribute most to ASR (Fig. 3 shows per-stage and per-tool-configuration breakdowns)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Three metrics are used: ASR, PUA, and NRP (Sec. 5.2), capturing attack effectiveness, task performance under attack, and their trade-off."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "Evaluation is entirely automated—attack success is determined by examining workspace state and tool invocation logs. No human evaluation of attack quality or agent behavior."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "This is a benchmark evaluation of existing models, not a trained system. There is no train/test split to manage."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down by attack type (12 types), by model (9 models), by pipeline stage (Fig. 3a), and by tool configuration (Fig. 3b)."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "No qualitative failure analysis or examples of why specific attacks failed against specific models. Only aggregate ASR numbers are reported."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Several attacks show low effectiveness: NC-FE has only 16.25% average ASR, RI is 0% for most models, and PI is 0% for Llama3.3 70B. These are reported transparently."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims 'peak attack success rate of 75.83%' and 'stronger models are more vulnerable'—both are supported by Table 3 and Fig. 2 results."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": false,
    119         "justification": "The paper claims 'Models with stronger performance are more vulnerable to attacks due to their outstanding tool calling and instruction following capabilities' (abstract). This causal explanation is asserted without controlled experiments isolating tool-calling ability as the causal factor."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title and abstract frame MSB as evaluating 'MCP agents' generally, but the evaluation uses a specific system prompt (Fig. 4), specific attack task templates, and specific tool configurations. These scope boundaries are not explicitly stated as limitations."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The inverse scaling finding is attributed solely to tool-calling ability. Alternative explanations (e.g., different safety training, RLHF alignment differences, system prompt sensitivity) are not discussed."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "ASR measures whether the agent performs the attack task (e.g., writes a file, kills a process), which is framed as 'security vulnerability.' The paper does not discuss whether these controlled lab attacks reflect real-world MCP security risk, or whether the specific attack tasks chosen are representative proxies for actual security harm."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are listed as 'DeepSeek-V3.1', 'GPT-4o-mini', 'Claude 4 Sonnet', 'Gemini 2.5 Flash', 'Qwen3 8B', etc. No API version dates, snapshot identifiers, or specific version strings (e.g., gpt-4o-mini-2024-07-18) are provided."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "The system prompt template is provided in Fig. 4, and specific attack prompt templates are shown in Table 1, Tables 4-5, and detailed attack examples in Appendix B."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No temperature, top-p, max tokens, or other inference hyperparameters are reported for any of the 9 LLM evaluations."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "The MCP host-client-server workflow is described in Sec. 3.1 and Fig. 1. The agent scaffold uses system prompt with tool list injection, iterative tool invocation, and observation sequence—all described formally."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "Appendix C describes how benign tools were sourced from Smithery, how attack tools were constructed by modifying benign tools (C.1.2), how user tasks were designed (C.2), and how attack instances were combined (C.3)."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "There is no limitations, threats to validity, or discussion section. The paper ends with Sec. 7 Conclusion, which is a brief summary without discussing limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No threats to validity are discussed anywhere in the paper."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not state what the results do NOT show, what populations are excluded, or what claims are NOT being made."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw data (individual attack instance results, agent traces, tool invocation logs) is made available."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Appendix C describes how tools were sourced from Smithery, how attack tools were constructed, how tasks were designed, and how attack instances were combined. Table 2 summarizes statistics."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; data sources are LLM APIs and MCP tool configurations."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline from tool selection to attack construction to evaluation is documented in Sec. 5 and Appendix C, including how attack success is determined by examining workspace state."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: Beijing University of Posts and Telecommunications and University of California, Santa Barbara."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding source is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper evaluates 9 LLMs on their benchmark but does not state training data cutoff dates for any model."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the attack scenarios or tool descriptions could overlap with model training data."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The benchmark is new (so contamination risk is lower), but the paper does not discuss this or the fact that MCP documentation and similar attack patterns may be in training data."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "The evaluation runs 2,000 attack instances across 9 LLMs (18,000+ LLM calls), yet no cost, token count, or latency information is provided."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "No total compute budget, API costs, or hardware specifications are mentioned."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No mention of multiple seeds or runs. Results appear to be single-run per attack instance per model."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of runs per attack instance is not stated. It appears each instance was run once."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": false,
    304         "answer": false,
    305         "justification": "The paper evaluates existing models via API without hyperparameter tuning."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": false,
    309         "answer": false,
    310         "justification": "No configuration selection was performed; all models were evaluated with the same setup."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "No statistical tests are performed at all, so multiple comparison correction is not applied despite comparing 9 models across 12 attack types."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors designed both the benchmark and the attack tools, then evaluated models on their own benchmark. This self-evaluation bias is not acknowledged."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": false,
    324         "answer": false,
    325         "justification": "Compute differences between models are not a focus; the study evaluates security rather than performance-per-compute."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "The paper does not discuss whether its 6 attack tasks (e.g., file writing, process killing, data exfiltration) represent the full space of real-world MCP security threats, or whether ASR on these tasks generalizes to actual security risk."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "All models use the same system prompt (Fig. 4) but it is unclear whether the MCP client/scaffold is identical across models. Different models may have different native tool-calling mechanisms that interact differently with MCP, and this confound is not discussed."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "The benchmark is new but the paper does not discuss whether MCP documentation, attack patterns, or tool configurations could appear in model training data."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the evaluation setup leaks information (e.g., whether the system prompt or tool descriptions give hints about expected behavior)."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The 2,000 attack instances are combinations of 65 user tasks and 6 attack tasks, creating structural dependencies. This non-independence is not discussed."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Overall average attack success rate across all models and attack types is 40.71%.",
    364       "evidence": "Table 3 shows per-model, per-attack ASR values averaging to 40.71% (Sec. 6.2).",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Out-of-scope parameter attacks are the most effective, achieving 74.03% average ASR.",
    369       "evidence": "Table 3 shows OP column averaging 74.03% across 9 models.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "An inverse scaling law exists between LLM capability and security—more capable models are more vulnerable.",
    374       "evidence": "Fig. 2 and Table 3 show DeepSeek-V3.1 (highest PUA) also has highest ASR (60.94%). Smaller models like Llama3.1 8B have lower ASR (19.74%). But this is correlational, not causally demonstrated.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "MCP-specific attacks (UI, FE) are more aggressive than traditional function-calling attacks (PI, RI).",
    379       "evidence": "Table 3: UI average ASR 50.72% and FE 43.42% vs PI 17.03% and RI 18.89%.",
    380       "supported": "moderate"
    381     },
    382     {
    383       "claim": "Attacks remain effective even in multi-tool environments containing benign tools.",
    384       "evidence": "Fig. 3b shows substantial ASR for attacks in configurations with benign tools present.",
    385       "supported": "moderate"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "No limitations section",
    391       "detail": "The paper has no limitations, threats to validity, or discussion section. For a security benchmark paper proposing a new evaluation framework, this is a significant omission."
    392     },
    393     {
    394       "flag": "No variance or multi-run reporting",
    395       "detail": "LLM outputs are stochastic, yet results appear to be single-run. With no variance reporting, it's impossible to know if observed ASR differences are stable or artifacts of sampling."
    396     },
    397     {
    398       "flag": "No artifacts released",
    399       "detail": "Neither code nor benchmark data is released, making the benchmark unverifiable and non-reproducible despite being framed as a community resource."
    400     },
    401     {
    402       "flag": "Causal claims without causal evidence",
    403       "detail": "The 'inverse scaling law' claim attributes vulnerability to 'tool calling and instruction following capabilities' without controlled experiments isolating these factors from other model differences (safety training, RLHF, etc.)."
    404     },
    405     {
    406       "flag": "Missing hyperparameters",
    407       "detail": "Temperature and sampling settings are not reported for any of the 9 models, despite these significantly affecting tool-calling behavior and susceptibility to injection attacks."
    408     },
    409     {
    410       "flag": "Self-evaluation bias",
    411       "detail": "Authors designed the benchmark, attack tools, and evaluation criteria, then reported results without acknowledging or mitigating self-evaluation bias."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "Agent Security Bench (ASB): Formalizing and Benchmarking Attacks and Defenses in LLM-based Agents",
    417       "authors": ["Hanrong Zhang", "Jingyuan Huang", "Kai Mei"],
    418       "year": 2025,
    419       "relevance": "Prior agent security benchmark using function-calling paradigm, direct comparison point for MSB."
    420     },
    421     {
    422       "title": "AgentDojo: A Dynamic Environment to Evaluate Attacks and Defenses for LLM Agents",
    423       "authors": ["Edoardo Debenedetti", "Jie Zhang", "Mislav Balunovic"],
    424       "year": 2025,
    425       "relevance": "Dynamic benchmark for LLM agent attacks/defenses, cited as limited to function-calling paradigm."
    426     },
    427     {
    428       "title": "InjecAgent: Benchmarking Indirect Prompt Injections in Tool-Integrated Large Language Model Agents",
    429       "authors": ["Qiusi Zhan", "Zhixiang Liang", "Zifan Ying", "Daniel Kang"],
    430       "year": 2024,
    431       "relevance": "Benchmark for indirect prompt injection in tool-using agents."
    432     },
    433     {
    434       "title": "EnIGMA: Interactive tools substantially assist LM agents in finding security vulnerabilities",
    435       "authors": ["Talor Abramovich", "Meet Udeshi", "Minghao Shao"],
    436       "year": 2025,
    437       "relevance": "Evaluates LLM agents for security vulnerability discovery with interactive tools."
    438     },
    439     {
    440       "title": "TrustAgent: Towards Safe and Trustworthy LLM-based Agents through Agent Constitution",
    441       "authors": ["Wenyue Hua", "Xianjun Yang", "Zelong Li"],
    442       "year": 2024,
    443       "relevance": "Safety framework for LLM agents using constitutional principles."
    444     },
    445     {
    446       "title": "ToolLLM: Facilitating Large Language Models to Master 16000+ Real-world APIs",
    447       "authors": ["Yujia Qin", "Shi Liang", "Yining Ye"],
    448       "year": 2024,
    449       "relevance": "Large-scale tool-use benchmark for LLMs, foundational to tool-calling evaluation."
    450     },
    451     {
    452       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    453       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessi"],
    454       "year": 2023,
    455       "relevance": "Foundational work on LLM tool use, relevant to understanding tool-calling capabilities."
    456     },
    457     {
    458       "title": "Inverse Scaling: When Bigger Isn't Better",
    459       "authors": ["Ian R. McKenzie", "Alexander Lyzhov", "Michael Martin Pieler"],
    460       "year": 2023,
    461       "relevance": "Establishes the inverse scaling phenomenon that MSB claims to observe in security context."
    462     },
    463     {
    464       "title": "MCP Safety Audit: LLMs with the Model Context Protocol Allow Major Security Exploits",
    465       "authors": ["Brandon Radosevich", "John Halloran"],
    466       "year": 2025,
    467       "arxiv_id": "2504.03767",
    468       "relevance": "Early MCP security audit demonstrating retrieval injection and other MCP-specific attacks."
    469     },
    470     {
    471       "title": "Systematic Analysis of MCP Security",
    472       "authors": ["Yongjian Guo", "Puzhuo Liu", "Wanlun Ma"],
    473       "year": 2025,
    474       "arxiv_id": "2508.12538",
    475       "relevance": "Systematic analysis of MCP security vulnerabilities including false error attacks."
    476     },
    477     {
    478       "title": "Beyond the Protocol: Unveiling Attack Vectors in the Model Context Protocol Ecosystem",
    479       "authors": ["Hao Song", "Yiming Shen", "Wenxuan Luo"],
    480       "year": 2025,
    481       "arxiv_id": "2506.02040",
    482       "relevance": "Identifies attack vectors in the MCP ecosystem."
    483     },
    484     {
    485       "title": "MPMA: Preference Manipulation Attack Against Model Context Protocol",
    486       "authors": ["Zihan Wang", "Hongwei Li", "Rui Zhang"],
    487       "year": 2025,
    488       "arxiv_id": "2505.11154",
    489       "relevance": "Preference manipulation attacks against MCP, one of the attack types evaluated in MSB."
    490     }
    491   ]
    492 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs