scan.json (25264B)
1 { 2 "paper": { 3 "title": "RAG-MCP: Mitigating Prompt Bloat in LLM Tool Selection via Retrieval-Augmented Generation", 4 "authors": ["Tiantian Gan", "Qiyao Sun"], 5 "year": 2025, 6 "venue": "arXiv.org", 7 "arxiv_id": "2505.03275", 8 "doi": "10.48550/arXiv.2505.03275" 9 }, 10 "scan_version": 3, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "RAG-MCP applies semantic retrieval to filter relevant MCP tool descriptions before presenting them to an LLM, improving tool selection accuracy from 13.62% (naive all-tools prompting) to 43.13% on MCPBench web search tasks while reducing prompt tokens by roughly 49%. A stress test varying the MCP pool from 1 to 11,100 shows performance degrades sharply beyond ~100 candidate tools, with reliable selection only in small pools.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No code repository, GitHub link, or archive is provided anywhere in the paper." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The evaluation uses the publicly available web search subset of MCPBench [8] and draws distractor MCPs from the public mcp.so registry [14]." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "No environment specifications, requirements files, or dependency lists are provided." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions, README, or scripts are provided." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Table 1 reports only point estimates (43.13%, 18.20%, 13.62%) with no confidence intervals or error bars." 
42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims RAG-MCP 'significantly outperforms' baselines but provides no statistical significance tests (p-values, t-tests, etc.). Comparisons rely solely on raw number differences." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Table 1 reports accuracy percentages for all three methods (43.13%, 18.20%, 13.62%) along with token counts, providing baseline context for the magnitude of improvement. The abstract notes the tripling (43.13% vs 13.62%)." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The paper uses 20 trials per baseline and 20 web search tasks but provides no justification for why these sample sizes were chosen or whether they provide adequate statistical power." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No variance, standard deviation, or spread measures are reported across the 20 trials. Only aggregate point estimates appear in Table 1." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Two baselines are included: 'Blank Conditioning' (all MCPs in prompt) and 'Actual Match' (keyword-filtered MCPs). Section 4.2." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": false, 73 "justification": "The baselines are ad-hoc strategies (naive prompting and keyword matching) rather than published contemporary tool-selection methods. No comparison against prior retrieval-based tool selection approaches (e.g., Gorilla's retrieval-augmented approach, which is discussed in the related work)." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": false, 78 "justification": "RAG-MCP has three components (retrieval, validation, invocation) but no ablation study removing individual components. 
The validation step's contribution is not isolated." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Table 1 reports accuracy, average prompt tokens, and average completion tokens." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "Evaluation is entirely automated using a 'Llama-based verifier' and DeepSeek-v3 as evaluator. No human evaluation of outputs." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper explicitly states the web search subset of MCPBench was used as 'our heldout testbed' (Section 4.2), an external benchmark not used for development." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": false, 98 "justification": "Only aggregate accuracy across all 20 web search tasks is reported. No per-task or per-category breakdown." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": false, 103 "justification": "The stress test analysis (Section 5) discusses general patterns of degradation but no specific failure examples are shown or analyzed qualitatively." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The stress test and analysis explicitly acknowledge performance degradation: 'retrieval precision and overall throughput degrade as the tool registry scales to thousands of MCPs' (Section 4.1). Figure 3 shows widespread failure at scale." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": false, 115 "justification": "The abstract claims token reduction 'by over 50%' but Table 1 shows 1084 vs 2133.84 = 49.2% reduction, technically under 50%. The abstract claims 'scalable and accurate tool integration' but 43.13% accuracy is not accurate in absolute terms, and the stress test shows poor scaling beyond ~100 MCPs." 
116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": false, 120 "justification": "The paper claims retrieval 'causes' improvement but the study design does not disentangle confounds: RAG-MCP uses an additional embedding model (Qwen) that baselines do not, and the effect of prompt reduction vs retrieval quality is not separated. No statistical tests confirm the observed differences are not due to chance." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title claims general 'LLM Tool Selection' and conclusions reference 'hundreds or thousands of tools,' but all experiments use only web search tasks with a single LLM (qwen-max-0125). No other task types, domains, or models are tested." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No alternative explanations are considered. Could the improvement come from the additional Qwen model rather than the retrieval approach? Could the keyword baseline be poorly implemented? Could web search tasks be unusually easy for semantic retrieval? None of these are discussed." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper measures tool selection accuracy and frames results in terms of tool selection accuracy. The measurements match the granularity of the claims — no proxy gap exists." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": false, 142 "justification": "The base LLM is specified as 'qwen-max-0125' (Section 4.2), but the retriever is only described as 'a lightweight LLM-based retriever (e.g., Qwen)' without a version, and the judge is only 'Llama-based verifier' with no model name or version." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": false, 147 "justification": "No actual prompts are provided. 
The paper describes what the system does in natural language but never shows the actual prompt text sent to any model." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No hyperparameters are reported — no temperature, top-p, top-k for retrieval, embedding dimensions, similarity thresholds, or other settings." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": false, 157 "justification": "The three-step pipeline (retrieval → validation → invocation) is described at a high level with Figure 2, but the validation step is vaguely described ('generate a few-shot example query and test its response') with no implementation detail. How the retriever index is built, what similarity metric is used, and how top-k is determined are not specified." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "How MCP descriptions were collected from mcp.so, preprocessed, and indexed into the vector store is not documented. The selection of the 20 web search tasks from MCPBench is not described." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations or threats-to-validity section exists. The conclusion briefly mentions future work on 'retrieval at extreme scale' but does not substantively discuss limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No specific threats to validity are discussed anywhere in the paper." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what the results do NOT show. It does not bound claims to web search tasks, the single LLM tested, or the specific MCP registry used." 
180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data, logs, or per-trial results are released. Only aggregate statistics in Table 1 and the heatmap in Figure 3 are provided." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Section 4.1 describes the stress test data collection: ground-truth MCPs and N-1 distractors from 4,400+ public MCP servers on mcp.so, with N varied from 1 to 11,100 in 26 intervals. Section 4.2 describes using the MCPBench web search subset with 20 trials." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data comes from public benchmark (MCPBench) and public MCP registry (mcp.so)." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from raw MCP descriptions to vector index to retrieval results is not documented. How MCPs were embedded, how the index was constructed, and how distractor selection worked in detail are not specified." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding information is mentioned. The acknowledgements section thanks the MCPBench authors but does not disclose any funding sources." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations are clearly stated: Beijing University of Post and Communications and Queen Mary University of London. No product being evaluated is affiliated with their institutions." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": false, 217 "answer": false, 218 "justification": "Appears to be unfunded student research (university affiliations with student email addresses, no funding mentioned)." 
219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper evaluates qwen-max-0125 on MCPBench tasks but does not state the model's training data cutoff date." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether MCPBench web search tasks or MCP descriptions appeared in Qwen's training data." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "MCPBench [8] was published in April 2025. The model qwen-max-0125 likely predates this, reducing contamination risk, but this temporal relationship is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 
278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Table 1 reports average prompt tokens and average completion tokens for each method (e.g., RAG-MCP: 1084 prompt, 78.14 completion tokens), providing a measure of per-query inference cost." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "No total computational budget is stated — no total API spend, wall-clock time for experiments, or hardware specifications." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Distractor MCPs are randomly selected but no seed sensitivity analysis is performed and no results across multiple seeds are reported." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "Section 4.2 states '20 independent trials' for each baseline method." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. Top-k for retrieval, embedding model choice, and similarity threshold appear to have been set without documented search." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "No explanation of how the final configuration (top-k=1, specific embedding model, etc.) was selected or whether alternatives were tried." 312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Three methods are compared but no statistical tests are performed at all, making multiple comparison correction impossible to apply." 
317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement all three baselines themselves (including the keyword-matching 'Actual Match') but do not acknowledge potential bias in their implementation of competitors." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": false, 326 "justification": "RAG-MCP adds a separate embedding and retrieval step using a Qwen model that baselines do not incur. This additional compute cost is not discussed relative to the performance gain." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether MCPBench web search tasks represent real-world tool selection scenarios or whether success on web search generalizes to other tool types." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "RAG-MCP adds a retriever model (Qwen) not present in baselines. The improvement could stem from the additional model's capabilities rather than the retrieval approach itself. This confound is not addressed." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "No discussion of whether qwen-max-0125's training data includes MCPBench tasks or MCP server descriptions from mcp.so." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information. For example, MCP server names in the prompt may contain hints about functionality." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "The 20 web search tasks likely share structural similarities (all being web search), but independence between test examples is not discussed." 
354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are applied." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "RAG-MCP cuts prompt tokens by over 50% compared to feeding all tools at once.", 365 "evidence": "Table 1 shows RAG-MCP uses 1084 avg prompt tokens vs Blank Conditioning's 2133.84, a reduction of ~49.2%. The '50%' threshold is not quite met.", 366 "supported": "weak" 367 }, 368 { 369 "claim": "RAG-MCP more than triples tool selection accuracy (43.13% vs 13.62% baseline).", 370 "evidence": "Table 1 shows 43.13% accuracy for RAG-MCP vs 13.62% for Blank Conditioning, a 3.17x improvement. However, only 20 trials with no statistical tests or error bars.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "RAG-MCP enables scalable and accurate tool integration for LLMs.", 375 "evidence": "The stress test (Figure 3, Section 5) shows performance degrades sharply beyond ~100 MCPs, with purple (failure) dominating above that threshold. At 43.13% accuracy, fewer than half of tool selections are correct, which is low in absolute terms.", 376 "supported": "weak" 377 }, 378 { 379 "claim": "LLM tool selection performance degrades as the number of candidate MCPs increases.", 380 "evidence": "Figure 3 visualizes per-trial success across MCP pool sizes 1-11,100, showing high success below 30, variability at 31-70, and dominant failure beyond ~100.", 381 "supported": "moderate" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "Very low absolute accuracy", 387 "detail": "The best method (RAG-MCP) achieves only 43.13% tool selection accuracy — less than half of selections are correct. The paper frames this as a major success and 'compelling solution' for scalable tool integration." 388 }, 389 { 390 "flag": "Overclaimed token reduction", 391 "detail": "The abstract claims 'over 50%' token reduction, but Table 1 shows 1084 vs 2133.84 tokens = 49.2% reduction, which is under 50%." 
392 }, 393 { 394 "flag": "Tiny sample size with no uncertainty quantification", 395 "detail": "All conclusions rest on 20 trials per method with no error bars, confidence intervals, or statistical tests. The word 'significantly' is used without any statistical test." 396 }, 397 { 398 "flag": "Unspecified evaluation judge", 399 "detail": "Answer correctness is judged by a 'Llama-based verifier' whose specific model, version, and accuracy are never stated. DeepSeek-v3 is also used as an evaluator. Neither judge is validated." 400 }, 401 { 402 "flag": "Missing critical implementation details", 403 "detail": "The retriever model version, embedding dimensions, similarity metric, top-k value, and all hyperparameters are unspecified, making reproduction impossible even with code." 404 }, 405 { 406 "flag": "Scalability claim contradicted by own results", 407 "detail": "The paper claims to 'enable scalable tool integration' but the stress test shows performance collapses beyond ~100 MCPs, well short of the 4,400+ registry or the claimed 'thousands of tools' scenario." 408 } 409 ], 410 "cited_papers": [ 411 { 412 "title": "Gorilla: Large Language Model Connected with Massive APIs", 413 "authors": ["Shishir G. Patil", "Tianjun Zhang", "Xin Wang", "Joseph E. Gonzalez"], 414 "year": 2024, 415 "relevance": "Demonstrates retrieval-augmented API documentation to improve LLM tool use accuracy, directly relevant to tool selection for LLM agents." 416 }, 417 { 418 "title": "Toolformer: Language Models Can Teach Themselves to Use Tools", 419 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessi", "Roberta Raileanu"], 420 "year": 2023, 421 "relevance": "Foundational work on self-supervised tool-use learning in LLMs, showing models can learn when and how to call APIs." 
422 }, 423 { 424 "title": "ReAct: Synergizing Reasoning and Acting in Language Models", 425 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu", "Nan Du"], 426 "year": 2023, 427 "relevance": "Key agentic workflow paper interleaving reasoning and tool actions, relevant to LLM tool-use evaluation." 428 }, 429 { 430 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 431 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus"], 432 "year": 2020, 433 "relevance": "Foundational RAG paper combining parametric LLMs with non-parametric retrieval, the basis for RAG-MCP's approach." 434 }, 435 { 436 "title": "WebGPT: Browser-Assisted Question-Answering with Human Feedback", 437 "authors": ["Reiichiro Nakano", "Jacob Hilton", "Suchir Balaji"], 438 "year": 2022, 439 "arxiv_id": "2112.09332", 440 "relevance": "Trains GPT-3 to navigate and search the web, relevant to LLM tool use and grounded retrieval to reduce hallucinations." 441 }, 442 { 443 "title": "Evaluation Report on MCP Servers", 444 "authors": ["Zhiling Luo", "Xiaorong Shi", "Xuanrui Lin", "Jinyang Gao"], 445 "year": 2025, 446 "arxiv_id": "2504.11094", 447 "relevance": "Provides the MCPBench evaluation framework and WebSearch dataset used as the testbed in this paper." 448 }, 449 { 450 "title": "Enhancing Function-Calling Capabilities in LLMs: Strategies for Prompt Formats, Data Integration, and Multilingual Translation", 451 "authors": ["Yi-Cheng Chen", "Pei-Chi Hsu", "Cheng-Jui Hsu"], 452 "year": 2024, 453 "arxiv_id": "2412.01130", 454 "relevance": "Explores prompt format strategies for LLM function calling, directly relevant to tool-use methodology." 455 }, 456 { 457 "title": "DeepSeek-V3 Technical Report", 458 "authors": ["Aixin Liu", "Bei Feng", "Bing Xue"], 459 "year": 2024, 460 "arxiv_id": "2412.19437", 461 "relevance": "Used as the automated evaluator for answer correctness in the experiments." 
462 } 463 ], 464 "engagement_factors": { 465 "practical_relevance": { 466 "score": 2, 467 "justification": "Using RAG to filter MCP tools before prompting is a practically useful idea for developers building multi-tool LLM agents, though the low absolute accuracy limits immediate applicability." 468 }, 469 "surprise_contrarian": { 470 "score": 0, 471 "justification": "The finding that filtering irrelevant tools from the prompt improves performance confirms intuitive expectations rather than challenging them." 472 }, 473 "fear_safety": { 474 "score": 0, 475 "justification": "No safety, security, or risk implications are raised." 476 }, 477 "drama_conflict": { 478 "score": 0, 479 "justification": "No controversy or conflict angle." 480 }, 481 "demo_ability": { 482 "score": 0, 483 "justification": "No code, demo, or tool is released." 484 }, 485 "brand_recognition": { 486 "score": 1, 487 "justification": "References MCP (Anthropic) and uses Qwen/DeepSeek models, but authors are from relatively lesser-known university programs." 488 } 489 } 490 }