scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28281B)
      1 {
      2   "paper": {
      3     "title": "Picking the Right Specialist: Attentive Neural Process-based Selection of Task-Specialized Models as Tools for Agentic Healthcare Systems",
      4     "authors": ["Pramit Saha", "Joshua Strong", "Mohammad Alsharid", "Divyanshu Mishra", "J. Alison Noble"],
      5     "year": 2026,
      6     "venue": "arXiv",
      7     "arxiv_id": "2602.14901"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "ToolSelect, an Attentive Neural Process-based tool selector, consistently outperforms 10 SOTA routing baselines across four chest X-ray task families (disease diagnosis, report generation, visual grounding, VQA). The paper demonstrates substantial per-instance complementarity among specialist models — e.g., Oracle F1 is 58.00% on Open-I vs 13.35% for the best single model — and shows that ToolSelect closes a large fraction of this gap (43.80% F1). The paper introduces ToolSelectBench with 1448 queries and a diverse pool of 55 specialist models.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code repository URL or GitHub link is provided in the paper. The supplementary material includes model predictions and benchmark data but no source code for the ToolSelect method."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "§3.1: 'All benchmark samples, together with the individual predictions of 55 candidate models, are attached as supplementary material for transparency and reproducibility.' The evaluation datasets (Open-I, VinDr-CXR, ReX-VQA) are also public."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "§3.3 lists key architecture choices (ViT-B/16, CheXbert) and hyperparameters but provides no requirements.txt, Dockerfile, or detailed software dependency listing with versions."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions, README, or scripts to replicate experiments are mentioned in the paper."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Tables 1-4 report only point estimates for all metrics (accuracy, F1, precision, recall, AUC, IoU, etc.). No confidence intervals, error bars, or ± notation appear anywhere in the results."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "The paper claims 'ToolSelect consistently outperforms' baselines based solely on comparing raw numbers across Tables 1-4. No statistical significance tests (p-values, bootstrap tests, etc.) are used to support these comparative claims."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "§4.1: 'improving over the best baseline by +31.82 F1 points (vs 11.98%)' and '+35.44 F1 points (vs 24.44%)'. §4.2: 'improving over Random by +12.46 points.' Absolute improvements with baseline context are reported throughout."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "The benchmark consists of 550 diagnosis, 156 report generation, 394 grounding, and 348 VQA queries. No justification for these sizes or power analysis is provided. The report generation set (156 queries) is notably small."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "All results in Tables 1-4 appear to be from single runs. No standard deviations, variance across seeds, or multiple-run spread measures are reported."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "§3.4 describes 10 routing baselines: KNNRouter, SVMRouter, MLPRouter, MFRouter, EloRouter, RouterDC, AutoMix, Hybrid LLM, GraphRouter, and CausalLLM Router. Results also include Random and Oracle reference bounds."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include RouterDC (Chen et al., 2024a, NeurIPS 2024), GraphRouter (Feng et al., ICLR), and RouteLLM (Ong et al., 2024); these are recent and representative routing methods. The paper also cites recent tool-use work such as ToolACE (Liu et al., 2025)."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "§4.2: 'See Suppl. E.1 for ablation on reference point / aggregation.' An ablation study exists in the supplementary material, though only referenced briefly in the main text."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Comprehensive multi-metric evaluation: disease diagnosis uses Acc/F1/P/R/AUC (Table 1); report generation uses SemBScore/F1-RadGraph/ROUGE-L/METEOR/RateScore (Table 2); grounding uses mAP@0.25/mAP@0.5/Mean IoU (Table 3); VQA reports accuracy (Table 4)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation is conducted. All evaluation is automated. For report generation and clinical grounding tasks, human expert evaluation of clinical faithfulness would strengthen the claims, but none is included."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "§3.1: 'we follow the standard dataset splits: the training split is used for training the selector, and performance is reported on the corresponding test split.'"
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Table 3 provides per-phrase performance across 11 clinical findings for visual grounding. Table 1 breaks down results by Open-I vs VinDr datasets. Results are shown separately for all 4 task families."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "§4.1 discusses failure modes: 'R-All attains the highest accuracy but has the lowest F1.' §4.3 discusses 'Hard phrases and weak specialists' with specific performance details. §4.4 notes baseline routers that underperform single models."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "ToolSelect outperforms all baselines in every table on every primary metric. No failed configurations, abandoned approaches, or design choices that hurt performance are reported. The supplementary ablation is referenced but no negative findings are mentioned."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims: (1) 'ToolSelect consistently outperforms 10 SOTA methods across four different task families' — supported by Tables 1-4; (2) introduces ToolSelectBench with 1448 queries — confirmed in §3.1. Claims match the results."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Causal claims are limited to 'ToolSelect improves over baselines' which is supported by controlled comparisons holding the specialist pool constant while varying only the selection method. The ablation study (Suppl. E.1) tests component contributions."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title claims 'Agentic Healthcare Systems' broadly but all experiments are on chest X-ray only — one imaging modality, one anatomical region. The conclusion references 'multimodal clinical agentic pipelines' without bounding generalization to the tested setting."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No alternative explanations for ToolSelect's gains are discussed. For instance, the ANP's advantage could stem from the behavioral reference sets rather than the attention mechanism, or compute budget differences between baselines could affect results. Neither is explored."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper measures automated metrics (F1, IoU, F1-RadGraph) but frames results as 'clinically aligned reports' (§4.2) and 'clinical performance' (§2.1). The gap between automated proxies and actual clinical utility is not acknowledged. F1-RadGraph measures structured clinical entity overlap, not clinical decision quality."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "§3.2 specifies model names with sizes/variants: 'CheXAgent-8B', 'Qwen3-VL-2B/4B/8B', 'LLaVA-1.5-7B/13B', 'LLaMA-3.2-Vision-11B', 'EVA-X Tiny/Small/Base', etc. These are specific enough to identify the models used."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The system uses a ReAct-style LLM orchestrator (Figure 3) that maps queries to tasks. No actual prompt text is provided for the LLM agent core or the task selector. Only natural language descriptions of the pipeline are given."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "§3.3: 'AdamW (learning rate 3 × 10^{-5}, weight decay 10^{-4}), up to 50 epochs with early stopping (patience = 10, min delta = 10^{-4}). Batch size 16. λ_H = 0.05, dropout 0.1, reference set B_t ∈ [16, 64].' Key hyperparameters are reported."
    152       },
    153       "scaffolding_described": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Figure 3 provides an architecture overview of the agentic framework showing the ReAct-style loop with LangChain short-term memory and the tool model zoo. §2.1-2.3 describe the selector architecture in mathematical detail. Figure 4 provides detailed ToolSelect architecture."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "The construction of ToolSelectBench queries is insufficiently documented. §3.1 states benchmark sizes (550/156/394/348 queries) and mentions using standard splits and 1% sampling for VQA, but the query construction criteria and any filtering applied to arrive at these specific counts are not described."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper has an 'Impact Statement' section discussing deployment concerns but no dedicated 'Limitations' or 'Threats to Validity' section with substantive discussion of methodological limitations."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "No specific threats to validity are discussed. The Impact Statement contains generic statements like 'improper deployment without clinical oversight could pose risks' but no study-specific threats."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what the results do NOT show. It does not bound the scope to chest X-ray or acknowledge that generalization to other imaging modalities, clinical domains, or real-world deployment settings is untested."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": true,
    185         "justification": "§3.1: 'All benchmark samples, together with the individual predictions of 55 candidate models, are attached as supplementary material.' This allows independent verification of the routing results from raw predictions."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "§3.1-3.2 describe the data sources (Open-I, VinDr-CXR, ReX-VQA), the model collection process (publicly available models, fine-tuned models), training datasets, and architectures for each of the 55 specialist models."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. All data comes from standard public medical imaging benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "While the label-space alignment (§2.3) and cost function definitions (§2.1) are mathematically described, the concrete pipeline from raw datasets to the final 1448 benchmark queries — including any filtering, exclusion, or sampling criteria at each stage — is not fully documented."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding or acknowledgments section is present in the provided paper text. Funding sources are not disclosed."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations are listed: Department of Engineering Science, University of Oxford (Saha, Strong, Mishra, Noble) and Department of Computer Science, Khalifa University (Alsharid)."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "Funding is not disclosed, so independence cannot be assessed. The absence of funding disclosure makes this NO."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial disclosure statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "The paper evaluates pre-trained VLMs (Qwen3-VL, LLaVA, LLaMA-3.2-Vision) on benchmark datasets but does not state the training data cutoff dates for any of these models. It is unclear whether these models could have seen the test data."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the pre-trained VLMs (which are used as tool candidates) may have been trained on data overlapping with the Open-I, VinDr-CXR, or ReX-VQA test sets."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "Open-I and VinDr-CXR are publicly available datasets. Large VLMs like Qwen3-VL and LLaVA may have encountered these during pre-training. This contamination risk is not discussed."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": false,
    283         "justification": "No inference cost, latency, or per-query cost is reported. The system involves running 55 specialist models plus a selector but the computational cost of this pipeline is not discussed."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "§3.3 mentions training for 50 epochs with early stopping but no GPU hours, hardware specifications, total training time, or total compute budget are stated."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No mention of multiple random seeds or seed sensitivity analysis. Results appear to be from a single training run of the selector."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is not stated anywhere. Results in Tables 1-4 are presented without indicating how many runs produced them."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "Hyperparameters are reported (§3.3) but no information is given about how they were selected — number of configurations tried, search method, or compute spent on tuning."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": true,
    310         "justification": "§3.3: 'early stopping (patience = 10, min delta = 10^{-4})' — model selection is based on validation performance with a principled early stopping criterion, not cherry-picked from test results."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors implement all baseline routing methods and compare them against their own ToolSelect. No acknowledgment that their implementations of baselines may systematically underperform compared to original authors' implementations."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "No performance-vs-compute analysis. Baselines range from lightweight heuristics (EloRouter) to learned methods (GraphRouter with GNN) but compute costs are not compared. ToolSelect uses ANP cross-attention which likely costs more than simpler baselines."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether the automated metrics (F1, IoU, F1-RadGraph) actually measure clinical utility. The paper frames results as clinical performance improvement but doesn't question whether the benchmarks capture what matters clinically."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": true,
    334         "answer": true,
    335         "justification": "All routing methods are compared within the same agentic framework (Figure 3). The specialist model pool is held constant across all methods. Only the selection mechanism varies, so the scaffold confound is controlled by design."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "Not discussed. The VLMs (Qwen3-VL, LLaVA, LLaMA-3.2-Vision) were trained on web-scale data that may include the benchmark datasets, since those datasets predate the models (Open-I from 2014, VinDr-CXR from 2022)."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Not discussed. The selector uses behavioral reference sets from the same distribution as the test data; whether this introduces any information leakage from reference to test sets is not addressed."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "Not discussed. No verification that training and test splits for the selector are fully independent, or that reference set examples don't share structural similarities with test queries."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No concrete leakage detection or prevention method is used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "ToolSelect consistently outperforms 10 SOTA routing baselines across all four task families.",
    364       "evidence": "Tables 1-4: ToolSelect achieves best routing performance on disease diagnosis (43.80% F1 Open-I, 59.88% VinDr), report generation (24.17% F1-RadGraph), visual grounding (50.08 Mean IoU), and VQA (72.01% accuracy).",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "Per-instance Oracle selection reveals large headroom beyond any single specialist model.",
    369       "evidence": "Table 1: Oracle F1 58.00% vs best single-model F1 13.35% on Open-I. Table 4: Oracle 96.49% vs best single 63.22% on VQA. Table 3: Oracle IoU 56.20 vs best single 40.66.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "No single specialist model is uniformly reliable under dataset shift; the best model depends on the metric and dataset.",
    374       "evidence": "§4.1: R-All has highest accuracy (84.08% Open-I, 75.66% VinDr) but lowest F1 (6.61%, 4.70%). D-RSNA has 64.99% recall on VinDr but degrades on Open-I. Table 3 shows different grounding specialists excel on different phrases.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "Classical and heuristic routers provide limited improvement and can trail strong individual models.",
    379       "evidence": "§4.4: 'KNN 41.52%, Graph 47.37%' underperform the best single model (NV 63.22%). §4.1: best baseline F1 on Open-I is only 11.98% (MF Router).",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "ToolSelect produces more clinically aligned reports than baseline routing methods.",
    384       "evidence": "Figure 5 shows two qualitative examples where ToolSelect selects better-suited specialists. §4.2: ToolSelect achieves 24.17% F1-RadGraph vs 22.40% for best baseline.",
    385       "supported": "moderate"
    386     }
    387   ],
    388   "red_flags": [
    389     {
    390       "flag": "No error bars or variance reporting",
    391       "detail": "All results across Tables 1-4 are single-run point estimates with no confidence intervals, standard deviations, or multiple-seed results. For a learning-based method (ToolSelect), training variance could substantially affect results. Claims of 'consistently outperforms' are unsupported without uncertainty quantification."
    392     },
    393     {
    394       "flag": "No statistical significance tests",
    395       "detail": "Comparative claims are made by comparing raw numbers without any statistical tests. The differences between ToolSelect and baselines could potentially fall within random variation, especially for the report generation task with only 156 queries."
    396     },
    397     {
    398       "flag": "No limitations section",
    399       "detail": "The paper lacks a limitations section. The Impact Statement addresses deployment ethics but does not discuss methodological limitations such as the chest-X-ray-only evaluation, small benchmark sizes, or single-run results."
    400     },
    401     {
    402       "flag": "Overclaiming scope",
    403       "detail": "Title claims 'Agentic Healthcare Systems' broadly, but all experiments are limited to chest X-ray tasks only. Generalization to other imaging modalities, clinical domains, or real-world deployment settings is untested and not bounded."
    404     },
    405     {
    406       "flag": "No compute cost analysis for 55-model system",
    407       "detail": "The system requires maintaining and running predictions from 55 specialist models plus a selector, but no inference cost, latency, or computational budget is reported. Clinical deployment feasibility is a significant unaddressed concern."
    408     },
    409     {
    410       "flag": "Every result shows ToolSelect winning",
    411       "detail": "ToolSelect outperforms every baseline on every primary metric across every task. No configurations, tasks, or settings where the method fails or underperforms are reported. This uniformly positive narrative warrants skepticism."
    412     }
    413   ],
    414   "cited_papers": [
    415     {
    416       "title": "ToolLLM: Facilitating large language models to master 16000+ real-world APIs",
    417       "authors": ["Y. Qin", "S. Liang", "Y. Ye"],
    418       "year": 2023,
    419       "arxiv_id": "2307.16789",
    420       "relevance": "Major benchmark for LLM tool use capabilities across real-world APIs."
    421     },
    422     {
    423       "title": "RouteLLM: Learning to route LLMs with preference data",
    424       "authors": ["I. Ong", "A. Almahairi", "V. Wu"],
    425       "year": 2024,
    426       "arxiv_id": "2406.18665",
    427       "relevance": "Proposes LLM routing methods for cost-quality tradeoffs, used as baseline in this paper."
    428     },
    429     {
    430       "title": "RouterDC: Query-based router by dual contrastive learning for assembling large language models",
    431       "authors": ["S. Chen", "W. Jiang", "B. Lin"],
    432       "year": 2024,
    433       "relevance": "NeurIPS 2024 paper on contrastive learning for LLM routing, used as baseline."
    434     },
    435     {
    436       "title": "Toolformer: Language models can teach themselves to use tools",
    437       "authors": ["T. Schick", "J. Dwivedi-Yu"],
    438       "year": 2023,
    439       "relevance": "Foundational work on self-supervised tool use acquisition by LLMs."
    440     },
    441     {
    442       "title": "ReAct: Synergizing reasoning and acting in language models",
    443       "authors": ["S. Yao", "J. Zhao", "D. Yu"],
    444       "year": 2023,
    445       "relevance": "Core agentic paradigm (thought-action prompting) used in this paper's framework."
    446     },
    447     {
    448       "title": "AutoGen: Enabling next-gen LLM applications via multi-agent conversation",
    449       "authors": ["Q. Wu", "G. Bansal", "J. Zhang"],
    450       "year": 2024,
    451       "relevance": "Multi-agent LLM framework relevant to agentic system design."
    452     },
    453     {
    454       "title": "MDAgents: An adaptive collaboration of LLMs for medical decision-making",
    455       "authors": ["Y. Kim", "C. Park", "H. Jeong"],
    456       "year": 2024,
    457       "relevance": "Multi-agent LLM collaboration for medical tasks, directly relevant to healthcare agent methodology."
    458     },
    459     {
    460       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    461       "authors": ["L. Chen", "M. Zaharia", "J. Zou"],
    462       "year": 2023,
    463       "arxiv_id": "2305.05176",
    464       "relevance": "Cost-efficient LLM cascading approach relevant to model routing and selection."
    465     },
    466     {
    467       "title": "GraphRouter: A graph-based router for LLM selections",
    468       "authors": ["T. Feng", "Y. Shen", "J. You"],
    469       "year": 2024,
    470       "arxiv_id": "2410.03834",
    471       "relevance": "GNN-based LLM routing method, used as baseline in this paper."
    472     },
    473     {
    474       "title": "ToolACE: Winning the points of LLM function calling",
    475       "authors": ["W. Liu", "X. Huang", "X. Zeng"],
    476       "year": 2025,
    477       "relevance": "Recent work on LLM tool use and function calling capabilities."
    478     },
    479     {
    480       "title": "MMedAgent: Learning to use medical tools with multi-modal agent",
    481       "authors": ["B. Li", "T. Yan", "Y. Pan"],
    482       "year": 2024,
    483       "relevance": "Multi-modal medical agent that selects and composes imaging tools across modalities."
    484     },
    485     {
    486       "title": "Large language model based multi-agents: A survey of progress and challenges",
    487       "authors": ["T. Guo", "X. Chen", "Y. Wang"],
    488       "year": 2024,
    489       "arxiv_id": "2402.01680",
    490       "relevance": "Survey of multi-agent LLM systems covering planning, reasoning, and tool interaction."
    491     }
    492   ]
    493 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs