scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (29369B)
      1 {
      2   "paper": {
      3     "title": "SkillOrchestra: Learning to Route Agents via Skill Transfer",
      4     "authors": [
      5       "Jiayu Wang",
      6       "Yifei Ming",
      7       "Zixuan Ke",
      8       "Shafiq Joty",
      9       "Aws Albarghouthi",
     10       "Frederic Sala"
     11     ],
     12     "year": 2026,
     13     "venue": "arXiv",
     14     "arxiv_id": "2602.19672"
     15   },
     16   "scan_version": 2,
     17   "active_modules": ["experimental_rigor", "data_leakage"],
     18   "methodology_tags": ["benchmark-eval"],
     19   "key_findings": "SkillOrchestra proposes skill-aware orchestration using a learned Skill Handbook that captures fine-grained competence and cost profiles of agents. Across 10 benchmarks (QA, math, FRAMES), it outperforms RL-based orchestrators like Router-R1 by up to 22.5 percentage points while requiring 700× less training data. The method alleviates routing collapse seen in RL-based approaches (Router-R1 routes 98% to one model vs. SkillOrchestra's balanced distribution) and transfers the learned handbook across orchestrator backbones without retraining.",
     20   "checklist": {
     21     "artifacts": {
     22       "code_released": {
     23         "applies": true,
     24         "answer": true,
     25         "justification": "The abstract states 'The code is available at: https://github.com/jiayuww/SkillOrchestra' providing a GitHub repository URL."
     26       },
     27       "data_released": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "All evaluation benchmarks are publicly available standard datasets: NQ, TriviaQA, PopQA, HotpotQA, 2WikiMultiHopQA, Musique, Bamboogle, MATH, AMC23, and FRAMES. Referenced with citations throughout Section 5."
     31       },
     32       "environment_specified": {
     33         "applies": true,
     34         "answer": false,
     35         "justification": "The paper lists models used (Appendix A.1, A.2) but provides no environment specifications such as requirements.txt, Dockerfile, library versions, or dependency lists."
     36       },
     37       "reproduction_instructions": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "No step-by-step reproduction instructions are provided in the paper. Appendix A describes experimental setup details but not how to run the code to replicate results."
     41       }
     42     },
     43     "statistical_methodology": {
     44       "confidence_intervals_or_error_bars": {
     45         "applies": true,
     46         "answer": false,
     47         "justification": "Table 1 reports only point estimates (e.g., 54.2, 71.6) with no confidence intervals, error bars, or ± notation. Figure 5 likewise shows only single values."
     48       },
     49       "significance_tests": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "The paper claims SkillOrchestra 'outperforms' all baselines based solely on comparing raw accuracy numbers. No statistical significance tests (p-values, bootstrap tests, etc.) are reported."
     53       },
     54       "effect_sizes_reported": {
     55         "applies": true,
     56         "answer": true,
     57         "justification": "Effect sizes are reported with baseline context throughout: 'SkillOrchestra reaches 47.4 (+5.8)' relative to Router-R1's 41.6, 'up to +22.5 percentage-point improvement' over RL-based orchestrators (the MATH comparison in Figure 5, 73.6 vs 55.8, is a 17.8-point gap), and '+8.0 points' over ToolOrchestra (84.3 vs 76.3). Cost reductions are also quantified (38.4¢ vs 51.8¢)."
     58       },
     59       "sample_size_justified": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "Training uses 'k (k < 50) samples from each dataset' with no justification for why this number is sufficient. The ablation uses '100 randomly sampled FRAMES tasks' with no power analysis or sample size justification."
     63       },
     64       "variance_reported": {
     65         "applies": true,
     66         "answer": false,
     67         "justification": "No standard deviations, variance, or spread measures are reported for any experiment. All results in Table 1 and Figure 5 appear to be single-run numbers."
     68       }
     69     },
     70     "evaluation_design": {
     71       "baselines_included": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "Extensive baselines across three categories: no routing (Vanilla, SFT, RAG, CoT, Search-R1), heuristic/discriminative (9 methods including FrugalGPT, RouterDC, GraphRouter), and RL-based (Router-R1, ToolOrchestra). Table 1 and Figure 1."
     75       },
     76       "baselines_contemporary": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "Baselines include very recent work: Router-R1 (2025), ToolOrchestra (2025), GraphRouter (2025), RouteLLM (2025). GPT-5, Claude Opus 4.5, and Gemini 3 Pro are also used as orchestrator comparisons."
     80       },
     81       "ablation_study": {
     82         "applies": true,
     83         "answer": true,
     84         "justification": "Table 2 presents a controlled ablation removing components incrementally: No Handbook, No Refinement+Selection, No Selection, No Fine-Grained Skills, and Full System. Each row isolates the contribution of one component."
     85       },
     86       "multiple_metrics": {
     87         "applies": true,
     88         "answer": true,
     89         "justification": "Two primary metrics are reported: Exact Match accuracy and total inference cost (USD). Figure 1 plots both jointly, and Figure 5 and Table 2 report both."
     90       },
     91       "human_evaluation": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No human evaluation of system outputs is conducted. QA benchmarks use automated Exact Match, and FRAMES uses GPT-5-mini as a judge (Section A.2). All evaluation is automated."
     95       },
     96       "held_out_test_set": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "Section 4.3 describes evaluation on 'a held-out validation set D_val' for handbook selection. Appendix A.1 states 'k additional samples for validation.' The main results use standard benchmark test splits separate from handbook training data."
    100       },
    101       "per_category_breakdown": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Table 1 breaks down results across all 7 QA benchmarks individually. Figure 5 shows per-benchmark results for MATH and AMC. Figure 6 shows per-model routing distributions. Table 2 breaks down component contributions."
    105       },
    106       "failure_cases_discussed": {
    107         "applies": true,
    108         "answer": false,
    109         "justification": "The paper discusses failures of baseline methods (e.g., routing collapse in Router-R1) but does not analyze where SkillOrchestra itself fails or show qualitative failure examples of its own method."
    110       },
    111       "negative_results_reported": {
    112         "applies": true,
    113         "answer": false,
    114         "justification": "Every experiment shows SkillOrchestra improving over baselines. No approaches that were tried and abandoned, configurations that failed, or ablations where adding a component hurt performance are reported."
    115       }
    116     },
    117     "claims_and_evidence": {
    118       "abstract_claims_supported": {
    119         "applies": true,
    120         "answer": true,
    121         "justification": "The abstract's 'up to 22.5%' improvement claim is backed by the Figure 5 results, though not exactly: MATH is 73.6 vs 55.8 (17.8pp) and AMC is 52.5 vs 25.0 (27.5pp), so the 22.5 figure matches neither gap exactly; it may refer to AMC or reflect rounding. The '700×' cost reduction is consistent with k<50 training samples vs Router-R1's 14k (e.g., 14000/20 = 700 if k = 20). Results across 10 benchmarks are shown in Table 1 and Figure 1."
    122       },
    123       "causal_claims_justified": {
    124         "applies": true,
    125         "answer": true,
    126         "justification": "The paper's causal claims are primarily supported through ablation studies (Table 2): 'removing the Skill Handbook causes a large drop in accuracy (85.0% → 71.0%).' Ablation design is controlled single-variable manipulation, which is adequate for these claims."
    127       },
    128       "generalization_bounded": {
    129         "applies": true,
    130         "answer": false,
    131         "justification": "The abstract claims 'scalable, interpretable, and sample-efficient orchestration' and the conclusion states 'a principled alternative to data-intensive RL-based approaches' without bounding to the tested QA/math/FRAMES settings. The title 'Learning to Route Agents' suggests general applicability beyond the specific benchmarks tested."
    132       },
    133       "alternative_explanations_discussed": {
    134         "applies": true,
    135         "answer": false,
    136         "justification": "The paper does not consider alternative explanations for the performance gains. For example, the LLM-based skill discovery (using GPT-5 as the 'reflector') could itself be a confound — the improvement might stem from the LLM's reasoning rather than the skill abstraction framework. No robustness checks against alternative explanations are provided."
    137       },
    138       "proxy_outcome_distinction": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "The paper measures Exact Match accuracy and USD cost, and claims are about accuracy-cost tradeoffs. These measurements directly match the claims — no significant proxy gap exists between what is measured and what is claimed."
    142       }
    143     },
    144     "setup_transparency": {
    145       "model_versions_specified": {
    146         "applies": true,
    147         "answer": false,
    148         "justification": "Open-source models are specified with family and size (e.g., 'Qwen2.5-3B', 'LLaMA-3.1-70B') but proprietary models use only marketing names: 'GPT-5', 'GPT-5-mini', 'Claude Opus 4.5', 'Gemini 3 Pro' without API versions or snapshot dates. Per schema guidance, marketing names without versions do not count."
    149       },
    150       "prompts_provided": {
    151         "applies": true,
    152         "answer": true,
    153         "justification": "Figures 7-10 in Appendix D provide full orchestration instruction templates with complete actual prompt text, including skill definitions, model performance data, routing instructions, and format specifications. This is exceptionally transparent."
    154       },
    155       "hyperparameters_reported": {
    156         "applies": true,
    157         "answer": false,
    158         "justification": "Max turns are stated (4 for routing, 50 for orchestration), as is the bound of k < 50 training samples. However, critical hyperparameters are missing: temperature and sampling settings for all LLM calls, the values of λ (cost tradeoff weight) and λc, and the exact number k of training samples per dataset (only the bound k < 50 is given)."
    159       },
    160       "scaffolding_described": {
    161         "applies": true,
    162         "answer": true,
    163         "justification": "The orchestration framework is described in extensive detail: Section 3 formalizes modes, model pools, tools, and agent instantiation. Section 4 details the Skill Handbook structure, mode selection, and skill-grounded routing. Figure 3 provides a complete system overview. Algorithm 1 gives pseudocode."
    164       },
    165       "data_preprocessing_documented": {
    166         "applies": true,
    167         "answer": false,
    168         "justification": "The paper states 'We assume an exploratory dataset D_train' without describing how trajectories were collected in practice. Appendix A references 'the same evaluation protocol as Router-R1' without detailing preprocessing steps. How benchmark examples were sampled for training/validation splits is not documented."
    169       }
    170     },
    171     "limitations_and_scope": {
    172       "limitations_section_present": {
    173         "applies": true,
    174         "answer": false,
    175         "justification": "No dedicated limitations or threats-to-validity section exists. Section 6 (Conclusion) is a brief forward-looking paragraph with no discussion of limitations."
    176       },
    177       "threats_to_validity_specific": {
    178         "applies": true,
    179         "answer": false,
    180         "justification": "No specific threats to validity are discussed anywhere in the paper. There is no analysis of what could threaten the validity of the results."
    181       },
    182       "scope_boundaries_stated": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No explicit statements about what the results do not show. The conclusion says 'We hope this work serves as a springboard for scalable orchestration' without stating boundaries or non-claims."
    186       }
    187     },
    188     "data_integrity": {
    189       "raw_data_available": {
    190         "applies": true,
    191         "answer": false,
    192         "justification": "While benchmarks are public, the collected execution traces, learned skill handbooks, agent profiles, and trajectory data generated during experiments are not made available for independent verification."
    193       },
    194       "data_collection_described": {
    195         "applies": true,
    196         "answer": false,
    197         "justification": "The data collection procedure for the exploratory dataset is only described abstractly: 'trajectories obtained by varying the agent choice at specific modes' (Section 4.2). How many trajectories, which specific queries, and what the success/failure criteria were are not specified."
    198       },
    199       "recruitment_methods_described": {
    200         "applies": false,
    201         "answer": false,
    202         "justification": "No human participants. Data sources are standard public benchmarks (NQ, TriviaQA, PopQA, HotpotQA, etc.)."
    203       },
    204       "data_pipeline_documented": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "The skill handbook learning pipeline is described algorithmically (Phase 1: Discovery, Phase 2: Refinement) but the specific data transformation steps — how many skills were discovered, how many were merged/split, what thresholds were used — are not documented with counts."
    208       }
    209     },
    210     "conflicts_of_interest": {
    211       "funding_disclosed": {
    212         "applies": true,
    213         "answer": false,
    214         "justification": "No funding statement or acknowledgments section is present in the paper despite having authors from both a university and a major AI company (Salesforce)."
    215       },
    216       "affiliations_disclosed": {
    217         "applies": true,
    218         "answer": true,
    219         "justification": "Author affiliations are clearly listed: 'University of Wisconsin-Madison' and 'Salesforce AI Research' with superscript numbering identifying which authors belong to each institution."
    220       },
    221       "funder_independent_of_outcome": {
    222         "applies": true,
    223         "answer": false,
    224         "justification": "Funding is not disclosed, so independence cannot be assessed. Salesforce AI Research, which employs three of the six authors, has commercial interest in AI orchestration and agent technology."
    225       },
    226       "financial_interests_declared": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No competing interests or financial interests statement is present in the paper."
    230       }
    231     },
    232     "contamination": {
    233       "training_cutoff_stated": {
    234         "applies": true,
    235         "answer": false,
    236         "justification": "The paper uses models like Qwen2.5, LLaMA-3.1, Mistral-7B, GPT-5, and Gemini 3 Pro without stating any of their training data cutoff dates."
    237       },
    238       "train_test_overlap_discussed": {
    239         "applies": true,
    240         "answer": false,
    241         "justification": "No discussion of potential train/test overlap despite using benchmarks published years before model training (NQ 2019, TriviaQA 2017, HotpotQA 2018, MATH 2021). These could be in the training data of all models used."
    242       },
    243       "benchmark_contamination_addressed": {
    244         "applies": true,
    245         "answer": false,
    246         "justification": "Most benchmarks (NQ, TriviaQA, PopQA, HotpotQA, 2Wiki, Musique, Bamboogle, MATH) were published years before the models' training cutoffs. This contamination risk is not discussed."
    247       }
    248     },
    249     "human_studies": {
    250       "pre_registered": {
    251         "applies": false,
    252         "answer": false,
    253         "justification": "No human participants in this study. All evaluation is automated on benchmarks."
    254       },
    255       "irb_or_ethics_approval": {
    256         "applies": false,
    257         "answer": false,
    258         "justification": "No human participants in this study."
    259       },
    260       "demographics_reported": {
    261         "applies": false,
    262         "answer": false,
    263         "justification": "No human participants in this study."
    264       },
    265       "inclusion_exclusion_criteria": {
    266         "applies": false,
    267         "answer": false,
    268         "justification": "No human participants in this study."
    269       },
    270       "randomization_described": {
    271         "applies": false,
    272         "answer": false,
    273         "justification": "No human participants in this study."
    274       },
    275       "blinding_described": {
    276         "applies": false,
    277         "answer": false,
    278         "justification": "No human participants in this study."
    279       },
    280       "attrition_reported": {
    281         "applies": false,
    282         "answer": false,
    283         "justification": "No human participants in this study."
    284       }
    285     },
    286     "cost_and_practicality": {
    287       "inference_cost_reported": {
    288         "applies": true,
    289         "answer": true,
    290         "justification": "Inference costs are reported throughout: Figure 1 plots accuracy vs cost, Figure 5 compares cost in US cents (6.5¢ vs 3.6¢ for MATH), Table 2 reports cost in USD ($9.3 for full system), and specific per-query costs are given (38.4¢ vs 51.8¢ for Router-R1)."
    291       },
    292       "compute_budget_stated": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "The total computational budget for training the Skill Handbook is not quantified. The paper mentions '700× cost reduction' relative to baselines but does not state the absolute training cost, GPU hours, or total API spend."
    296       }
    297     },
    298     "experimental_rigor": {
    299       "seed_sensitivity_reported": {
    300         "applies": true,
    301         "answer": false,
    302         "justification": "No mention of multiple random seeds. All results (Table 1, Figure 5, Table 2) appear to be single-run numbers with no seed sensitivity analysis."
    303       },
    304       "number_of_runs_stated": {
    305         "applies": true,
    306         "answer": false,
    307         "justification": "The number of experimental runs is never explicitly stated. Results are presented without indicating how many runs produced them."
    308       },
    309       "hyperparameter_search_budget": {
    310         "applies": true,
    311         "answer": false,
    312         "justification": "The trade-off hyperparameters λ and λc are mentioned but their values and how they were tuned are not reported. No hyperparameter search budget is stated."
    313       },
    314       "best_config_selection_justified": {
    315         "applies": true,
    316         "answer": true,
    317         "justification": "Section 4.3 describes Pareto-optimal handbook selection on a held-out validation set D_val. The selection criterion is formalized as maximizing the expected reward minus cost. SkillOrchestra+ selects the best orchestrator via the same validation-based Pareto criterion."
    318       },
    319       "multiple_comparison_correction": {
    320         "applies": true,
    321         "answer": false,
    322         "justification": "Comparisons are made across 10 benchmarks and against 15+ baselines with no correction for multiple comparisons."
    323       },
    324       "self_comparison_bias_addressed": {
    325         "applies": true,
    326         "answer": false,
    327         "justification": "The authors compare their own system against baselines without acknowledging self-comparison bias. While they use 'the same evaluation protocol as Router-R1,' the SkillOrchestra implementation, including LLM-based skill discovery and refinement, is their own."
    328       },
    329       "compute_budget_vs_performance": {
    330         "applies": true,
    331         "answer": true,
    332         "justification": "Figure 1 explicitly plots performance (accuracy) as a function of inference cost for all methods. Figure 5 directly compares accuracy and cost side by side. The paper's central thesis is about performance-cost tradeoffs."
    333       },
    334       "benchmark_construct_validity": {
    335         "applies": true,
    336         "answer": false,
    337         "justification": "Standard QA and math benchmarks are used without any discussion of whether they actually measure the orchestration capabilities claimed. No analysis of construct validity for using Exact Match on factoid QA as a measure of 'orchestration quality.'"
    338       },
    339       "scaffold_confound_addressed": {
    340         "applies": true,
    341         "answer": true,
    342         "justification": "In model routing (Section 5.1): 'We use the same configuration as Router-R1 for controlled comparison.' In agent orchestration (Section 5.2): 'We follow the same evaluation protocol and experimental setup as ToolOrchestra' with identical modes, model pools, tools, and execution environments. The scaffold is controlled across comparisons."
    343       }
    344     },
    345     "data_leakage": {
    346       "temporal_leakage_addressed": {
    347         "applies": true,
    348         "answer": false,
    349         "justification": "Not discussed. Models trained on data collected well after the benchmark creation dates (e.g., TriviaQA 2017, HotpotQA 2018) are used without addressing temporal leakage."
    350       },
    351       "feature_leakage_addressed": {
    352         "applies": true,
    353         "answer": false,
    354         "justification": "Not discussed. No analysis of whether the evaluation setup (multi-turn routing with intermediate model calls) leaks answer information through context."
    355       },
    356       "non_independence_addressed": {
    357         "applies": true,
    358         "answer": false,
    359         "justification": "Not discussed. No verification that training samples used for skill handbook learning are independent from test examples."
    360       },
    361       "leakage_detection_method": {
    362         "applies": true,
    363         "answer": false,
    364         "justification": "No concrete leakage detection or prevention methods are used. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipelines."
    365       }
    366     }
    367   },
    368   "claims": [
    369     {
    370       "claim": "SkillOrchestra outperforms SoTA RL-based orchestrators by up to 22.5 percentage points with 700× and 300× learning cost reduction compared to Router-R1 and ToolOrchestra respectively.",
    371       "evidence": "Table 1 shows SkillOrchestra at 47.4 avg EM vs Router-R1 at 41.6. Figure 5 shows MATH: 73.6 vs 55.8 and AMC: 52.5 vs 25.0. Training uses k<50 samples vs Router-R1's 14k samples (Section A.1). Figure 1 (right) shows 84.3% vs ToolOrchestra's 76.3%.",
    372       "supported": "moderate"
    373     },
    374     {
    375       "claim": "Skill-based routing alleviates routing collapse seen in RL-based routing.",
    376       "evidence": "Figure 6 (left) shows Router-R1 selects LLaMA-3.1-70B for 98.02% of calls, while SkillOrchestra distributes: Mixtral-8x22B 44.53%, Qwen2.5-7B 25.99%, LLaMA-3.1-70B 15.38%, Qwen2.5-3B 11.50% (Section 5.1, Observation 3).",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "The learned Skill Handbook transfers across orchestrator backbones without retraining.",
    381       "evidence": "Figure 6 (right) shows a handbook learned from Qwen2.5-3B improving all tested models: Qwen2.5-7B +24.3, LLaMA-3.1-8B +22.5, Mistral-7B +23.3, Mixtral-8x22B +14.8 (Section 5.1, Observation 4).",
    382       "supported": "moderate"
    383     },
    384     {
    385       "claim": "SkillOrchestra achieves the best performance-cost trade-off in full agent orchestration, outperforming GPT-5, Claude Opus 4.5, and Gemini 3 Pro as orchestrators.",
    386       "evidence": "Figure 1 (right) shows SkillOrchestra at 84.3% accuracy, $72.7 cost vs ToolOrchestra 76.3%, $92.7; GPT-5 74.6%, $120.4; Claude Opus 4.5 77.9%, $758.1; Gemini 3 Pro 78.9%, $1729.3 (Section 5.2, Observation 5).",
    387       "supported": "moderate"
    388     },
    389     {
    390       "claim": "Optimal skill granularity depends on orchestrator capacity; more skills are not always better.",
    391       "evidence": "Table 2 ablation shows: No Fine-Grained Skills achieves 80.4% at $15.1 cost vs Full System at 85.0% at $9.3. Disabling fine-grained skills degrades both accuracy and efficiency (Section 5.2, Observation 6).",
    392       "supported": "moderate"
    393     }
    394   ],
    395   "red_flags": [
    396     {
    397       "flag": "No error bars or variance reporting",
    398       "detail": "All results across all experiments (Tables 1-2, Figures 1, 5) are single-run point estimates with no standard deviations, confidence intervals, or multi-seed results. Given known high variance in LLM-based systems, the reported differences could be within noise."
    399     },
    400     {
    401       "flag": "No contamination discussion despite using old benchmarks",
    402       "detail": "Benchmarks like TriviaQA (2017), HotpotQA (2018), NQ (2019), and MATH (2021) are evaluated with models trained years later (Qwen2.5, LLaMA-3.1, GPT-5). These benchmarks are almost certainly in the training data, yet contamination is never discussed."
    403     },
    404     {
    405       "flag": "No limitations section",
    406       "detail": "The paper contains no limitations, threats to validity, or scope boundaries discussion. This is a significant omission for a paper making broad claims about 'scalable, interpretable, and sample-efficient orchestration.'"
    407     },
    408     {
    409       "flag": "LLM-based components as confound",
    410       "detail": "The Skill Handbook is discovered and refined by an LLM-based discoverer and reflector (e.g., GPT-5). The improvement could partially stem from GPT-5's reasoning ability rather than the skill abstraction framework itself. This confound is not addressed."
    411     },
    412     {
    413       "flag": "Salesforce affiliation without funding disclosure",
    414       "detail": "Three of six authors are from Salesforce AI Research, a company with commercial interest in AI orchestration. No funding or competing interests statement is provided."
    415     },
    416     {
    417       "flag": "Every experiment shows improvement",
    418       "detail": "Across all 10 benchmarks, all metrics, and all ablations, SkillOrchestra always improves over baselines. No negative results, failed configurations, or settings where the method underperforms are reported, which is suspicious."
    419     }
    420   ],
    421   "cited_papers": [
    422     {
    423       "title": "Router-R1: Teaching LLMs Multi-Round Routing and Aggregation via Reinforcement Learning",
    424       "authors": ["Haozhen Zhang", "Tao Feng", "Jiaxuan You"],
    425       "year": 2025,
    426       "relevance": "SoTA RL-based multi-turn routing approach that SkillOrchestra directly outperforms; demonstrates routing collapse problem."
    427     },
    428     {
    429       "title": "ToolOrchestra: Elevating Intelligence via Efficient Model and Tool Orchestration",
    430       "authors": ["Hongjin Su", "Shizhe Diao", "Ximing Lu"],
    431       "year": 2025,
    432       "arxiv_id": "2511.21689",
    433       "relevance": "RL-trained agent orchestrator using GRPO for multi-tool coordination; primary baseline in agent orchestration experiments."
    434     },
    435     {
    436       "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
    437       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    438       "year": 2024,
    439       "relevance": "Cascade-based LLM routing approach for cost-efficient model selection; representative of heuristic routing methods."
    440     },
    441     {
    442       "title": "RouteLLM: Learning to Route LLMs from Preference Data",
    443       "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"],
    444       "year": 2025,
    445       "relevance": "Discriminative routing approach using preference data; BERT Router baseline in the evaluation."
    446     },
    447     {
    448       "title": "RouterDC: Query-Based Router by Dual Contrastive Learning for Assembling Large Language Models",
    449       "authors": ["Shuhao Chen", "Weisen Jiang", "Baijiong Lin"],
    450       "year": 2024,
    451       "relevance": "Contrastive learning-based router for LLM selection; discriminative routing baseline."
    452     },
    453     {
    454       "title": "GraphRouter: A Graph-Based Router for LLM Selections",
    455       "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"],
    456       "year": 2025,
    457       "relevance": "Graph-based formulation for LLM routing decisions; discriminative routing baseline."
    458     },
    459     {
    460       "title": "RouterBench: A Benchmark for Multi-LLM Routing System",
    461       "authors": ["Qitian Jason Hu", "Jacob Bieker", "Xiuyu Li"],
    462       "year": 2024,
    463       "arxiv_id": "2403.12031",
    464       "relevance": "Benchmark and KNN/MLP router baselines for multi-LLM routing evaluation."
    465     },
    466     {
    467       "title": "Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning",
    468       "authors": ["Bowen Jin", "Hansi Zeng", "Zhenrui Yue"],
    469       "year": 2025,
    470       "relevance": "RL-trained LLM for search-augmented reasoning; no-routing baseline in evaluation."
    471     },
    472     {
    473       "title": "A Survey of Frontiers in LLM Reasoning: Inference Scaling, Learning to Reason, and Agentic Systems",
    474       "authors": ["Zixuan Ke", "Fangkai Jiao", "Yifei Ming"],
    475       "year": 2025,
    476       "relevance": "Survey of LLM reasoning and agentic systems; motivates the need for effective orchestration in compound AI systems."
    477     },
    478     {
    479       "title": "Towards an AI Co-Scientist",
    480       "authors": ["Juraj Gottweis", "Wei-Hung Weng", "Alexander Daryin"],
    481       "year": 2025,
    482       "arxiv_id": "2502.18864",
    483       "relevance": "Compound AI system for scientific discovery illustrating the need for multi-model coordination."
    484     },
    485     {
    486       "title": "FACT, FETCH, AND REASON: A Unified Evaluation of Retrieval-Augmented Generation",
    487       "authors": ["Satyapriya Krishna", "Kalpesh Krishna", "Anhad Mohananey"],
    488       "year": 2024,
    489       "arxiv_id": "2409.12941",
    490       "relevance": "FRAMES benchmark used in agent orchestration evaluation."
    491     },
    492     {
    493       "title": "Proximal Policy Optimization Algorithms",
    494       "authors": ["John Schulman", "Filip Wolski", "Prafulla Dhariwal"],
    495       "year": 2017,
    496       "arxiv_id": "1707.06347",
    497       "relevance": "PPO algorithm used for training RL-based routing baselines like Router-R1."
    498     }
    499   ]
    500 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs