scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (31927B)
      1 {
      2   "paper": {
      3     "title": "R2-ROUTER: A New Paradigm for LLM Routing with Reasoning",
      4     "authors": [
      5       "Jiaqi Xue",
      6       "Qian Lou",
      7       "Jiarong Xing",
      8       "Heng Huang"
      9     ],
     10     "year": 2026,
     11     "venue": "arXiv",
     12     "arxiv_id": "2602.02823"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval"],
     17   "key_findings": "R2-ROUTER jointly selects the best LLM and output length budget by modeling each LLM as a quality-cost curve rather than a single point, achieving comparable quality at 4-5× lower cost than reactive routers. The approach is data-efficient, reaching near-optimal performance with only 6-8 anchor points via piecewise linear interpolation. R2-BENCH, the first routing dataset capturing LLM behavior across diverse output length budgets, raises the oracle upper bound by 15% in AUDC over single-response datasets. The method generalizes to unseen LLMs and out-of-distribution queries.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": false,
     23         "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper. The system is described algorithmically but no implementation is released."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "R2-BENCH is constructed and described in detail (Section 3) but no download link or data repository is provided. The paper states it integrates queries from 6 public benchmarks and collects LLM responses, but the collected dataset itself is not released."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper mentions hardware (8 NVIDIA B200 GPUs for data collection, single RTX 3090 for training) and some software choices (Adam optimizer, PyTorch implicitly), but provides no requirements.txt, Dockerfile, or detailed dependency listing with library versions."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": false,
     38         "justification": "No step-by-step reproduction instructions are provided. The methodology is described at an algorithmic level (Section 4.3, 5.3) but there are no scripts, commands, or README instructions to replicate the experiments."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "All tables (Tables 1-5) report mean ± standard deviation across 5 independent runs. Section 5.5 states: 'All experiments are conducted over 5 independent runs with different random seeds, and we report the mean and standard deviation of the results.'"
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "Despite claiming R2-ROUTER 'outperforms' baselines across multiple settings, no statistical significance tests are reported. Comparisons are made solely by comparing point estimates with standard deviations, without p-values, confidence intervals on differences, or any formal testing."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Effect sizes are reported with baseline context throughout: '4-5× lower cost' (Section 6.1), 'AUDC improves by 5% and cost drops by 80%' (Section 6.2), and tables provide absolute values for both R2-ROUTER and baselines (e.g., AUDC 0.71 vs 0.67 in Table 1)."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "R2-BENCH contains 30,968 queries with 15 LLMs and 16 cost levels. No justification is given for why these specific numbers were chosen, and no power analysis is discussed."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": true,
     65         "justification": "Standard deviations across 5 runs are reported in all results tables (Tables 1-5). Section 5.5 explicitly states results are averaged over 5 independent runs with different random seeds."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Multiple baselines are compared: MIRT-IRT and NIRT-IRT (Song et al., 2025), CARROT-KNN and CARROT-Linear (Somerstep et al., 2025), and UniRouter (Jitkrittum et al., 2025). Section 5.2 describes these as 'top-performing routing methods from the RouterArena benchmark.'"
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "All baselines are from 2025 publications and are described as top-performing methods from the RouterArena leaderboard (Top-1, Top-3, Top-6 rankings). These are the current state of the art for LLM routing."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Section 6.5 presents extensive ablations: embedding model choice (Table 2, MiniLM vs Qwen3-Embedding), predictor architecture (Table 3, MLP vs LGBM), judge robustness (Table 4, DeepSeek-V3.1 vs Qwen3-80B), and length-constraint prompt impact (Table 5). Section 6.4 analyzes sensitivity to the number of interpolation heads."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "Three evaluation metrics are used throughout: Area Under the Deferral Curve (AUDC), Peak Quality, and Query-Normalized Cost (QNC), as defined in Section 5.5."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "The routing system's outputs are evaluated entirely via automated metrics (AUDC, QNC, Peak Quality). While 30 human annotators validated the LLM judge (Section 3.1, Pearson ρ=0.82), this validates the measurement instrument, not the routing system's outputs."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Section 3 states R2-BENCH uses 'a standard train/test split.' The OOD experiment (Section 6.3) uses STEM disciplines for training and non-STEM for testing. Train/test separation is explicitly maintained."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": false,
    102         "justification": "R2-BENCH spans 20 categories from 6 benchmarks (Appendix B.1 lists these), but main results are reported only as aggregate deferral curves (Figure 5) and overall metrics. No per-benchmark or per-category performance breakdown is provided."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": false,
    107         "justification": "No failure cases of the routing system are discussed. Appendix A notes that small models have low compliance with length constraints (as low as 3%), but the authors argue this doesn't undermine the method. No analysis of when R2-ROUTER makes wrong routing decisions or performs poorly."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": true,
    112         "justification": "Several negative results are reported: small models show very poor compliance at tight budgets (Figure 8, as low as 3%), and Figure 7 shows that using only K=2 interpolation heads performs poorly (QNC ≈0.45). The compliance heatmap honestly shows where the approach faces challenges."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "Abstract claims are largely supported: '4-5× lower cost' is shown in Figure 5 and Section 6.1; 'state-of-the-art performance' is demonstrated across Tables 1-5; 'near-optimal performance with minimal overhead (20 minutes on a single GPU)' is only approximately supported, because Section 5.4 reports approximately 30 minutes rather than 20; the integration with UniRouter, improving AUDC by 5% and reducing cost by 80%, is shown in Section 6.2."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper claims the reasoning step causes the improvement over reactive routers. This is supported by controlled ablations: Table 5 isolates the effect of length-constraint prompts from routing logic, Tables 2-3 show robustness across architectures, and Figure 7 shows performance degrades with fewer anchor points. The ablation design adequately supports the causal attribution."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The title claims 'A New Paradigm for LLM Routing' and the abstract says 'This work opens a new direction: routing as reasoning.' Results are on a single dataset (R2-BENCH) with 15 specific open-source LLMs and 6 benchmarks. The OOD test (STEM→non-STEM within MMLU-Pro) is limited. The paper does not explicitly bound generalization to the tested setting or acknowledge what settings were NOT tested."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "Table 5 and Section 6.5 directly address the most natural alternative explanation: that gains come from simply prompting models to be concise rather than from reasoning-based routing. The paper shows that reactive baselines with length-constraint prompts still underperform R2-ROUTER, because the prompts don't change the selection logic."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper measures 'quality' via an LLM judge (Qwen3-80B-Instruct) and explicitly acknowledges this is a proxy. Section 3.1 validates the judge against 30 human annotators (Pearson ρ=0.82) and explains why automated metrics like exact match are insufficient for open-ended queries. Table 4 further tests robustness by switching to a different judge (DeepSeek-V3.1)."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Specific model versions are listed in Table 6 and Appendix B.2: Qwen3-235B-A22B-Instruct, LLaMA-3.1-70B-Instruct, Mistral-7B-v0.2, Qwen2.5-Math-7B-Instruct, GLM-4.5-Air, GLM-4.6, etc. These include version identifiers and instruction-tuned variants."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "The length-constraint instruction is provided ('use at most k tokens'), but the LLM judge prompt used for quality scoring is not provided. Section 3.1 references the 'LLM-as-a-judge validation protocol (Zheng et al., 2023)' but does not include the actual prompt text. The judge prompt is critical to the evaluation as it determines all quality scores."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": true,
    156         "justification": "Section 5.3 reports: embedding dimension (1024), MLP hidden dimensions [256, 128, 64], MSE loss, Adam optimizer with learning rate 1×10⁻⁴, 100 training epochs, K anchor costs. The 16 specific token budgets are listed in Section 5.1."
    157       },
    158       "scaffolding_described": {
    159         "applies": false,
    160         "answer": false,
    161         "justification": "No agentic scaffolding is used. R2-ROUTER is a standard ML pipeline (embedding encoder + MLP prediction heads + decision maker) without agent loops, tool use, or iterative reasoning."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": true,
    166         "justification": "Section 3.1 describes the full dataset construction pipeline: source benchmarks selected, LLM pool defined, 16 token budgets applied via prompt constraints, truncation enforcement, quality scoring by LLM judge, human validation of judge. Appendix B provides additional details on benchmark composition and LLM pricing."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "There is no dedicated limitations or threats-to-validity section. The Impact Statement (end of Section 7) briefly discusses potential positive societal impact and mentions extending to other variables, but does not discuss methodological limitations."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "No specific threats to validity are discussed. The Impact Statement says 'We do not foresee any specific negative societal consequences' but this addresses societal impact, not methodological validity. No discussion of specific weaknesses of the approach, dataset, or evaluation."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The paper does not explicitly state what the results do NOT show. No discussion of untested settings, populations excluded, or claims the authors are not making. The title and abstract make broad claims about 'a new paradigm' and 'a new direction' without bounding these to the tested configuration."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "R2-BENCH responses and quality scores are not publicly available. Only aggregated results are presented. No data download link is provided."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Section 3.1 describes the data collection pipeline: 6 source benchmarks, 15 LLMs queried under 16 token budgets, responses scored by Qwen3-80B-Instruct judge validated against 30 human annotators. Appendix B details benchmark composition and LLM pricing."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "30 'expert annotators' are mentioned for judge validation (Section 3.1) but no description of how they were recruited, what qualifies them as 'experts,' their background, or potential selection bias."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": true,
    205         "justification": "The pipeline from queries through LLM responses to quality scores is documented in Section 3.1 and Figure 4: benchmark queries → LLM responses under token budgets → LLM judge scoring → train/test split. Appendix A validates the length-constraint mechanism."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": false,
    212         "justification": "No funding sources are disclosed anywhere in the paper. There is no acknowledgments section listing grants, corporate sponsors, or funding agencies."
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly stated: University of Central Florida, Rice University, and University of Maryland College Park. These are academic institutions, not companies whose products are being evaluated."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "Since funding is not disclosed, independence cannot be verified. The authors are at academic institutions evaluating open-source LLMs they did not create, which suggests low conflict risk, but the absence of any funding disclosure prevents assessment."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial disclosure statement is included in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper uses 15 LLMs (including Qwen3-235B, DeepSeek-V3, LLaMA-3.1-70B) on benchmarks like MATH (2021) and MMLU-Pro (2024), but never states the training data cutoff dates for any of the models."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether the LLMs' training data includes examples from MATH, MMLU-Pro, GPQA, or other benchmarks used. Models trained in 2025-2026 could have seen these benchmarks during training."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "MATH was published in 2021, MMLU-Pro in 2024, and GPQA in 2024. Models like Qwen3-235B and DeepSeek-V3 trained well after these benchmarks were publicly available. No contamination analysis or discussion is provided."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "This is a benchmark evaluation paper. The 30 annotators validated the LLM judge but are not research participants in a human subjects study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human subjects study is conducted. The annotators provided quality judgments for measurement validation, not as research participants."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human subjects study. The 30 expert annotators' demographics are not reported but this is a benchmark-eval paper, not a human subjects study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human subjects study is conducted."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human subjects study is conducted."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human subjects study is conducted."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human subjects study is conducted."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Per-token costs are listed in Table 6 from OpenRouter pricing. Routing overhead is quantified: '<400 ms on average, accounting for less than 1% of the total LLM generation time' (Section 5.4). Cost is central to the paper and extensively reported."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": true,
    293         "justification": "Section 5.4 states: '8 NVIDIA B200 GPUs for efficient data collection' and 'training routers for 15 LLMs takes approximately 30 minutes' on a single NVIDIA RTX 3090. The paper also notes the dataset 'can alternatively be constructed via API calls.'"
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Section 5.5: 'All experiments are conducted over 5 independent runs with different random seeds, and we report the mean and standard deviation of the results.' Standard deviations are shown in all tables."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": true,
    305         "justification": "Section 5.5 explicitly states '5 independent runs with different random seeds.'"
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "Hyperparameters are reported (hidden dims [256,128,64], lr=1e-4, 100 epochs) but no search budget is stated. No description of how these values were selected or how many configurations were tried."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The final configuration (3-layer MLP, specific hidden dimensions, learning rate) is presented without justification for why these specific values were chosen. No description of selection on a validation set or comparison of alternative configurations for the main architecture."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical significance tests are performed, so multiple comparison correction is inapplicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors evaluate their own R2-ROUTER against baselines they re-implemented. No acknowledgment of the systematic bias documented by Lucic et al. (2018) where authors' implementations of baselines tend to underperform."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "While training cost is stated (30 minutes on RTX 3090) and inference overhead is quantified (<400ms), performance is not systematically reported as a function of compute budget. No compute-matched comparison with baselines."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "Six benchmarks are used (GPQA, MuSR, MMLU-Pro, MATH, OpenHermes, RAGBench) without any discussion of whether they collectively or individually measure what the paper claims to evaluate. No construct validity analysis is provided."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "No scaffolding is involved. R2-ROUTER directly calls LLMs with prompts; there is no agentic scaffold confound."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "Benchmarks like MATH (2021) and MMLU-Pro (2024) were published before the training dates of models like Qwen3-235B and DeepSeek-V3. No discussion of whether the models' training data includes benchmark solutions."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks answer information. The length-constraint prompts could interact with model behavior in unexpected ways, but this is not analyzed from a leakage perspective."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No analysis of whether train and test examples in R2-BENCH share structural similarities or overlap across the 6 source benchmarks."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No concrete leakage detection or prevention method is applied. No canary strings, membership inference, n-gram overlap analysis, or decontamination pipeline."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "R2-ROUTER achieves comparable quality at 4-5× lower cost than existing routers",
    369       "evidence": "Figure 5 shows deferral curves where R2-ROUTER reaches quality 0.8 at cost ~0.5×10⁻³ while baselines require 4-5× that budget. Section 6.1 discusses the results.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "R2-BENCH raises the oracle upper bound by 15% in AUDC over single-response datasets",
    374       "evidence": "Section 3.2 compares Oracle performance: R2-BENCH achieves AUDC 0.98 vs SPROUT's 0.85, QNC drops from 0.18 to 0.04, and Peak Quality rises from 0.90 to 0.98.",
    375       "supported": "strong"
    376     },
    377     {
    378       "claim": "R2-ROUTER is data-efficient, achieving near-optimal performance with 6-8 anchor points",
    379       "evidence": "Figure 7 and Section 6.4 show QNC converges to ~0.12 with K=6-8 heads, substantially outperforming baselines even at K=4.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "Integrating R2-ROUTER with UniRouter improves AUDC by 5% and reduces QNC by 80%",
    384       "evidence": "Section 6.2 and Figure 6 show UNI-R2ROUTER achieves AUDC 0.623 vs UniRouter's 0.590, and QNC drops from 0.23 to 0.04 on the expanded pool.",
    385       "supported": "moderate"
    386     },
    387     {
    388       "claim": "R2-ROUTER generalizes to out-of-distribution queries",
    389       "evidence": "Table 1 shows performance on STEM→non-STEM transfer: R2-ROUTER achieves AUDC 0.71 vs 0.67 for CARROT-L and 0.63 for MIRT.",
    390       "supported": "moderate"
    391     },
    392     {
    393       "claim": "Large models reliably follow length-constrained instructions with >82% compliance even at 10 tokens",
    394       "evidence": "Figure 8 and Appendix A show compliance heatmap across 15 LLMs and 5 budgets. Qwen3-235B achieves 86% at 10 tokens, >97% at ≥100 tokens.",
    395       "supported": "strong"
    396     },
    397     {
    398       "claim": "Powerful LLMs with constrained budgets can outperform weaker LLMs at comparable cost",
    399       "evidence": "This is the central thesis, supported by the quality-cost curves in Figure 3 and the routing results in Figure 5. The formalization in Theorem 4.3 provides a theoretical guarantee that the reasoning search space dominates the reactive one.",
    400       "supported": "moderate"
    401     }
    402   ],
    403   "red_flags": [
    404     {
    405       "flag": "No limitations section",
    406       "detail": "A paper proposing 'a new paradigm' and 'a new direction' for LLM routing includes no limitations or threats-to-validity discussion. The Impact Statement only addresses societal impact and mentions no methodological limitations."
    407     },
    408     {
    409       "flag": "No code or data release",
    410       "detail": "Despite constructing R2-BENCH as a novel benchmark dataset and proposing R2-ROUTER as a new method, neither code nor data is released. This prevents independent verification and replication."
    411     },
    412     {
    413       "flag": "No contamination discussion",
    414       "detail": "Benchmarks like MATH (2021) and MMLU-Pro (2024) are used with models trained in 2025-2026 that could have seen benchmark solutions during training. If contamination inflates quality scores for certain LLMs, the quality-cost curves and routing decisions would be unreliable. This is never discussed."
    415     },
    416     {
    417       "flag": "LLM judge correlation leaves substantial unexplained variance",
    418       "detail": "The LLM judge (Qwen3-80B-Instruct) achieves Pearson ρ=0.82 with human annotators, meaning roughly 33% of variance is unexplained. All quality scores and thus all routing decisions are based on this judge. The judge was also one of the 15 LLMs being routed, creating a potential self-evaluation confound."
    419     },
    420     {
    421       "flag": "Training time inconsistency",
    422       "detail": "The abstract claims 'near-optimal performance with minimal overhead (20 minutes on a single GPU)' but Section 5.4 states 'training routers for 15 LLMs takes approximately 30 minutes' on a single RTX 3090."
    423     },
    424     {
    425       "flag": "No statistical significance testing",
    426       "detail": "Claims of outperforming baselines are made throughout (e.g., 'achieves state-of-the-art performance') without any significance tests. Given that the mean ± standard deviation ranges overlap in several tables (e.g., Table 1 AUDC: 0.71±0.05 vs 0.67±0.04), the claimed improvements may not be statistically significant."
    427     }
    428   ],
    429   "cited_papers": [
    430     {
    431       "title": "FrugalGPT: How to use large language models while reducing cost and improving performance",
    432       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
    433       "year": 2024,
    434       "relevance": "Pioneering work on cost-efficient LLM usage through cascading model calls, directly compared as a baseline approach to LLM routing."
    435     },
    436     {
    437       "title": "RouterBench: A benchmark for multi-LLM routing system",
    438       "authors": ["Qitian Jason Hu", "Jacob Bieker", "Xiuyu Li"],
    439       "year": 2024,
    440       "arxiv_id": "2403.12031",
    441       "relevance": "Large-scale routing benchmark with 405k inference outcomes from 11 LLMs, foundational dataset for evaluating LLM routing methods."
    442     },
    443     {
    444       "title": "CARROT: A cost aware rate optimal router",
    445       "authors": ["Seamus Somerstep", "Felipe Maia Polo", "Allysson Flavio Melo de Oliveira"],
    446       "year": 2025,
    447       "arxiv_id": "2502.03261",
    448       "relevance": "Top-3 router on RouterArena with separate quality and cost predictors, used as primary baseline and compared directly against R2-ROUTER."
    449     },
    450     {
    451       "title": "IRT-Router: Effective and interpretable multi-LLM routing via item response theory",
    452       "authors": ["Wenhan Song", "Zhi Huang", "Caoyun Cheng"],
    453       "year": 2025,
    454       "arxiv_id": "2506.01048",
    455       "relevance": "Top-1 router on RouterArena using item response theory for quality prediction, key baseline demonstrating the state of point-based routing."
    456     },
    457     {
    458       "title": "Universal model routing for efficient LLM inference",
    459       "authors": ["Wittawat Jitkrittum", "Harikrishna Narasimhan", "Ankit Singh Rawat"],
    460       "year": 2025,
    461       "arxiv_id": "2502.08773",
    462       "relevance": "Router designed for dynamic LLM pools that generalizes to unseen models, integrated with R2-ROUTER as a case study for plug-in compatibility."
    463     },
    464     {
    465       "title": "RouterDC: Query-based router by dual contrastive learning for assembling large language models",
    466       "authors": ["Shuhao Chen", "Weisen Jiang", "Baijiong Lin"],
    467       "year": 2024,
    468       "relevance": "Neural network-based LLM router using contrastive learning, representative of the predictive routing approach that R2-ROUTER extends."
    469     },
    470     {
    471       "title": "Route to reason: Adaptive routing for LLM and reasoning strategy selection",
    472       "authors": ["Zekun Pan", "Kai Zhang", "Yiming Zhao"],
    473       "year": 2025,
    474       "arxiv_id": "2505.19435",
    475       "relevance": "Extends routing to (LLM, reasoning strategy) pairs but without explicit cost control, closely related to R2-ROUTER's joint selection approach."
    476     },
    477     {
    478       "title": "GraphRouter: A graph-based router for LLM selections",
    479       "authors": ["Tao Feng", "Yanzhen Shen", "Jiaxuan You"],
    480       "year": 2024,
    481       "arxiv_id": "2410.03834",
    482       "relevance": "Graph neural network approach to LLM routing, representing the diversity of routing architectures in the field."
    483     },
    484     {
    485       "title": "RouterArena: An open platform for comprehensive comparison of LLM routers",
    486       "authors": ["Yanxi Lu", "Rundong Liu", "Jingzhou Yuan"],
    487       "year": 2025,
    488       "arxiv_id": "2510.00202",
    489       "relevance": "Comprehensive routing benchmark platform used to rank baselines (MIRT, CARROT, etc.) and contextualize R2-ROUTER's contributions."
    490     },
    491     {
    492       "title": "AutoMix: Automatically mixing language models",
    493       "authors": ["Pranjal Aggarwal", "Aman Madaan", "Ankit Anand"],
    494       "year": 2023,
    495       "arxiv_id": "2310.12963",
    496       "relevance": "Early cascading approach to LLM routing that sequentially queries models by cost, establishing the cascade paradigm that predictive routing improves upon."
    497     },
    498     {
    499       "title": "MetaLLM: A high-performant and cost-efficient dynamic framework for wrapping LLMs",
    500       "authors": ["Quang H. Nguyen", "Trung Dao", "Duy Cuong Hoang"],
    501       "year": 2024,
    502       "arxiv_id": "2407.10834",
    503       "relevance": "Dynamic LLM routing framework that predicts output lengths for cost estimation, a precursor to R2-ROUTER's approach of treating output length as controllable."
    504     },
    505     {
    506       "title": "How well do LLMs compress their own chain-of-thought? A token complexity approach",
    507       "authors": ["Albert Lee", "Ethan Che", "Tongyi Peng"],
    508       "year": 2025,
    509       "arxiv_id": "2503.01141",
    510       "relevance": "Studies how output length affects LLM reasoning quality, providing empirical basis for R2-ROUTER's core assumption that quality varies with output length."
    511     }
    512   ]
    513 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs