scan.json (29096B)
1 { 2 "paper": { 3 "title": "SLO-Conditioned Action Routing for Retrieval-Augmented Generation: Objective Ablation and Failure Modes", 4 "authors": ["Bharath Nunepalli"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2601.00841", 8 "doi": "10.48550/arXiv.2601.00841" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval", "case-study"], 13 "key_findings": "In SLO-conditioned RAG routing on SQuAD 2.0 (N=200), conservative fixed policies (k=2, guarded generation) are hard to beat: learned Argmax-CE routing provides only modest reward improvements under a quality-first SLO (+0.013 reward, +2.5 percentage points accuracy) at higher token cost. Under a cost-focused SLO, learned policies degenerate into refusal collapse (94-100% refusal rate), a known failure mode of selective prediction without calibrated abstention constraints. Reward-weighted training (Argmax-CE-WT) amplifies noise and overfits to expensive actions, yielding worse reward than fixed baselines.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub repository provided: https://github.com/bh3r1th/rl-rag-slo-controller. Section 5.2 states 'The full codebase for the experiments, including scripts to precompute logged action outcomes, train SLO-conditioned policies, run evaluations, and regenerate the figures, is available.'" 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Uses publicly available SQuAD 2.0 dataset. 
Section 5.2: 'All results in this paper are produced from the public SQuAD 2.0 dataset and the released scripts; no proprietary data or employer resources are used.'" 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper specifies gpt-4.1-nano as the LLM backend and references the GitHub repo, but does not include dependency versions, requirements.txt, Dockerfile, or any environment specification in the paper itself." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": true, 34 "justification": "Section 5.2 states the released codebase includes 'scripts to precompute logged action outcomes, train SLO-conditioned policies, run evaluations, and regenerate the figures.' The repo is described as containing reproduction scripts." 35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Section 8 explicitly acknowledges: 'The paper reports point estimates only; it does not provide confidence intervals, hypothesis tests, or multiple-seed analysis.' Table 1 contains only point estimates." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "Section 8 explicitly states no hypothesis tests are provided. Comparative claims (e.g., 'Argmax-CE improves average reward') are based on comparing raw numbers without statistical testing." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Section 6.1 reports improvements with baseline context: 'Argmax-CE improves average reward from −0.0419 (best fixed) to −0.0287 and accuracy from 0.250 to 0.275, at higher cost (244 to 359 tokens).' Table 1 provides complete baseline and method values for all metrics." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "N=200 examples are used with no justification for this choice and no power analysis. 
Section 8 acknowledges 'Small differences (e.g., a few percentage points) should not be over-interpreted' but does not justify the sample size." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Section 8 explicitly states 'does not provide confidence intervals, hypothesis tests, or multiple-seed analysis.' All results appear to be single-run numbers with no variance or spread measures." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Section 5.3 defines two baselines: 'Fixed-k baseline (action 1): always choose action 1 (k=5, guarded)' and 'Best fixed action: choose the single action that maximizes average reward for the given SLO on the evaluation set.' Results are compared in Table 1." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "The fixed-action baselines are the natural and competitive comparisons for SLO-conditioned RAG routing, a novel framing with no direct prior work. The paper demonstrates these baselines are very strong — often outperforming learned policies — making them genuinely competitive rather than strawmen." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "The paper compares two policy objectives (Argmax-CE vs Argmax-CE-WT, where WT adds per-example reward-margin weighting) across two SLO profiles (quality first vs cheap), showing how each component affects behavior. Section 6.3 isolates the effect of the weighting scheme." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Section 5.1 reports five metrics: accuracy (normalized exact match), avg cost tokens, hallucination rate, refusal rate, and retrieval hit rate. Table 1 reports accuracy, cost, reward, refusal rate, and hit rate." 
84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "All evaluation is automated. Accuracy is normalized exact match on answer strings. No human evaluation of answer quality, routing decisions, or refusal appropriateness is performed." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "Section 4.1 constructs an offline dataset by sampling questions and running all actions. Section 5.1 evaluates on 'N=200 examples from the development set.' It is unclear whether the training and evaluation sets are separate — the paper does not describe a train/test split." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": false, 98 "justification": "Table 1 shows aggregate metrics per condition (SLO × method). No breakdown by question type (answerable vs unanswerable), difficulty, or topic. A single aggregate accuracy/reward number per condition hides variation across question types." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Failure analysis is a central contribution. Section 6.2 documents refusal collapse under cheap SLO (94.5-100% refusal rate). Section 6.3 documents Argmax-CE-WT instability. Section 7.1 provides a detailed discussion of why refusal collapse occurs." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper is structured around reporting negative results: cost-focused policies collapse to refusal (Table 1, cheap SLO rows); Argmax-CE-WT produces worse reward than fixed baselines under quality first; conservative fixed policies are hard to beat. 
Section 6.3 explicitly describes Argmax-CE-WT as a 'common pitfall.'" 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims are well-hedged and supported: 'a strong fixed baseline performs competitively' (Table 1 best-fixed often wins), 'learned policies mainly provide additional cost savings under a quality-focused SLO' (Argmax-CE under quality first), 'can exhibit refusal collapse under a cheap SLO' (Table 1 cheap rows). No overclaiming." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper's causal claims are supported by controlled ablations. The SLO weight comparison isolates the effect of objective design on policy behavior. Section 7.1 provides a mechanistic explanation for refusal collapse (known degeneracy in selective prediction). Claims like 'cost-heavy rewards can make refuse appear disproportionately attractive' are justified by the controlled SLO variation." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": true, 125 "justification": "Generalization is explicitly bounded. Section 8: 'This study is intentionally small and is best read as a systems-oriented case study rather than a benchmark claim.' Multiple specific boundaries are stated: single model, single dataset, offline evaluation, simplified cost proxy." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 7.1 discusses refusal collapse as a 'known degeneracy in selective prediction.' Section 7.2 discusses why higher retrieval coverage doesn't improve reward (distractors, cost). Section 8 discusses offline distribution shift, simplified metrics, and cost proxy limitations as alternative factors." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "Claims match measurement granularity. 
Accuracy is defined as 'normalized exact match on answer strings' (Section 5.1). Cost is 'prompt+completion tokens.' The paper does not frame these as proxies for broader concepts. Section 8 explicitly acknowledges that token count 'is a coarse proxy' for real deployment costs." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section 5.4: 'Generation uses gpt-4.1-nano via the OpenAI API.' This is a specific API model identifier with version (4.1) and size (nano), not a marketing name." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Appendix A provides the full prompt text for all three modes: guarded mode (A.1), auto mode (A.2), and refusal action (A.3). The actual prompt text is given, not just natural-language descriptions." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "No LLM API hyperparameters (temperature, top-p, max tokens) are reported. No policy network training hyperparameters (learning rate, architecture, batch size) are stated. The SLO weight vectors are not explicitly specified." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. The system is a simple retrieve-then-generate pipeline with a routing controller that selects retrieval depth and prompting mode. No tool use, retry logic, or feedback mechanisms." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": false, 162 "justification": "The paper says N=200 examples from the SQuAD 2.0 development set but does not specify how these 200 were selected (random sample? first 200? stratified?). The offline log generation process (Section 4.1) is described at a high level but the sampling procedure is missing." 
163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 8 is titled 'Limitations' and contains substantive discussion across four specific bullet points covering sample size, offline evaluation, retrieval metric limitations, and cost proxy." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": true, 174 "justification": "Section 8 identifies study-specific threats: 'Most reported metrics are estimated on N=200 SQuAD 2.0 dev examples per condition,' 'Any change in model, prompt, or corpus can change the reward landscape,' 'retrieval hit rate... is a coarse proxy and does not capture semantic support, multi-hop reasoning, or non-extractive settings.'" 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": true, 179 "justification": "Multiple explicit scope boundaries: 'best read as a systems-oriented case study rather than a benchmark claim' (Section 8), 'does not alter the retriever or the LLM' (Section 2), 'The goal is not to claim state-of-the-art QA performance' (Section 1). Section 8 lists specific things results do NOT show." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": true, 186 "justification": "SQuAD 2.0 is publicly available. The GitHub repo includes scripts to regenerate the logged action dataset from the public data. Section 5.2: 'All results in this paper are produced from the public SQuAD 2.0 dataset and the released scripts.'" 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": false, 191 "justification": "Section 4.1 describes the offline log generation process (execute all actions per question, record metrics). However, the selection of 200 examples from the SQuAD 2.0 dev set is not documented — no sampling method, stratification, or selection criteria specified." 
192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data source is SQuAD 2.0, a standard public benchmark." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": false, 201 "justification": "The pipeline from SQuAD 2.0 to final analysis has a gap: how 200 examples were selected from the ~12K development set is unexplained. The downstream pipeline (execute actions → compute rewards → train policies) is described but the initial sampling step is missing." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "The Disclaimer section states: 'This work and the accompanying code were created by the author in a personal capacity. They are not affiliated with, endorsed by, or representative of any current or past employer. No proprietary datasets, internal systems, or confidential resources were used.' This effectively discloses the work as unfunded personal research." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "The Disclaimer explicitly states the work is personal and 'not affiliated with, endorsed by, or representative of any current or past employer.' The author lists personal blog and GitHub, making clear this is independent work." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": false, 217 "answer": false, 218 "justification": "The work is unfunded personal research as stated in the Disclaimer. No external funding involved." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is included. The Disclaimer addresses employer affiliation but does not address patents, equity, or other financial interests." 
224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses gpt-4.1-nano to evaluate on SQuAD 2.0 but does not state the model's training data cutoff date. SQuAD 2.0 (2018) was almost certainly in the training data." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "SQuAD 2.0 is a widely-used benchmark from 2018 that was likely in gpt-4.1-nano's training data. The paper does not discuss potential memorization or train/test overlap." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "SQuAD 2.0 has been publicly available since 2018 and is widely included in LLM training data. The paper uses it with a post-2018 model without addressing contamination risk. This could bias the reward landscape and routing analysis." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 
278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Token costs per method are a central metric. Table 1 reports average cost tokens for all conditions (244, 359, 833, 608, 23, 11). Figure 2 plots cost vs accuracy. Cost is one of the SLO dimensions." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "Total API spend, number of API calls, wall-clock time, and compute used for policy training are not reported. The offline log generation requires running all 5 actions for each of 200 questions, but the total compute cost is not quantified." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Section 8 explicitly acknowledges the paper 'does not provide confidence intervals, hypothesis tests, or multiple-seed analysis.' All results appear to be single-run." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "The number of experimental runs is not stated anywhere. Results appear to be from a single run per condition." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. Policy network architecture, learning rate, and other training hyperparameters are not specified, nor is any search procedure described." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "All four conditions (2 objectives × 2 SLO profiles) and both baselines are reported in Table 1. No hidden selection — all tried configurations are shown, including those that perform poorly (refusal collapse, Argmax-CE-WT instability)." 
312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "Multiple conditions are compared across multiple metrics but no statistical tests are performed at all (Section 8 acknowledges this), let alone corrections for multiple comparisons." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors implement both the baselines and the learned policies themselves. No acknowledgment of author-evaluation bias or use of independent evaluation." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Figure 2 explicitly plots 'Average token cost vs. accuracy for learned policies and best fixed-action baselines.' Cost-quality trade-offs are a central analysis dimension." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "The paper uses SQuAD 2.0 as a testbed for RAG routing but does not discuss whether SQuAD 2.0 is a valid benchmark for evaluating SLO-conditioned routing behavior. Section 8 acknowledges metric limitations but not the benchmark's construct validity for the claimed task." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No agentic scaffolding is involved. The routing parameters (retrieval depth, prompting mode) ARE the independent variables being studied, not confounds. The model is held fixed (Section 5.4)." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "SQuAD 2.0 was published in 2018 and gpt-4.1-nano was released in 2025. The model almost certainly encountered SQuAD data during training. This temporal leakage is not discussed." 
344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks answer information. If the model has memorized SQuAD answers, the retrieval-routing analysis may be confounded." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of whether train and test examples share structural similarities or whether the 200 evaluation examples are independent of the offline training data." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, decontamination, or temporal splits." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "A strong fixed baseline (k=2, guarded generation) performs competitively against learned routing policies across SLO profiles.", 365 "evidence": "Table 1 shows the best fixed action (action 0) achieves the best or near-best reward in all conditions. Under quality first, best-fixed reward is −0.0419 vs Argmax-CE's −0.0287 but at much lower cost (244 vs 359 tokens). Under cheap, best-fixed reward (−0.0166) substantially outperforms learned policies.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "Learned quality-first routing (Argmax-CE) provides modest reward improvement over the best fixed policy.", 370 "evidence": "Table 1: Argmax-CE improves reward from −0.0419 to −0.0287 and accuracy from 0.250 to 0.275 under quality first, but at higher token cost (244→359). Single-run N=200 evaluation with no uncertainty quantification.", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "Cost-focused SLO rewards cause learned policies to collapse into refusal behavior.", 375 "evidence": "Table 1, cheap SLO: Argmax-CE achieves 94.5% refusal rate with 1.5% accuracy; Argmax-CE-WT reaches 100% refusal with 0% accuracy. 
Figure 1 visualizes the action distribution collapse. Section 7.1 provides mechanistic explanation.", 376 "supported": "strong" 377 }, 378 { 379 "claim": "Reward-weighted training (Argmax-CE-WT) can amplify imperfections and overfit to expensive actions under quality-first SLO.", 380 "evidence": "Table 1, quality first: Argmax-CE-WT has 833 avg tokens (vs 244 for best-fixed) and reward of −0.1350 (vs −0.0419), yielding worse reward despite the highest retrieval hit-rate (0.791). Section 6.3 explains this as 'a common pitfall.'", 381 "supported": "moderate" 382 }, 383 { 384 "claim": "Increasing retrieval coverage (higher hit-rate) does not necessarily improve end-task reward.", 385 "evidence": "Section 7.2: Argmax-CE-WT produces highest retrieval hit-rate (0.791) but lower reward than the best fixed policy (reward −0.1350 vs −0.0419). Demonstrated for one method/SLO condition.", 386 "supported": "moderate" 387 } 388 ], 389 "red_flags": [ 390 { 391 "flag": "Very small sample size with no uncertainty quantification", 392 "detail": "All results are based on N=200 examples with point estimates only. The paper acknowledges this in Section 8 ('does not provide confidence intervals, hypothesis tests, or multiple-seed analysis') but the differences reported (e.g., 2.5 percentage points of accuracy) are well within noise for this sample size." 393 }, 394 { 395 "flag": "Benchmark contamination unaddressed", 396 "detail": "SQuAD 2.0 (published 2018) was almost certainly in the training data of gpt-4.1-nano (released 2025). If the model has memorized answers, the reward landscape and routing analysis may be confounded — the optimal routing strategy for a contaminated model may differ from that for a clean model. This is not discussed." 397 }, 398 { 399 "flag": "Unclear train/test separation", 400 "detail": "The paper constructs an offline dataset from SQuAD 2.0 dev examples and reports evaluation on N=200 dev examples. It is unclear whether the training and evaluation data overlap. 
Policies may be evaluated on their training data." 401 } 402 ], 403 "cited_papers": [ 404 { 405 "title": "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks", 406 "authors": ["Patrick Lewis", "Ethan Perez", "Aleksandra Piktus", "Fabio Petroni", "Vladimir Karpukhin", "Naman Goyal", "Heinrich Küttler", "Mike Lewis", "Wen-tau Yih", "Tim Rocktäschel", "Sebastian Riedel", "Douwe Kiela"], 407 "year": 2020, 408 "arxiv_id": "2005.11401", 409 "relevance": "Foundational RAG paper establishing the retrieval-augmented generation paradigm that this work applies routing control to." 410 }, 411 { 412 "title": "Dense Passage Retrieval for Open-Domain Question Answering", 413 "authors": ["Vladimir Karpukhin", "Barlas Oğuz", "Sewon Min", "Patrick Lewis", "Ledell Wu", "Sergey Edunov", "Danqi Chen", "Wen-tau Yih"], 414 "year": 2020, 415 "arxiv_id": "2004.04906", 416 "relevance": "Core dense retrieval method for open-domain QA, establishing the retrieval infrastructure that RAG routing systems operate over." 417 }, 418 { 419 "title": "Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection", 420 "authors": ["Akari Asai", "Zeqiu Wu", "Yizhong Wang", "Avirup Sil", "Hannaneh Hajishirzi"], 421 "year": 2023, 422 "arxiv_id": "2310.11511", 423 "relevance": "Adaptive retrieval system that learns when and what to retrieve, directly relevant to the question of per-query control in RAG systems." 424 }, 425 { 426 "title": "RouteLLM: Learning to Route LLMs with Preference Data", 427 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu", "Wei-Lin Chiang", "Tianhao Wu", "Joseph E. Gonzalez", "M. Waleed Kadous", "Ion Stoica"], 428 "year": 2024, 429 "arxiv_id": "2406.18665", 430 "relevance": "LLM routing system that selects between cheaper and more capable models per request; complementary routing approach to the retrieval-depth routing studied here." 
431 }, 432 { 433 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 434 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 435 "year": 2023, 436 "arxiv_id": "2305.05176", 437 "relevance": "Cost-optimization framework for LLM usage via model cascading, directly relevant to the SLO-driven routing problem and cost-quality trade-offs." 438 }, 439 { 440 "title": "Offline Reinforcement Learning: Tutorial, Review, and Perspectives on Open Problems", 441 "authors": ["Sergey Levine", "Aviral Kumar", "George Tucker", "Justin Fu"], 442 "year": 2020, 443 "arxiv_id": "2005.01643", 444 "relevance": "Foundational offline RL survey relevant to the logged-feedback policy learning approach used in this work." 445 }, 446 { 447 "title": "Counterfactual Risk Minimization: Learning from Logged Bandit Feedback", 448 "authors": ["Adith Swaminathan", "Thorsten Joachims"], 449 "year": 2015, 450 "arxiv_id": "1502.02362", 451 "relevance": "Theoretical foundation for learning from logged feedback, directly cited as a potential future direction for the routing policy learning problem." 452 }, 453 { 454 "title": "Retrieval-Augmented Generation for Large Language Models: A Survey", 455 "authors": ["Yunfan Gao", "Yun Xiong", "Xinyu Gao", "Kangxiang Jia", "Jinliu Pan", "Yuxi Bi", "Yi Dai", "Jiawei Sun", "Meng Wang", "Haofen Wang"], 456 "year": 2023, 457 "arxiv_id": "2312.10997", 458 "relevance": "Comprehensive RAG survey covering the design space of retrievers, reranking, and context composition that the routing controller operates over." 459 } 460 ] 461 }