scan.json (29989B)
1 { 2 "paper": { 3 "title": "Pay for Hints, Not Answers: LLM Shepherding for Cost-Efficient Inference", 4 "authors": [ 5 "Ziming Dong", 6 "Hardik Sharma", 7 "Evan O'Toole", 8 "Jaya Prakash Champati", 9 "Kui Wu" 10 ], 11 "year": 2026, 12 "venue": "arXiv preprint", 13 "arxiv_id": "2601.22132", 14 "doi": "10.48550/arXiv.2601.22132" 15 }, 16 "scan_version": 2, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "LLM Shepherding requests only a short prefix (hint) from a large model to improve small model accuracy, achieving 42–94% cost reduction relative to LLM-only inference. Reactive Shepherding achieves the highest Accuracy-per-Cost Efficiency (ACE) across all four benchmarks (GSM8K, CNK12, HumanEval, MBPP), outperforming routing and cascading baselines. Even small hints (10–30% of the full LLM response) yield substantial accuracy gains, and the approach generalizes zero-shot from math to code generation tasks.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No repository URL, GitHub link, or code archive is provided anywhere in the paper." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "All four benchmarks are publicly available: GSM8K (Cobbe et al., 2021), CNK12 from OpenR1-Math-220k (OpenR1, 2025), HumanEval (Chen, 2021), and MBPP (Austin et al., 2021). The paper uses these without modifying their contents; the only changes are sampling 21,471 CNK12 problems and re-splitting the 1,319-problem GSM8K test set into 776 test and 543 validation queries." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper mentions NVIDIA RTX 5090 GPU and Groq API but provides no requirements.txt, Dockerfile, or detailed environment/dependency specification." 36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided." 
41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "Tables 1–4 report only point estimates for accuracy and cost. No confidence intervals or error bars are provided despite running 7 independent trials per query." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims shepherding outperforms baselines based solely on comparing numbers in tables. No statistical significance tests (p-values, t-tests, bootstrap tests) are used to support these comparative claims." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Cost reductions are reported as percentages with baseline context (e.g., '42–94% relative to LLM-only inference'), and ACE ratios provide normalized efficiency comparisons (e.g., 2.78 for Reactive Shepherding on MBPP vs 1.76 for ABC). Absolute costs and accuracies are shown for all methods." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "No justification is provided for the sample sizes used. GSM8K test set is 776 (subset of 1,319), CNK12 samples 21,471 from a larger dataset, but neither choice is justified with a power analysis or rationale." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "The paper states '7 independent trials per query and apply majority voting' (Section 4.1) but does not report standard deviation, variance, or any spread measure across these trials." 68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Four baselines are compared: two routing methods (RouteLLM, GraphRouter) and two cascading methods (FrugalGPT, ABC), plus SLM-only and LLM-only reference points." 
75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "All baselines are recent: RouteLLM (Ong et al., 2025), GraphRouter (Feng et al., 2025), FrugalGPT (Chen et al., 2024), and ABC (Kolawole et al., 2025)." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": false, 84 "justification": "The system has multiple components (DeBERTa encoder, hint/no-hint classifier, hint size regressor, cascading decision module) but no systematic ablation removes individual components. Proactive vs reactive comparison and hint size sweeps (Figure 2) exist but are not framed as controlled single-variable ablations." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Results are reported using accuracy, total cost, cost reduction percentage, and the ACE metric (accuracy gain per unit cost)." 90 }, 91 "human_evaluation": { 92 "applies": false, 93 "answer": false, 94 "justification": "The paper evaluates on math (exact numerical match) and code generation (unit test pass/fail) benchmarks where correctness is binary and deterministic. Human evaluation is irrelevant to these claims." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Explicit train/validation/test splits are described: GSM8K (776 test, 543 validation), CNK12 (2,147 test, 1,073 validation, 18,251 train). HumanEval and MBPP are used as zero-shot test sets with no dataset-specific training." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Results are broken down per dataset (Tables 1–4), per strategy type (routing vs cascading vs shepherding), and per operating mode (proactive vs reactive). Figure 3 shows per-dataset cost comparisons under accuracy constraints." 
105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "The paper discusses non-monotonic quality where excessive hints degrade performance (Figure 2, HumanEval), heavy-tailed hint distributions (Section 3, Appendix D), unsolvable queries that exceed SLM capabilities even with 90% hints (8.7% in CNK12), and proactive shepherding underperforming on some datasets." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Several negative results are reported: non-monotonic quality (hints can hurt on HumanEval at high sizes, Figure 2), proactive shepherding achieving lower accuracy than routing on CNK12 (77.9% vs 82.6%), and the framework being limited to verifiable tasks (Section 5)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims match results: '42–94% cost reduction' is supported by Tables 1–4 (67.4% GSM8K, 42.1% CNK12, 44.3% HumanEval, 93.6% MBPP). '2.8× cost reduction vs cascading' matches HumanEval comparison (44.3% vs 15.8%). 'Hints comprising 10–30% improve SLM accuracy' is supported by Figure 2." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "Causal claims ('shepherding reduces costs', 'hints improve accuracy') are supported by controlled comparisons: same datasets, same models, same evaluation protocol across all methods. The oracle cost analysis (Proposition 2.1, Corollary 2.2) provides theoretical justification. Ablation-like comparisons (proactive vs reactive, varying hint sizes) support component-level causal claims." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title 'LLM Shepherding for Cost-Efficient Inference' and conclusion 'a new approach for cost-efficient deployment' frame the contribution broadly. 
However, experiments use only one SLM-LLM pair (Llama 3.2/3.3 from the same family with shared tokenizer) on four benchmarks in two domains (math and code). Section 5 acknowledges some limitations but the core framing extends beyond the tested settings." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper does not discuss alternative explanations for why shepherding works well. For example, the shared tokenizer and model family could be a major factor, but this is only briefly mentioned as a scope limitation in Section 5 rather than as an alternative explanation for the strong results." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures accuracy (exact match for math, unit test pass for code) and monetary cost ($), and frames claims in terms of these specific metrics. The ACE metric is explicitly defined. No gap exists between measurements and claims." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Specific model names with sizes are provided: 'Llama-3.2-3B-Instruct' (SLM), 'Llama-3.3-70B-Versatile' (LLM via Groq API), and 'DeBERTa-v3-large' (predictor backbone). These are identifiable model versions." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": false, 153 "justification": "Appendix E describes prompt structure with a template ('Instruction: Produce a final answer... Question: {query} Hint: {hint}') and mentions 'a fixed few-shot prompt consisting of two example question–answer pairs' for GSM8K, but the actual few-shot examples are not provided. The reader cannot reconstruct the exact prompts used." 
154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Key hyperparameters are reported: temperature 0.3, top-p 0.95 for generation; K=2 or K=3 for reactive consensus; threshold values α and η_hint per dataset (Table 5, Appendix F); Groq API pricing ($0.59 per 1M input tokens, $0.79 per 1M output tokens); AdamW optimizer with EMA (Appendix B.2)." 159 }, 160 "scaffolding_described": { 161 "applies": false, 162 "answer": false, 163 "justification": "No agentic scaffolding is used. The system is a standard predict-then-query pipeline without tools, retry logic, or feedback mechanisms." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 3.1 and Appendix B.1 detail the training data construction pipeline: discretized token budgets at 10% increments, deterministic LLM decoding for labeling, filtering criteria. Appendix C describes outlier filtering with distribution plots (Figures 4–5). Dataset splits are explicitly stated (Section 4.1)." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": true, 175 "justification": "Section 5 'Scope and Future Directions' provides a dedicated discussion of limitations including restriction to verifiable tasks and same model family experiments." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": true, 180 "justification": "Section 5 identifies study-specific threats: (1) the method is only validated on verifiable reasoning tasks where correctness is automatically assessable, (2) SLM-LLM pairs are from the same model family sharing a common tokenizer, and extending to different architectures is an open question." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": true, 185 "justification": "Section 5 explicitly states: 'This work focuses on verifiable reasoning tasks, including mathematical problem-solving and code generation.' 
It also identifies what was NOT tested: open-ended tasks, cross-family model pairs with different tokenizers." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw experimental data (model outputs, per-query results, training labels) is released for independent verification. Only aggregated results in tables are shown." 193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 3.1 and Appendix B.1 describe in detail how training supervision labels were constructed: discretized hint sizes at 10% increments, LLM calls with max_new_tokens, SLM evaluation at each hint level. Benchmark sources are cited with specific versions." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. All data comes from standard public benchmarks (GSM8K, CNK12, HumanEval, MBPP)." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The full pipeline is documented: benchmark sampling (Section 4.1), supervision label construction with 10 LLM calls per query (Appendix B.1), outlier filtering with distribution analysis and counts (Appendix C, Figures 4–5), train/validation/test splits with exact sizes." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": false, 214 "justification": "No funding information, acknowledgments section, or grant numbers are mentioned anywhere in the paper." 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are clearly listed: University of Victoria (Dong, O'Toole, Champati, Wu) and Manipal University (Sharma). They are not evaluating their own commercial product." 
220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "No funding source is disclosed, so independence cannot be assessed. The absence of a funding statement does not confirm the absence of funding." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests statement or financial disclosure is present in the paper." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "The paper does not state the training data cutoff date for Llama-3.2-3B-Instruct or Llama-3.3-70B-Versatile. This is needed to assess whether benchmarks overlap with training data." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether GSM8K (2021), HumanEval (2021), or MBPP (2021) problems appeared in the Llama models' training data. These benchmarks predate the models and could be contaminated." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "GSM8K, HumanEval, and MBPP were all published in 2021, well before Llama 3's training. These benchmarks were widely available online and could be in the training data. This contamination risk is not discussed." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study. All evaluation is automated on benchmarks." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 
264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "Inference cost is central to the paper. Tables 1–4 report per-strategy costs in dollars. API pricing is stated ($0.59/1M input, $0.79/1M output). Section 4.4 reports latency (7.32 ms policy overhead, 384.62 ms SLM generation per run)." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": false, 295 "justification": "While inference hardware is mentioned (NVIDIA RTX 5090) and API costs are reported, the total computational budget for training the DeBERTa-based predictor, generating supervision labels (10 LLM calls per training query), and running outlier analysis is not quantified." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "The paper runs 7 independent trials per query with majority voting but does not report sensitivity across different random seeds or variance in final accuracy/cost metrics." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": true, 307 "justification": "Section 4.1 states: 'we perform 7 independent trials per query and apply majority voting.' Reactive Shepherding uses K=2 or K=3 SLM responses for consensus (Section 3.5, Table 5)." 
308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "Appendix B.2 mentions 'grid search on the validation set to calibrate the policy' but does not state the number of configurations tried, compute spent on the search, or the search grid dimensions." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "Appendix B.2 describes selection via grid search on the validation set, and Table 5 shows the final configurations used per dataset. Configurations are selected to maximize accuracy under cost constraints on validation data, not test data." 318 }, 319 "multiple_comparison_correction": { 320 "applies": false, 321 "answer": false, 322 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "The paper uses 'default parameter settings released by the authors in their respective GitHub repositories' for baselines (Section 4.2) but does not acknowledge that their own method is tuned while baselines use defaults, creating a potential systematic advantage." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": true, 332 "justification": "Cost-performance tradeoffs are central: Tables 1–4 show cost and accuracy for all methods, Figure 3 shows minimum cost to meet accuracy targets, and the ACE metric explicitly normalizes accuracy gain per unit cost." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "The paper uses GSM8K, CNK12, HumanEval, and MBPP without discussing whether these benchmarks validly measure the capabilities claimed (mathematical reasoning, code generation ability). No discussion of construct validity or benchmark limitations." 
338 }, 339 "scaffold_confound_addressed": { 340 "applies": false, 341 "answer": false, 342 "justification": "No scaffolding is involved. All methods use the same direct API/inference pipeline. Comparisons are between inference strategies, not between models in different scaffolds." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "GSM8K (2021), HumanEval (2021), and MBPP (2021) were published years before the Llama 3.2/3.3 models were trained. The paper does not discuss whether these benchmark solutions appeared in training data." 350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup could leak information. The hint mechanism provides LLM partial outputs to the SLM — while this is intentional, the paper does not analyze whether hints contain memorized benchmark answers rather than genuine reasoning." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "The training data for the shepherding model is constructed from the same benchmarks used for evaluation (GSM8K train split for GSM8K test, CNK12 train split for CNK12 test). While train/test splits are used, structural similarity and potential distributional overlap are not discussed." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No concrete leakage detection method (canary strings, membership inference, n-gram overlap, decontamination) is applied to any of the benchmarks." 
365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "LLM Shepherding reduces costs by 42–94% relative to LLM-only inference across four benchmarks.", 371 "evidence": "Tables 1–4 show Reactive Shepherding cost reductions: 67.4% (GSM8K), 42.1% (CNK12), 44.3% (HumanEval), 93.6% (MBPP).", 372 "supported": "strong" 373 }, 374 { 375 "claim": "Reactive Shepherding achieves the highest ACE on all datasets, up to 2.78 on MBPP.", 376 "evidence": "Tables 1–4 show ACE values: 1.97 (GSM8K), 1.25 (CNK12), 1.42 (HumanEval), 2.78 (MBPP). All are highest among compared methods.", 377 "supported": "strong" 378 }, 379 { 380 "claim": "Even hints comprising 10–30% of the full LLM response improve SLM accuracy significantly.", 381 "evidence": "Figure 2 shows SLM accuracy rising substantially with small hints across all four datasets, with diminishing returns beyond 60%.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "Shepherding achieves lower cost than routing and cascading under oracle decision-making.", 386 "evidence": "Corollary 2.2 provides a mathematical proof that c*_shep(q) ≤ c*_route(q) = c*_casc(q) when SLM cost is zero. Oracle rows in Tables 1–4 confirm empirically.", 387 "supported": "strong" 388 }, 389 { 390 "claim": "Shepherding transfers zero-shot from math to code generation, matching cascading accuracy on HumanEval at 2.8× cost reduction.", 391 "evidence": "Table 3 shows Reactive Shepherding matches ABC accuracy (76.2%) with 44.3% cost reduction vs ABC's 15.8%, using a GSM8K-trained model without code-specific training (Section 4.2).", 392 "supported": "moderate" 393 }, 394 { 395 "claim": "This is the first work to exploit token-level budget control for SLM-LLM collaboration.", 396 "evidence": "Section G.5 positions the work against routing, cascading, speculative decoding, and knowledge distillation, arguing none use partial output control. 
No prior work on hint-based collaboration is cited.", 397 "supported": "weak" 398 } 399 ], 400 "red_flags": [ 401 { 402 "flag": "Single model family tested", 403 "detail": "All experiments use Llama 3.2-3B and Llama 3.3-70B, which share the same tokenizer. The paper acknowledges this in Section 5 but does not test any cross-family pairs. The approach's effectiveness may depend on shared tokenization, limiting generalizability." 404 }, 405 { 406 "flag": "No error bars despite multiple trials", 407 "detail": "The paper runs 7 independent trials per query with majority voting but reports only final point estimates in Tables 1–4. Variance across these trials is never shown, making it impossible to assess result stability or whether differences between methods are statistically significant." 408 }, 409 { 410 "flag": "Benchmark contamination risk ignored", 411 "detail": "GSM8K, HumanEval, and MBPP were all published in 2021, years before Llama 3 training. These benchmarks are widely reproduced online. If the LLM has memorized solutions, the 'hints' may simply be regurgitating memorized answers rather than demonstrating genuine reasoning assistance. This could inflate both baseline and shepherding accuracies." 412 }, 413 { 414 "flag": "Potentially unfair baseline comparison", 415 "detail": "Section 4.2 states baselines use 'default parameter settings released by the authors in their respective GitHub repositories,' while Shepherding configurations are tuned per-dataset (Table 5). Baselines may be operating at suboptimal settings for these specific benchmarks." 416 }, 417 { 418 "flag": "No code released", 419 "detail": "No source code, trained models, or reproduction artifacts are provided. The DeBERTa-based predictor and training pipeline cannot be independently verified or reproduced, and the exact prompts cannot be reconstructed because Appendix E gives only a prompt template without the few-shot examples." 420 } 421 ], 422 "cited_papers": [ 423 { 424 "title": "RouteLLM: Learning to Route LLMs from Preference Data", 425 "authors": ["I. Ong", "A. Almahairi", "V. Wu", "W.-L. 
Chiang", "T. Wu", "J. E. Gonzalez", "M. W. Kadous", "I. Stoica"], 426 "year": 2025, 427 "relevance": "Routing framework that learns to direct queries to different LLMs based on preference data, directly compared as a baseline." 428 }, 429 { 430 "title": "GraphRouter: A Graph-Based Router for LLM Selections", 431 "authors": ["T. Feng", "Y. Shen", "J. You"], 432 "year": 2025, 433 "relevance": "Graph-based LLM routing method using heterogeneous graphs for task-query-LLM interactions, used as a routing baseline." 434 }, 435 { 436 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 437 "authors": ["L. Chen", "M. Zaharia", "J. Zou"], 438 "year": 2024, 439 "relevance": "Pioneering LLM cascading approach demonstrating up to 98% cost reduction through learned model combinations." 440 }, 441 { 442 "title": "Agreement-Based Cascading for Efficient Inference", 443 "authors": ["S. Kolawole", "D. Dennis", "A. Talwalkar", "V. Smith"], 444 "year": 2025, 445 "relevance": "Training-free cascading strategy using response agreement for escalation decisions, used as a cascading baseline." 446 }, 447 { 448 "title": "Evaluating Large Language Models Trained on Code", 449 "authors": ["M. Chen"], 450 "year": 2021, 451 "arxiv_id": "2107.03374", 452 "relevance": "Introduces the HumanEval benchmark for code generation evaluation, one of the four evaluation benchmarks used." 453 }, 454 { 455 "title": "Training Verifiers to Solve Math Word Problems", 456 "authors": ["K. Cobbe", "V. Kosaraju", "M. Bavarian", "M. Chen", "H. Jun", "L. Kaiser", "M. Plappert", "J. Tworek", "J. Hilton", "R. Nakano", "C. Hesse", "J. Schulman"], 457 "year": 2021, 458 "arxiv_id": "2110.14168", 459 "relevance": "Introduces GSM8K, a grade-school math benchmark used as one of the primary evaluation datasets." 460 }, 461 { 462 "title": "Program Synthesis with Large Language Models", 463 "authors": ["J. Austin", "A. Odena", "M. Nye", "M. Bosma", "H. Michalewski", "D. 
Dohan", "E. Jiang", "C. Cai", "M. Terry", "Q. Le", "C. Sutton"], 464 "year": 2021, 465 "relevance": "Introduces the MBPP benchmark for entry-level Python programming tasks, used as a code generation evaluation dataset." 466 }, 467 { 468 "title": "The Llama 3 Herd of Models", 469 "authors": ["A. Dubey"], 470 "year": 2024, 471 "relevance": "Describes the Llama 3 model family used as both the SLM (3.2-3B) and LLM (3.3-70B) in all experiments." 472 }, 473 { 474 "title": "Automix: Automatically Mixing Language Models", 475 "authors": ["P. Aggarwal", "A. Madaan", "A. Anand", "S. P. Potharaju", "S. Mishra", "P. Zhou", "A. Gupta", "D. Rajagopal", "K. Kappaganthu", "Y. Yang"], 476 "year": 2024, 477 "relevance": "LLM cascading approach using few-shot self-verification and POMDP-based routing for cost reduction." 478 }, 479 { 480 "title": "Hybrid LLM: Cost-Efficient and Quality-Aware Query Routing", 481 "authors": ["D. Ding", "A. Mallick", "C. Wang", "R. Sim", "S. Mukherjee", "V. Rühle", "L. V. S. Lakshmanan", "A. H. Awadallah"], 482 "year": 2024, 483 "relevance": "Quality-aware LLM query routing that dynamically trades quality for cost based on predicted query difficulty." 484 }, 485 { 486 "title": "Language Model Cascades: Token-Level Uncertainty and Beyond", 487 "authors": ["N. Gupta", "H. Narasimhan", "W. Jitkrittum", "A. S. Rawat", "A. K. Menon", "S. Kumar"], 488 "year": 2024, 489 "relevance": "Token-level uncertainty approach for LLM cascading decisions, closely related to shepherding's token-level control." 490 }, 491 { 492 "title": "MiniLLM: Knowledge Distillation of Large Language Models", 493 "authors": ["Y. Gu", "L. Dong", "F. Wei", "M. Huang"], 494 "year": 2024, 495 "relevance": "Knowledge distillation method for LLMs using reverse KL divergence, complementary approach to inference-time collaboration." 496 }, 497 { 498 "title": "Small Language Models (SLMs) Can Still Pack a Punch: A Survey", 499 "authors": ["S. Subramanian", "V. Elango", "M. 
Gungor"], 500 "year": 2025, 501 "arxiv_id": "2501.05465", 502 "relevance": "Survey of SLM capabilities relevant to the cost-quality tradeoff motivation of shepherding." 503 } 504 ] 505 }