scan.json (29566B)
1 { 2 "paper": { 3 "title": "In-Context Distillation with Self-Consistency Cascades: A Simple, Training-Free Way to Reduce LLM Agent Costs", 4 "authors": [ 5 "Vishnu Sarukkai", 6 "Asanshay Gupta", 7 "James Hong", 8 "Michaël Gharbi", 9 "Kayvon Fatahalian" 10 ], 11 "year": 2025, 12 "venue": "arXiv", 13 "arxiv_id": "2512.02543", 14 "doi": "10.48550/arXiv.2512.02543" 15 }, 16 "scan_version": 2, 17 "active_modules": ["experimental_rigor", "data_leakage"], 18 "methodology_tags": ["benchmark-eval"], 19 "key_findings": "In-context distillation—retrieving teacher demonstrations and inserting them into a cheaper student model's prompt—combined with self-consistency cascades achieves 2.5× cost reduction on ALFWorld at teacher-level accuracy (96% vs the teacher's 89%) and 2× cost reduction on AppWorld at iso-accuracy. The method requires no fine-tuning or weight updates and generalizes to open-weight models (Llama-3.3-70B). Breakeven versus always using the teacher occurs after only 843 episodes on ALFWorld. All results appear to be single-run point estimates; no error bars or variance are reported.", 20 "checklist": { 21 "artifacts": { 22 "code_released": { 23 "applies": true, 24 "answer": false, 25 "justification": "No code repository URL, GitHub link, or archive is provided anywhere in the paper." 26 }, 27 "data_released": { 28 "applies": true, 29 "answer": true, 30 "justification": "The paper uses publicly available benchmarks: ALFWorld (Shridhar et al., 2020) and AppWorld (Trivedi et al., 2024). No proprietary data was collected beyond teacher demonstrations, which are not released." 31 }, 32 "environment_specified": { 33 "applies": true, 34 "answer": false, 35 "justification": "The paper mentions MiniLM-L6-v2 for embeddings and API access to OpenAI/Anthropic, but provides no requirements.txt, Dockerfile, or detailed environment specification with library versions." 
36 }, 37 "reproduction_instructions": { 38 "applies": true, 39 "answer": false, 40 "justification": "No step-by-step reproduction instructions, README, or scripts are provided. The method is described algorithmically (Algorithms 1-2) but without runnable reproduction guidance." 41 } 42 }, 43 "statistical_methodology": { 44 "confidence_intervals_or_error_bars": { 45 "applies": true, 46 "answer": false, 47 "justification": "All results are reported as single point estimates (e.g., '0.87 accuracy', '0.96 accuracy'). No confidence intervals or error bars appear in any table or figure." 48 }, 49 "significance_tests": { 50 "applies": true, 51 "answer": false, 52 "justification": "The paper claims its method 'dominates alternatives' and 'establishes a new Pareto frontier' based solely on comparing raw numbers. No statistical significance tests are reported." 53 }, 54 "effect_sizes_reported": { 55 "applies": true, 56 "answer": true, 57 "justification": "Effect sizes are reported with baseline context throughout: '2.5× cost reduction... reducing per-episode costs from $0.059 to $0.024' (Section 6.1), accuracy improvements from 0.18 to 0.87 (Section 6.1), and detailed Tables 1, 4, 5 provide absolute and relative comparisons." 58 }, 59 "sample_size_justified": { 60 "applies": true, 61 "answer": false, 62 "justification": "Test sets are 134 tasks (ALFWorld) and 168 tasks (AppWorld), determined by the benchmarks. No power analysis or justification for whether these sizes are sufficient for the comparative claims made." 63 }, 64 "variance_reported": { 65 "applies": true, 66 "answer": false, 67 "justification": "No variance, standard deviation, or spread measures are reported. All results appear to be single-run. There is no mention of multiple experimental runs or seed variation." 
68 } 69 }, 70 "evaluation_design": { 71 "baselines_included": { 72 "applies": true, 73 "answer": true, 74 "justification": "Multiple baselines included: Teacher (Claude Sonnet 4.5), Student (ZS), Student (IC), Student (Cascade only), Random Mix at various teacher fractions (0.2-0.8), and GPT-4.1 zero-shot. See Section 5 and Figure 2." 75 }, 76 "baselines_contemporary": { 77 "applies": true, 78 "answer": true, 79 "justification": "Baselines use current models: Claude Sonnet 4.5, GPT-4.1-mini, GPT-4.1, Llama-3.3-70B. They also compare against CuGA (Marreed et al., 2025), a concurrent compound system." 80 }, 81 "ablation_study": { 82 "applies": true, 83 "answer": true, 84 "justification": "Systematic ablations test each component: IC alone vs Cascade alone vs IC+Cascade (Figure 2), varying k (number of examples, Figure 3), varying DB size (Figure 4), per-step vs single retrieval (Table 3, Appendix A), and varying temperature for cascades." 85 }, 86 "multiple_metrics": { 87 "applies": true, 88 "answer": true, 89 "justification": "Two primary metrics: task accuracy (episode success rate) and cost (normalized to teacher baseline and absolute USD). Both are reported throughout." 90 }, 91 "human_evaluation": { 92 "applies": true, 93 "answer": false, 94 "justification": "No human evaluation of agent outputs or trajectories. All evaluation is automated via benchmark task success criteria (unit tests for AppWorld, goal completion for ALFWorld)." 95 }, 96 "held_out_test_set": { 97 "applies": true, 98 "answer": true, 99 "justification": "Clear train/test separation: ALFWorld uses train split (500 demos) and evaluates on 'eval-out-of-distribution' (134 tasks). AppWorld uses train+val (147 tasks) for demos and evaluates on 'test-normal' (168 tasks)." 100 }, 101 "per_category_breakdown": { 102 "applies": true, 103 "answer": true, 104 "justification": "Table 2 provides per-difficulty breakdown on AppWorld (difficulty 1/2/3). 
Results are also broken down by benchmark, model (GPT-4.1-mini vs Llama-3.3-70B), and retrieval configuration." 105 }, 106 "failure_cases_discussed": { 107 "applies": true, 108 "answer": true, 109 "justification": "Section 4.4 discusses when the method fails: task divergence from demos, ambiguous states, contradictory retrievals. Table 2 shows significant accuracy degradation on difficulty-3 tasks (43% vs teacher's 71%). Section 6.2 reports fine-tuning failure on AppWorld." 110 }, 111 "negative_results_reported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Fine-tuning GPT-4o-mini on AppWorld 'failed when training on our teacher demonstrations' and Llama-3.3-70B 'trained but produced 0% task success despite converging on training loss' (Section 6.2). AppWorld accuracy fluctuates with k values (Section 6.3)." 115 } 116 }, 117 "claims_and_evidence": { 118 "abstract_claims_supported": { 119 "applies": true, 120 "answer": true, 121 "justification": "Abstract claims of 2.5× cost reduction on ALFWorld (from $0.059 to $0.024), matching teacher accuracy, and 2× cost reduction on AppWorld at iso-accuracy are all supported by Section 6.1, Tables 4-5, and Figure 2." 122 }, 123 "causal_claims_justified": { 124 "applies": true, 125 "answer": true, 126 "justification": "Causal claims about which components drive performance ('in-context distillation bridges the teacher-student gap', 'combined system dominates alternatives') are supported by controlled ablations: IC alone, Cascade alone, and IC+Cascade are tested systematically with single-variable differences." 127 }, 128 "generalization_bounded": { 129 "applies": true, 130 "answer": false, 131 "justification": "The title claims 'A Simple, Training-Free Way to Reduce LLM Agent Costs' broadly, but results are on only two benchmarks (ALFWorld and AppWorld). 
The abstract claims the approach 'makes advanced agentic systems economically viable for a broader range of applications' without bounding this generalization to the tested settings." 132 }, 133 "alternative_explanations_discussed": { 134 "applies": true, 135 "answer": false, 136 "justification": "The paper offers one speculative explanation for why the student exceeds the teacher on ALFWorld ('likely because retrieved examples provide... implicit information about environment dynamics') but does not systematically discuss alternative explanations for the observed results, confounds, or robustness to other factors." 137 }, 138 "proxy_outcome_distinction": { 139 "applies": true, 140 "answer": true, 141 "justification": "The paper measures task success rate and inference cost, and frames its results in those same terms. No proxy gap exists—claims match the granularity of measurements." 142 } 143 }, 144 "setup_transparency": { 145 "model_versions_specified": { 146 "applies": true, 147 "answer": true, 148 "justification": "Specific model identifiers are provided: 'Claude Sonnet 4.5' (teacher), 'GPT-4.1-mini' (student), 'GPT-4.1' (comparison), 'Llama-3.3-70B' (open-weight student), 'MiniLM-L6-v2' (embeddings). These are distinct API model names sufficient for reproduction." 149 }, 150 "prompts_provided": { 151 "applies": true, 152 "answer": true, 153 "justification": "Appendix B provides full prompt templates for Plan, ReAct, and Verifier agents, including system_prompt and user_prompt with exact text. The templates show how retrieved examples are inserted." 154 }, 155 "hyperparameters_reported": { 156 "applies": true, 157 "answer": true, 158 "justification": "Section 5 reports: temperature=0.1 (default), max 4096 output tokens, N=3 for self-consistency sampling, k=6 (ALFWorld) and k=3 (AppWorld) for retrieval. Various temperatures (0.1, 0.5, 1.0) tested for cascades." 
159 }, 160 "scaffolding_described": { 161 "applies": true, 162 "answer": true, 163 "justification": "The ReAct-style agent architecture is described in detail (Section 4.1, Algorithm 1), retrieval mechanism with multi-key embeddings (Section 4.3), self-consistency cascade logic (Section 4.4, Algorithm 2), and the complete pipeline (Figure 1)." 164 }, 165 "data_preprocessing_documented": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 4.2 documents demonstration collection: teacher trajectories include observations, reasoning traces, and actions. Dense embeddings computed with MiniLM-L6-v2 for goal, plan, and per-step reasoning. Benchmarks are used as-is with specified train/test splits." 169 } 170 }, 171 "limitations_and_scope": { 172 "limitations_section_present": { 173 "applies": true, 174 "answer": false, 175 "justification": "There is no dedicated Limitations section. Section 7 (Discussion) positions the work positively without substantively discussing limitations. No threats to validity are addressed." 176 }, 177 "threats_to_validity_specific": { 178 "applies": true, 179 "answer": false, 180 "justification": "No specific threats to validity are discussed. The Discussion section focuses on positioning the method's advantages rather than examining potential weaknesses or threats." 181 }, 182 "scope_boundaries_stated": { 183 "applies": true, 184 "answer": false, 185 "justification": "The paper does not explicitly state what the results do NOT show. It acknowledges fine-tuning may be better for 'stable, high-volume production systems' (Section 7) but does not bound claims to the tested benchmarks, models, or task types." 186 } 187 }, 188 "data_integrity": { 189 "raw_data_available": { 190 "applies": true, 191 "answer": false, 192 "justification": "No raw data (teacher trajectories, per-episode results, retrieval logs) is released. Only aggregate results are shown in tables and figures." 
193 }, 194 "data_collection_described": { 195 "applies": true, 196 "answer": true, 197 "justification": "Section 4.2 describes teacher demonstration collection: the teacher model is run on the demonstration tasks (T_demo) to produce complete trajectories with goals, plans, observations, reasoning traces, and actions. Dense embeddings are computed using MiniLM-L6-v2." 198 }, 199 "recruitment_methods_described": { 200 "applies": false, 201 "answer": false, 202 "justification": "No human participants. Data sources are standard public benchmarks (ALFWorld, AppWorld)." 203 }, 204 "data_pipeline_documented": { 205 "applies": true, 206 "answer": true, 207 "justification": "The pipeline is documented: demonstration collection (Section 4.2), embedding and indexing, per-step retrieval (Section 4.3), self-consistency cascade (Section 4.4). Token counts and cost calculations are detailed in Appendix C." 208 } 209 }, 210 "conflicts_of_interest": { 211 "funding_disclosed": { 212 "applies": true, 213 "answer": true, 214 "justification": "Acknowledgements state: 'Support for this project was provided by Reve and Roblox, and API credits were provided by OpenAI and together.ai.'" 215 }, 216 "affiliations_disclosed": { 217 "applies": true, 218 "answer": true, 219 "justification": "Author affiliations are listed: Stanford University (Sarukkai, Gupta, Fatahalian) and Reve (Hong, Gharbi)." 220 }, 221 "funder_independent_of_outcome": { 222 "applies": true, 223 "answer": false, 224 "justification": "OpenAI provided API credits and GPT-4.1-mini (an OpenAI product) is the primary student model shown to perform well. OpenAI has a commercial interest in demonstrating cost-effective use of their cheaper models. Reve employees (Hong, Gharbi) co-authored the paper, meaning a funder was directly involved in research design and outcomes." 225 }, 226 "financial_interests_declared": { 227 "applies": true, 228 "answer": false, 229 "justification": "No competing interests or financial interests statement is provided. 
Authors from Reve (a company) do not declare whether they have equity or other financial interests related to the findings." 230 } 231 }, 232 "contamination": { 233 "training_cutoff_stated": { 234 "applies": true, 235 "answer": false, 236 "justification": "No training data cutoff dates are stated for any of the models used (Claude Sonnet 4.5, GPT-4.1-mini, GPT-4.1, Llama-3.3-70B)." 237 }, 238 "train_test_overlap_discussed": { 239 "applies": true, 240 "answer": false, 241 "justification": "No discussion of whether ALFWorld (2020) or AppWorld (2024) tasks could appear in model training data. ALFWorld has been public for 5 years and is widely referenced." 242 }, 243 "benchmark_contamination_addressed": { 244 "applies": true, 245 "answer": false, 246 "justification": "ALFWorld was published in 2020 and is a well-known benchmark likely present in training data of recent models. AppWorld is from 2024. Neither contamination risk is discussed." 247 } 248 }, 249 "human_studies": { 250 "pre_registered": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "irb_or_ethics_approval": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "demographics_reported": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "inclusion_exclusion_criteria": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 }, 270 "randomization_described": { 271 "applies": false, 272 "answer": false, 273 "justification": "No human participants in this study." 274 }, 275 "blinding_described": { 276 "applies": false, 277 "answer": false, 278 "justification": "No human participants in this study." 279 }, 280 "attrition_reported": { 281 "applies": false, 282 "answer": false, 283 "justification": "No human participants in this study." 
284 } 285 }, 286 "cost_and_practicality": { 287 "inference_cost_reported": { 288 "applies": true, 289 "answer": true, 290 "justification": "Detailed inference costs reported: absolute USD per episode ($0.059 teacher, $0.024 IC+Cascade on ALFWorld; $0.589 teacher, $0.17 IC+Cascade on AppWorld), normalized costs, and full token-level breakdowns in Appendix C (Tables 4-5)." 291 }, 292 "compute_budget_stated": { 293 "applies": true, 294 "answer": true, 295 "justification": "Teacher demonstration costs quantified ($29.50 for 500 ALFWorld demos, $86.73 for 147 AppWorld demos). Per-episode costs detailed with per-model API pricing (Appendix C.1). Breakeven analysis provided (Appendix C.5)." 296 } 297 }, 298 "experimental_rigor": { 299 "seed_sensitivity_reported": { 300 "applies": true, 301 "answer": false, 302 "justification": "No mention of random seeds or seed sensitivity. All results appear to be single-run. Given the stochastic nature of LLM sampling (even at temperature 0.1), seed variation could affect results." 303 }, 304 "number_of_runs_stated": { 305 "applies": true, 306 "answer": false, 307 "justification": "The number of experimental runs is never stated. It appears all results are from single runs. No 'averaged over K runs' statements appear anywhere." 308 }, 309 "hyperparameter_search_budget": { 310 "applies": true, 311 "answer": false, 312 "justification": "While Figures 3-4 sweep k (number of examples) and DB size, no systematic hyperparameter search budget is reported. The search over temperatures and k values is shown but the total compute spent on tuning is not stated." 313 }, 314 "best_config_selection_justified": { 315 "applies": true, 316 "answer": true, 317 "justification": "The selection of k=6 (ALFWorld) and k=3 (AppWorld) is justified via the cost-accuracy analysis in Section 6.3 and Figure 3, showing diminishing returns. The rationale for the chosen operating points is clear." 
318 }, 319 "multiple_comparison_correction": { 320 "applies": true, 321 "answer": false, 322 "justification": "Many pairwise comparisons are made across methods, benchmarks, and configurations without any statistical tests, let alone correction for multiple comparisons." 323 }, 324 "self_comparison_bias_addressed": { 325 "applies": true, 326 "answer": false, 327 "justification": "Authors implement all methods (baselines, ablations, their own) without acknowledging potential bias from evaluating their own system against their own baseline implementations." 328 }, 329 "compute_budget_vs_performance": { 330 "applies": true, 331 "answer": true, 332 "justification": "Figure 2 explicitly plots cost vs performance for all methods. The entire paper is organized around cost-accuracy tradeoffs, with normalized cost on the x-axis and accuracy on the y-axis." 333 }, 334 "benchmark_construct_validity": { 335 "applies": true, 336 "answer": false, 337 "justification": "Section 5 briefly justifies benchmark selection ('tasks that share structural patterns... enabling cross-task learning') but does not discuss whether ALFWorld and AppWorld actually measure the real-world agent cost-reduction capabilities claimed. No discussion of construct validity or comparison with alternative benchmarks." 338 }, 339 "scaffold_confound_addressed": { 340 "applies": true, 341 "answer": true, 342 "justification": "All model comparisons use the same ReAct-style scaffolding framework (Algorithm 1). The scaffold is held constant across teacher, student, and ablation configurations, isolating the model and distillation effects." 343 } 344 }, 345 "data_leakage": { 346 "temporal_leakage_addressed": { 347 "applies": true, 348 "answer": false, 349 "justification": "ALFWorld was published in 2020. Models trained in 2024-2025 could have seen its tasks. No temporal leakage analysis is discussed." 
350 }, 351 "feature_leakage_addressed": { 352 "applies": true, 353 "answer": false, 354 "justification": "No discussion of whether the evaluation setup leaks information. The retrieved teacher demonstrations themselves are a form of information injection, but the paper does not discuss whether benchmark-specific information leaks through retrieval." 355 }, 356 "non_independence_addressed": { 357 "applies": true, 358 "answer": false, 359 "justification": "The paper uses benchmark-provided train/test splits but does not verify independence or discuss potential structural similarities between demonstration tasks and test tasks beyond their different splits." 360 }, 361 "leakage_detection_method": { 362 "applies": true, 363 "answer": false, 364 "justification": "No leakage detection or prevention methods are applied (no canary strings, membership inference, decontamination, or temporal analysis)." 365 } 366 } 367 }, 368 "claims": [ 369 { 370 "claim": "In-context distillation with self-consistency cascades achieves 2.5× cost reduction on ALFWorld at teacher-level accuracy, reducing per-episode costs from $0.059 to $0.024.", 371 "evidence": "Figure 2 and Table 4 show IC+Cascade achieves 96% accuracy at 0.42 normalized cost ($0.024/episode) vs teacher at 89% accuracy ($0.059/episode).", 372 "supported": "strong" 373 }, 374 { 375 "claim": "The combined system exceeds teacher accuracy on ALFWorld (96% vs 89%).", 376 "evidence": "Table 1 and Section 6.1 report 0.96 vs 0.89 accuracy. 
However, this is a single-run result on 134 tasks with no variance reporting, so the difference may not be statistically significant.", 377 "supported": "moderate" 378 }, 379 { 380 "claim": "The method achieves 2× cost reduction on AppWorld at iso-accuracy.", 381 "evidence": "Section 6.1 and Figure 2 show IC+Cascade achieves 0.66 accuracy at 0.29 normalized cost, compared to Random Mix achieving similar accuracy at higher cost.", 382 "supported": "strong" 383 }, 384 { 385 "claim": "The approach generalizes to open-weight LLMs (Llama-3.3-70B).", 386 "evidence": "Table 1 shows Llama-3.3-70B follows similar trends: ALFWorld 0.50→0.87→0.93 (ZS→IC→IC+Cascade), AppWorld 0.11→0.32→0.44.", 387 "supported": "moderate" 388 }, 389 { 390 "claim": "Breakeven versus teacher-only occurs after 843 episodes on ALFWorld and 207 on AppWorld.", 391 "evidence": "Appendix C.5 provides the calculation: N* = $29.50 / ($0.059 - $0.024) ≈ 843 for ALFWorld; N* = $86.73 / $0.42 ≈ 207 for AppWorld, where $0.42 ≈ $0.589 - $0.17 is the per-episode saving.", 392 "supported": "strong" 393 }, 394 { 395 "claim": "The method is competitive with CuGA, a specialized compound agentic system, achieving 65.5% vs 73.2% on AppWorld.", 396 "evidence": "Section 6.2 compares IC+Cascade (65.5%) against CuGA's leaderboard result (73.2%), noting CuGA requires more engineering investment. 
With oracle routing, their method reaches 77.6%.", 397 "supported": "weak" 398 }, 399 { 400 "claim": "In-context distillation is a training-free alternative to fine-tuning.", 401 "evidence": "Section 6.2 reports fine-tuning GPT-4o-mini matched their accuracy on ALFWorld (94% vs 96%), but fine-tuning failed entirely on AppWorld (API failures, 0% success despite training loss convergence).", 402 "supported": "moderate" 403 }, 404 { 405 "claim": "Per-step retrieval reduces cost without sacrificing accuracy compared to single retrieval.", 406 "evidence": "Table 3 (Appendix A) shows per-step retrieval matches single-retrieval accuracy (0.87 vs 0.87 ALFWorld, 0.55 vs 0.54 AppWorld) at lower cost (0.43 vs 0.54, 0.15 vs 0.24).", 407 "supported": "strong" 408 } 409 ], 410 "red_flags": [ 411 { 412 "flag": "No error bars or variance reporting", 413 "detail": "All results are single-run point estimates on relatively small test sets (134 and 168 tasks). The claim that the student exceeds teacher accuracy (96% vs 89% on 134 tasks) could easily fall within random variation. Without variance across seeds or confidence intervals, the statistical reliability of all comparisons is unknown." 414 }, 415 { 416 "flag": "No limitations section", 417 "detail": "The paper has no dedicated limitations section. The Discussion (Section 7) frames the work positively without acknowledging weaknesses such as the narrow benchmark selection, single-run results, or potential contamination concerns." 418 }, 419 { 420 "flag": "Potential funder conflict", 421 "detail": "OpenAI provided API credits and GPT-4.1-mini (OpenAI's product) is the primary student model shown to perform well. Reve is both a funder and has co-authors on the paper. These relationships are disclosed but their potential influence on study design is not discussed." 422 }, 423 { 424 "flag": "Benchmark contamination risk unaddressed", 425 "detail": "ALFWorld has been public since 2020 and is widely used in LLM agent research. 
Models trained in 2024-2025 may have seen its tasks. The paper does not discuss this, yet reports that the student exceeds teacher accuracy—which could be partly explained by training data contamination." 426 }, 427 { 428 "flag": "Broad claims from narrow evaluation", 429 "detail": "The title promises 'A Simple, Training-Free Way to Reduce LLM Agent Costs' broadly, but results are on only two benchmarks (one embodied planning, one API orchestration). No real-world deployment or diverse task domains are tested." 430 } 431 ], 432 "cited_papers": [ 433 { 434 "title": "Self-generated in-context examples improve LLM agents for sequential decision-making tasks", 435 "authors": ["Vishnu Sarukkai", "Zhiqiang Xie", "Kayvon Fatahalian"], 436 "year": 2025, 437 "relevance": "Prior work by same authors on retrieval-augmented agent improvement; directly foundational to the in-context distillation approach." 438 }, 439 { 440 "title": "Large language model cascades with mixture of thoughts", 441 "authors": ["Murong Yue", "Jie Zhao"], 442 "year": 2024, 443 "relevance": "Proposes self-consistency-based cascades for LLM routing, which is the core deferral mechanism adapted in this paper." 444 }, 445 { 446 "title": "FrugalGPT: How to use large language models while reducing cost and improving performance", 447 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 448 "year": 2023, 449 "arxiv_id": "2305.05176", 450 "relevance": "Foundational work on LLM cost reduction through model cascades and routing strategies." 451 }, 452 { 453 "title": "RouteLLM: Learning to route LLMs from preference data", 454 "authors": ["Isaac Ong", "Amjad Almahairi", "Vincent Wu"], 455 "year": 2025, 456 "relevance": "Learning-based LLM routing from preference data; represents the trained-router alternative to self-consistency cascades." 
457 }, 458 { 459 "title": "Language model cascades: Token-level uncertainty and beyond", 460 "authors": ["Neha Gupta", "Harikrishna Narasimhan", "Wittawat Jitkrittum"], 461 "year": 2024, 462 "relevance": "Token-level cascade methods for cost-accuracy tradeoffs in LLM inference." 463 }, 464 { 465 "title": "Distilling step-by-step! Outperforming larger language models with less training data and smaller model sizes", 466 "authors": ["Cheng-Yu Hsieh", "Chun-Liang Li", "Chih-Kuan Yeh"], 467 "year": 2023, 468 "arxiv_id": "2305.02301", 469 "relevance": "Reasoning distillation approach showing smaller models can match larger ones; foundational to the in-context distillation concept." 470 }, 471 { 472 "title": "Self-consistency improves chain of thought reasoning in language models", 473 "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"], 474 "year": 2022, 475 "arxiv_id": "2203.11171", 476 "relevance": "Proposes self-consistency for improving LLM reasoning; the uncertainty signal mechanism adopted for cascade routing." 477 }, 478 { 479 "title": "ReAct: Synergizing reasoning and acting in language models", 480 "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"], 481 "year": 2023, 482 "relevance": "ReAct framework used as the base agent architecture in this paper's experiments." 483 }, 484 { 485 "title": "AppWorld: A controllable world of apps and people for benchmarking interactive coding agents", 486 "authors": ["Harsh Trivedi", "Tushar Khot", "Mareike Hartmann"], 487 "year": 2024, 488 "relevance": "One of the two primary evaluation benchmarks; tests multi-step API workflow agents." 489 }, 490 { 491 "title": "FireAct: Toward language agent fine-tuning", 492 "authors": ["Baian Chen", "Chang Shu", "Ehsan Shareghi"], 493 "year": 2023, 494 "arxiv_id": "2310.05915", 495 "relevance": "Agent fine-tuning approach representing the parametric distillation alternative that this paper aims to avoid." 
496 }, 497 { 498 "title": "AgentTuning: Enabling generalized agent abilities for LLMs", 499 "authors": ["Aohan Zeng", "Mingdao Liu", "Rui Lu"], 500 "year": 2023, 501 "arxiv_id": "2310.12823", 502 "relevance": "Training-based approach to generalizing agent capabilities; contrasts with training-free in-context distillation." 503 }, 504 { 505 "title": "Reflexion: Language agents with verbal reinforcement learning", 506 "authors": ["Noah Shinn", "Federico Cassano", "Ashwin Gopinath"], 507 "year": 2023, 508 "relevance": "Self-improving agent using verbal feedback; related approach to agent capability improvement without weight updates." 509 } 510 ] 511 }