scan.json (21966B)
1 { 2 "paper": { 3 "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance", 4 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"], 5 "year": 2023, 6 "venue": "arXiv", 7 "arxiv_id": "2305.05176" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "FrugalGPT proposes three strategies for reducing LLM inference costs: prompt adaptation, LLM approximation, and LLM cascade. The LLM cascade approach learns to route queries to different LLM APIs based on query difficulty, achieving up to 98% cost reduction while matching GPT-4 performance on HEADLINES, 73% savings on OVERRULING, and 59% on COQA. The paper also demonstrates that cheaper LLMs can complement expensive ones, with GPT-J correcting GPT-4 errors on 6% of HEADLINES queries.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": false, 18 "justification": "No code repository URL or link to source code is provided anywhere in the paper." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The datasets used (HEADLINES, OVERRULING, COQA) are publicly available benchmarks with citations provided (Table 2)." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "No environment specifications, dependencies, or library versions are provided." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided. The paper describes the method but lacks implementation details sufficient for reproduction." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "Results are reported as point estimates (e.g., 0.872 accuracy, 98% cost savings) with no confidence intervals or error bars." 
41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "Claims like 'FrugalGPT can match the performance of the best individual LLM' and '4% accuracy improvement' are made without any statistical significance tests." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Effect sizes are reported with context: '98% cost reduction', 'accuracy improved by 1.5% (from 0.857 to 0.872)', '80% cost reduction' (Table 3, Figure 3)." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "Dataset sizes are stated (HEADLINES 10000, OVERRULING 2400, COQA 7982 in Table 2) but no justification for the train/test split sizes or why these are sufficient." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "All 12 individual LLM APIs serve as baselines, with their performance and cost plotted in Figure 5 alongside FrugalGPT." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Baselines include GPT-4, ChatGPT, and other APIs contemporary to the 2023 publication date." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": false, 77 "justification": "No ablation study is performed to isolate the contribution of individual components (e.g., the scoring function, the routing strategy, the pruning optimizer)." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Both accuracy and cost are reported as metrics, and the paper analyzes their trade-offs (Figure 5, Table 3)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation is performed. 
Evaluation is entirely automated using ground-truth labels." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "Section 4: 'Each dataset is randomly split into a training set to learn the LLM cascade and a test set for evaluation.'" 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down per dataset (HEADLINES, OVERRULING, COQA) in Table 3 and Figure 5, showing different cost savings and accuracy trade-offs." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 4 discusses failure cases: 'FrugalGPT is not perfect' — the third COQA example shows all LLMs give the same answer but FrugalGPT still queries all, wasting cost." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper notes cases where FrugalGPT's scoring function fails to identify reliable answers early, resulting in unnecessary API calls (Figure 5(c) third example)." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "Abstract claims of 'up to 98% cost reduction' and '4% accuracy improvement' are supported by Table 3 (98.3% on HEADLINES) and Figure 5 results." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper's causal claims are about the effect of using FrugalGPT cascade vs. individual LLMs. The experimental design (same queries, same datasets, controlled comparison) adequately supports these claims." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title and abstract suggest general applicability ('How to Use Large Language Models'), but results are on only 3 classification/QA datasets. The paper does not bound its claims to these specific task types." 
125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No discussion of alternative explanations for why the cascade works — e.g., whether the scoring function is overfitting to training distribution, or whether results would hold under distribution shift." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper measures accuracy on classification/QA tasks and frames this as general 'LLM performance', but doesn't discuss that these closed-form tasks may not represent the open-ended generation tasks where LLMs are most used." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": false, 141 "justification": "Models are referred to by marketing names only (GPT-4, ChatGPT, GPT-3, GPT-J) without specific version identifiers or snapshot dates. Table 1 lists sizes but not versions." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": false, 146 "justification": "The paper mentions using few-shot prompts (Table 2 lists number of examples) but does not provide the actual prompt text used for any of the 12 APIs." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": false, 151 "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the 12 LLM APIs queried. The scoring function's DistilBERT training hyperparameters are also not reported." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. FrugalGPT is a cascade/routing system, not an agentic scaffold." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": false, 161 "justification": "No description of how queries were formatted for each API, how the train/test split was created, or how responses were parsed for evaluation." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 5 'Discussions, Limitations and Future Prospects' explicitly discusses limitations including the need for labeled examples and distribution matching." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Section 5 mentions specific limitations: 'we need some labeled examples', 'the training examples should be from the same or similar distribution as the test examples', and 'learning the LLM cascade itself requires resources.'" 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The paper does not explicitly state what settings or task types the approach does NOT apply to. Section 5 mentions future work on latency, fairness, privacy but doesn't bound current claims." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": false, 185 "justification": "The raw LLM API responses, scoring function predictions, and per-query routing decisions are not released for verification." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Data collection is described: datasets are cited with references (Table 2), and API cost data is sourced from provider pricing pages as of March 2023 (Table 1)." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants; all data comes from standard public benchmarks." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": false, 200 "justification": "The pipeline from raw queries to formatted prompts to API responses to accuracy computation is not documented in detail." 
201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": false, 207 "justification": "No funding sources or acknowledgments section is present in the paper." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations (Stanford University) are clearly stated on the first page." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "No funding is disclosed, so independence cannot be assessed." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests or financial interest statement is present. Notably, Matei Zaharia is a co-founder of Databricks, which has interests in LLM cost optimization." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff dates are stated for any of the 12 LLM APIs used." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "No discussion of whether the LLMs were trained on data containing the benchmark datasets (HEADLINES, OVERRULING, COQA)." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "COQA (2019), OVERRULING (2021), and HEADLINES (2021) were all published before the training cutoffs of GPT-4 and ChatGPT. No contamination analysis is provided." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 
257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": true, 283 "justification": "Cost is a central focus: Table 1 provides per-token costs for all 12 APIs, Table 3 shows total costs, and Figure 5 plots cost-accuracy trade-offs." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "The compute budget for training the DistilBERT scoring function and the cascade optimizer is not reported." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No results across multiple random seeds are reported. The train/test split and DistilBERT training appear to be single-run." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is not stated anywhere." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "The cascade uses a 'specialized optimizer' (Section 3) but no search budget or number of configurations tried is reported." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "The paper states the optimizer prunes the search space and uses interpolation but does not explain how the final configuration was selected or validated." 
311 }, 312 "multiple_comparison_correction": { 313 "applies": false, 314 "answer": false, 315 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors implement FrugalGPT and compare it against individual APIs without acknowledging that they may have tuned their system to outperform on these specific datasets." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": true, 325 "justification": "The entire paper is structured around cost-vs-performance trade-offs, with Figure 5 showing performance as a function of cost budget." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "No discussion of whether HEADLINES (gold price classification), OVERRULING (legal classification), and COQA (QA) are representative of real LLM use cases. All three are closed-form classification or QA tasks with reference answers, not open-ended generation." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "No agentic scaffolding is involved; this is a routing/cascade system." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "All three benchmarks predate the LLMs' training cutoffs. No discussion of whether LLMs may have memorized these datasets." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "No discussion of whether the few-shot examples in prompts leak information about the test distribution." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "The train/test split is described as random, but there is no analysis of whether train and test examples share structural similarities." 
353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention method is applied." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "FrugalGPT can match GPT-4 performance with up to 98% cost reduction", 364 "evidence": "Table 3: On HEADLINES, FrugalGPT achieves the same accuracy as GPT-4 at $0.6 vs $33.1 (98.3% savings). OVERRULING: 73.3% savings. COQA: 59.2% savings.", 365 "supported": "moderate" 366 }, 367 { 368 "claim": "FrugalGPT can improve accuracy over GPT-4 by up to 4% at the same cost", 369 "evidence": "Figure 5 shows FrugalGPT's Pareto frontier above individual LLM points, with up to ~4% accuracy improvement on OVERRULING at matched cost.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "Cheap LLMs can correct expensive LLMs on 6% of queries", 374 "evidence": "Figure 4(a): GPT-C, GPT-J, and J1-L each improve GPT-4 by up to 6% on HEADLINES (MPI metric). Figure 4(c): GPT-3 improves GPT-4 by 13% on COQA.", 375 "supported": "moderate" 376 } 377 ], 378 "red_flags": [ 379 { 380 "flag": "No variance or multiple runs", 381 "detail": "All results appear to be single-run with no error bars, confidence intervals, or seed sensitivity analysis. The cascade's learned strategy depends on the train/test split, which could significantly affect results." 382 }, 383 { 384 "flag": "Benchmark contamination unaddressed", 385 "detail": "All three benchmarks (COQA 2019, OVERRULING 2021, HEADLINES 2021) predate the training of GPT-4 and ChatGPT. LLM accuracy numbers may be inflated by memorization, which would also affect the cascade's learned routing decisions." 386 }, 387 { 388 "flag": "Only classification/QA tasks tested", 389 "detail": "All three datasets are closed-form classification or QA tasks with known reference answers. The paper's broad title and framing suggest applicability to general LLM use, but open-ended generation tasks (where LLMs are most commonly used) are not evaluated." 
390 }, 391 { 392 "flag": "Missing hyperparameters and prompts", 393 "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 12 APIs. Actual prompts are not provided. These significantly affect both cost and accuracy." 394 }, 395 { 396 "flag": "Potential conflict of interest undisclosed", 397 "detail": "Co-author Matei Zaharia is co-founder of Databricks, which has commercial interests in LLM cost optimization. No competing interests statement is provided." 398 } 399 ], 400 "cited_papers": [ 401 { 402 "title": "Language models are few-shot learners", 403 "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"], 404 "year": 2020, 405 "relevance": "GPT-3 paper; foundational LLM used as one of the cascade APIs in FrugalGPT evaluation." 406 }, 407 { 408 "title": "Chain of thought prompting elicits reasoning in large language models", 409 "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"], 410 "year": 2022, 411 "arxiv_id": "2201.11903", 412 "relevance": "Chain-of-thought prompting technique discussed as a prompt engineering strategy for LLM cost-performance trade-offs." 413 }, 414 { 415 "title": "FrugalML: How to use ML prediction APIs more accurately and cheaply", 416 "authors": ["Lingjiao Chen", "Matei Zaharia", "James Y Zou"], 417 "year": 2020, 418 "relevance": "Predecessor work on ML API cascading for classification tasks; FrugalGPT extends this to LLM generative APIs." 419 }, 420 { 421 "title": "Augmented language models: a survey", 422 "authors": ["Grégoire Mialon", "Roberto Dessì", "Maria Lomeli"], 423 "year": 2023, 424 "arxiv_id": "2302.07842", 425 "relevance": "Survey of augmented LLM techniques including prompt engineering approaches relevant to cost-performance optimization." 
426 }, 427 { 428 "title": "LLaMA: Open and efficient foundation language models", 429 "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"], 430 "year": 2023, 431 "arxiv_id": "2302.13971", 432 "relevance": "Open-weight LLM demonstrating that smaller models can achieve competitive performance, relevant to LLM cost optimization." 433 }, 434 { 435 "title": "On the dangers of stochastic parrots: Can language models be too big?", 436 "authors": ["Emily M Bender", "Timnit Gebru", "Angelina McMillan-Major"], 437 "year": 2021, 438 "relevance": "Discusses environmental and social costs of large LLMs, motivating FrugalGPT's cost-reduction approach." 439 }, 440 { 441 "title": "Ask me anything: A simple strategy for prompting language models", 442 "authors": ["Simran Arora", "Avanika Narayan", "Mayee F Chen"], 443 "year": 2022, 444 "arxiv_id": "2210.02441", 445 "relevance": "Demonstrates that aggregating responses from smaller models can match larger model performance, supporting FrugalGPT's diversity hypothesis." 446 }, 447 { 448 "title": "GPT-4 technical report", 449 "authors": ["OpenAI"], 450 "year": 2023, 451 "arxiv_id": "2303.08774", 452 "relevance": "GPT-4 is the primary expensive baseline that FrugalGPT aims to match or exceed at lower cost." 453 } 454 ] 455 }