scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (21966B)
      1 {
      2   "paper": {
      3     "title": "FrugalGPT: How to Use Large Language Models While Reducing Cost and Improving Performance",
      4     "authors": ["Lingjiao Chen", "Matei Zaharia", "James Zou"],
      5     "year": 2023,
      6     "venue": "arXiv",
      7     "arxiv_id": "2305.05176"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "FrugalGPT proposes three strategies for reducing LLM inference costs: prompt adaptation, LLM approximation, and LLM cascade. The LLM cascade approach learns to route queries to different LLM APIs based on query difficulty, achieving up to 98% cost reduction while matching GPT-4 performance on HEADLINES, 73% savings on OVERRULING, and 59% on COQA. The paper also demonstrates that cheaper LLMs can complement expensive ones, with GPT-J correcting GPT-4 errors on 6% of HEADLINES queries.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": false,
     18         "justification": "No code repository URL or link to source code is provided anywhere in the paper."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "The datasets used (HEADLINES, OVERRULING, COQA) are publicly available benchmarks with citations provided (Table 2)."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "No environment specifications, dependencies, or library versions are provided."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided. The paper describes the method but lacks implementation details sufficient for reproduction."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": false,
     40         "justification": "Results are reported as point estimates (e.g., 0.872 accuracy, 98% cost savings) with no confidence intervals or error bars."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Claims like 'FrugalGPT can match the performance of the best individual LLM' and '4% accuracy improvement' are made without any statistical significance tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes are reported with context: '98% cost reduction', 'accuracy improved by 1.5% (from 0.857 to 0.872)', '80% cost reduction' (Table 3, Figure 3)."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "Dataset sizes are stated (HEADLINES 10000, OVERRULING 2400, COQA 7982 in Table 2), but no justification is given for the train/test split sizes or for why these sizes are sufficient."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "No variance, standard deviation, or results across multiple runs are reported. All results appear to be single-run."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "All 12 individual LLM APIs serve as baselines, with their performance and cost plotted in Figure 5 alongside FrugalGPT."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Baselines include GPT-4, ChatGPT, and other APIs contemporary to the 2023 publication date."
     73       },
     74       "ablation_study": {
     75         "applies": true,
     76         "answer": false,
     77         "justification": "No ablation study is performed to isolate the contribution of individual components (e.g., the scoring function, the routing strategy, the pruning optimizer)."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Both accuracy and cost are reported as metrics, and the paper analyzes their trade-offs (Figure 5, Table 3)."
     83       },
     84       "human_evaluation": {
     85         "applies": true,
     86         "answer": false,
     87         "justification": "No human evaluation is performed. Evaluation is entirely automated using ground-truth labels."
     88       },
     89       "held_out_test_set": {
     90         "applies": true,
     91         "answer": true,
     92         "justification": "Section 4: 'Each dataset is randomly split into a training set to learn the LLM cascade and a test set for evaluation.'"
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per dataset (HEADLINES, OVERRULING, COQA) in Table 3 and Figure 5, showing different cost savings and accuracy trade-offs."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Section 4 discusses failure cases: 'FrugalGPT is not perfect' — in the third COQA example all LLMs give the same answer, but FrugalGPT still queries all of them, incurring unnecessary cost."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper notes cases where FrugalGPT's scoring function fails to identify reliable answers early, resulting in unnecessary API calls (Figure 5(c) third example)."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Abstract claims of 'up to 98% cost reduction' and '4% accuracy improvement' are supported by Table 3 (98.3% on HEADLINES) and Figure 5 results."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The paper's causal claims are about the effect of using FrugalGPT cascade vs. individual LLMs. The experimental design (same queries, same datasets, controlled comparison) adequately supports these claims."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": false,
    124         "justification": "The title and abstract suggest general applicability ('How to Use Large Language Models'), but results are on only 3 classification/QA datasets. The paper does not bound its claims to these specific task types."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "No discussion of alternative explanations for why the cascade works — e.g., whether the scoring function is overfitting to the training distribution, or whether results would hold under distribution shift."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "The paper measures accuracy on classification/QA tasks and frames this as general 'LLM performance', but doesn't discuss that these closed-form tasks may not represent the open-ended generation tasks where LLMs are most used."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": false,
    141         "justification": "Models are referred to by marketing names only (GPT-4, ChatGPT, GPT-3, GPT-J) without specific version identifiers or snapshot dates. Table 1 lists sizes but not versions."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": false,
    146         "justification": "The paper mentions using few-shot prompts (Table 2 lists number of examples) but does not provide the actual prompt text used for any of the 12 APIs."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": false,
    151         "justification": "No hyperparameters (temperature, top-p, max tokens) are reported for any of the 12 LLM APIs queried. The scoring function's DistilBERT training hyperparameters are also not reported."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. FrugalGPT is a cascade/routing system, not an agentic scaffold."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": false,
    161         "justification": "No description of how queries were formatted for each API, how the train/test split was created, or how responses were parsed for evaluation."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "Section 5 'Discussions, Limitations and Future Prospects' explicitly discusses limitations including the need for labeled examples and distribution matching."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Section 5 mentions specific limitations: 'we need some labeled examples', 'the training examples should be from the same or similar distribution as the test examples', and 'learning the LLM cascade itself requires resources.'"
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The paper does not explicitly state what settings or task types the approach does NOT apply to. Section 5 mentions future work on latency, fairness, privacy but doesn't bound current claims."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "The raw LLM API responses, scoring function predictions, and per-query routing decisions are not released for verification."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "Data collection is described: datasets are cited with references (Table 2), and API cost data is sourced from provider pricing pages as of March 2023 (Table 1)."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants; all data comes from standard public benchmarks."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": false,
    200         "justification": "The pipeline from raw queries to formatted prompts to API responses to accuracy computation is not documented in detail."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": false,
    207         "justification": "No funding sources or acknowledgments section is present in the paper."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Author affiliations (Stanford University) are clearly stated on the first page."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": false,
    217         "justification": "No funding is disclosed, so independence cannot be assessed."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interest statement is present. Notably, Matei Zaharia is a co-founder of Databricks, which has interests in LLM cost optimization."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the 12 LLM APIs used."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether the LLMs were trained on data containing the benchmark datasets (HEADLINES, OVERRULING, COQA)."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "COQA (2019), OVERRULING (2021), and HEADLINES (2021) were all published before the training cutoffs of GPT-4 and ChatGPT. No contamination analysis is provided."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Cost is a central focus: Table 1 provides per-token costs for all 12 APIs, Table 3 shows total costs, and Figure 5 plots cost-accuracy trade-offs."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "The compute budget for training the DistilBERT scoring function and the cascade optimizer is not reported."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": false,
    295         "justification": "No results across multiple random seeds are reported. The train/test split and DistilBERT training appear to be single-run."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "The number of experimental runs is not stated anywhere."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The cascade uses a 'specialized optimizer' (Section 3) but no search budget or number of configurations tried is reported."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "The paper states the optimizer prunes the search space and uses interpolation but does not explain how the final configuration was selected or validated."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors implement FrugalGPT and compare it against individual APIs without acknowledging that they may have tuned their system to outperform on these specific datasets."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "The entire paper is structured around cost-vs-performance trade-offs, with Figure 5 showing performance as a function of cost budget."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "No discussion of whether HEADLINES (gold price classification), OVERRULING (legal classification), and COQA (QA) are representative of real LLM use cases. All three are closed-form classification/QA tasks, not open-ended generation."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No agentic scaffolding is involved; this is a routing/cascade system."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "All three benchmarks predate the LLMs' training cutoffs. No discussion of whether LLMs may have memorized these datasets."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the few-shot examples in prompts leak information about the test distribution."
    348       },
    349       "non_independence_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "The train/test split is described as random, but no analysis is provided of whether train and test examples share structural similarities."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is applied."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "FrugalGPT can match GPT-4 performance with up to 98% cost reduction",
    364       "evidence": "Table 3: On HEADLINES, FrugalGPT achieves the same accuracy as GPT-4 at $0.6 vs $33.1 (98.3% savings). OVERRULING: 73.3% savings. COQA: 59.2% savings.",
    365       "supported": "moderate"
    366     },
    367     {
    368       "claim": "FrugalGPT can improve accuracy over GPT-4 by up to 4% at the same cost",
    369       "evidence": "Figure 5 shows FrugalGPT's Pareto frontier above individual LLM points, with up to ~4% accuracy improvement on OVERRULING at matched cost.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "Cheap LLMs can correct expensive LLMs on 6% of queries",
    374       "evidence": "Figure 4(a): GPT-C, GPT-J, and J1-L each improve GPT-4 by up to 6% on HEADLINES (measured by maximum performance improvement, MPI). Figure 4(c): GPT-3 improves GPT-4 by 13% on COQA.",
    375       "supported": "moderate"
    376     }
    377   ],
    378   "red_flags": [
    379     {
    380       "flag": "No variance or multiple runs",
    381       "detail": "All results appear to be single-run with no error bars, confidence intervals, or seed sensitivity analysis. The cascade's learned strategy depends on the train/test split, which could significantly affect results."
    382     },
    383     {
    384       "flag": "Benchmark contamination unaddressed",
    385       "detail": "All three benchmarks (COQA 2019, OVERRULING 2021, HEADLINES 2021) predate the training of GPT-4 and ChatGPT. LLM accuracy numbers may be inflated by memorization, which would also affect the cascade's learned routing decisions."
    386     },
    387     {
    388       "flag": "Only classification/QA tasks tested",
    389       "detail": "All three datasets are closed-form classification/QA tasks with known labels. The paper's broad title and framing suggest applicability to general LLM use, but open-ended generation tasks (where LLMs are most commonly used) are not evaluated."
    390     },
    391     {
    392       "flag": "Missing hyperparameters and prompts",
    393       "detail": "Temperature, top-p, and other sampling parameters are not reported for any of the 12 APIs. Actual prompts are not provided. These significantly affect both cost and accuracy."
    394     },
    395     {
    396       "flag": "Potential conflict of interest undisclosed",
    397       "detail": "Co-author Matei Zaharia is co-founder of Databricks, which has commercial interests in LLM cost optimization. No competing interests statement is provided."
    398     }
    399   ],
    400   "cited_papers": [
    401     {
    402       "title": "Language models are few-shot learners",
    403       "authors": ["Tom Brown", "Benjamin Mann", "Nick Ryder"],
    404       "year": 2020,
    405       "relevance": "GPT-3 paper; foundational LLM used as one of the cascade APIs in FrugalGPT evaluation."
    406     },
    407     {
    408       "title": "Chain of thought prompting elicits reasoning in large language models",
    409       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    410       "year": 2022,
    411       "arxiv_id": "2201.11903",
    412       "relevance": "Chain-of-thought prompting technique discussed as a prompt engineering strategy for LLM cost-performance trade-offs."
    413     },
    414     {
    415       "title": "FrugalML: How to use ML prediction APIs more accurately and cheaply",
    416       "authors": ["Lingjiao Chen", "Matei Zaharia", "James Y Zou"],
    417       "year": 2020,
    418       "relevance": "Predecessor work on ML API cascading for classification tasks; FrugalGPT extends this to LLM generative APIs."
    419     },
    420     {
    421       "title": "Augmented language models: a survey",
    422       "authors": ["Grégoire Mialon", "Roberto Dessì", "Maria Lomeli"],
    423       "year": 2023,
    424       "arxiv_id": "2302.07842",
    425       "relevance": "Survey of augmented LLM techniques including prompt engineering approaches relevant to cost-performance optimization."
    426     },
    427     {
    428       "title": "LLaMA: Open and efficient foundation language models",
    429       "authors": ["Hugo Touvron", "Thibaut Lavril", "Gautier Izacard"],
    430       "year": 2023,
    431       "arxiv_id": "2302.13971",
    432       "relevance": "Open-weight LLM demonstrating that smaller models can achieve competitive performance, relevant to LLM cost optimization."
    433     },
    434     {
    435       "title": "On the dangers of stochastic parrots: Can language models be too big?",
    436       "authors": ["Emily M Bender", "Timnit Gebru", "Angelina McMillan-Major"],
    437       "year": 2021,
    438       "relevance": "Discusses environmental and social costs of large LLMs, motivating FrugalGPT's cost-reduction approach."
    439     },
    440     {
    441       "title": "Ask me anything: A simple strategy for prompting language models",
    442       "authors": ["Simran Arora", "Avanika Narayan", "Mayee F Chen"],
    443       "year": 2022,
    444       "arxiv_id": "2210.02441",
    445       "relevance": "Demonstrates that aggregating responses from smaller models can match larger model performance, supporting FrugalGPT's diversity hypothesis."
    446     },
    447     {
    448       "title": "GPT-4 technical report",
    449       "authors": ["OpenAI"],
    450       "year": 2023,
    451       "arxiv_id": "2303.08774",
    452       "relevance": "GPT-4 is the primary expensive baseline that FrugalGPT aims to match or exceed at lower cost."
    453     }
    454   ]
    455 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log | Files | Refs