scan.json (26172B)
1 { 2 "paper": { 3 "title": "Gorilla: Large Language Model Connected with Massive APIs", 4 "authors": ["Shishir G. Patil", "Tianjun Zhang", "Xin Wang", "Joseph E. Gonzalez"], 5 "year": 2023, 6 "venue": "Neural Information Processing Systems", 7 "arxiv_id": "2305.15334" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "Gorilla, a finetuned LLaMA-7B model, surpasses GPT-4 on API call generation across TorchHub, HuggingFace, and TensorHub benchmarks while significantly reducing hallucination. Retriever-aware training enables adaptation to test-time documentation changes. The paper introduces APIBench with 1,645 APIs and 16,450 instruction-API pairs, and uses AST sub-tree matching for functional correctness evaluation.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "The abstract states 'Gorilla's code, model, data, and demo are available at https://gorilla.cs.berkeley.edu'." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The paper states code, model, data, and demo are available at gorilla.cs.berkeley.edu. The APIBench dataset is described as released." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "Training hyperparameters are listed in Table 4 (learning rate, batch size, etc.) and hardware is mentioned (8xA100 40G), but no requirements.txt, Dockerfile, or detailed dependency specification is provided." 29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are provided in the paper. The paper points to a website but does not include commands or scripts to replicate experiments." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results in Tables 1-3 are point estimates (e.g., '59.13% accuracy') with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims Gorilla outperforms GPT-4 and other models based solely on comparing accuracy numbers. No statistical significance tests are reported." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "The paper reports percentage improvements with baseline context, e.g., '20.43% better than GPT-4 and 10.75% better than ChatGPT' and '83%' improvement over LLAMA (Section 4.1)." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "No justification is provided for the dataset sizes (10 instruction-API pairs per API, 90/10 or 80/20 train/test splits). No power analysis or sample size rationale." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "No variance, standard deviation, or multi-run results are reported. All results appear to be single-run numbers." 61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "The paper compares against GPT-4, GPT-3.5-turbo, Claude, and LLaMA-7B across multiple retrieval settings (Table 1)." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "GPT-4 (gpt-4-0314), GPT-3.5-turbo, and Claude (claude-v1) were state-of-the-art models at the time of publication in 2023." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": true, 77 "justification": "Table 2 compares Gorilla trained with vs. without retrieval, and tests each with different retrieval methods at inference time. Section 4.1 discusses the impact of retrieval during finetuning vs. evaluation." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "The paper reports overall accuracy, hallucination rate, and error rate as separate metrics (Table 1). Constraint-aware accuracy is reported as an additional metric (Table 3)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "No human evaluation is included. All evaluation is automated via AST sub-tree matching." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": true, 92 "justification": "Section 4 states: 'we have maintained a holdout test set on which we report our findings. The holdout test set was created by dividing the self-instruct dataset's instruction, API pairs into training and testing sets.' Appendix 8.2 specifies 90/10 for HuggingFace and 80/20 for Torch/Tensor Hub." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down per API hub (TorchHub, HuggingFace, TensorHub) and per retrieval setting (0-shot, BM25, GPT-Index, Oracle) in Tables 1-3." 98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 4.1 discusses hallucination examples from GPT-4 on HuggingFace (Figure 9), and Figure 1 shows failure cases for GPT-4 and Claude. The paper categorizes errors into hallucination vs. wrong API." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports that adding a non-optimal retriever at test time can hurt performance: 'adding a non-optimal retriever at test time will sometime misguide the model and result in more errors' with specific degradation numbers (21.50% in TorchHub, 47.57% in HuggingFace)." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract claims Gorilla 'surpasses the performance of GPT-4 on writing API calls' which is supported by Table 1. Claims about hallucination reduction and test-time adaptation are supported by Tables 1-2 and Figure 6." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The paper makes causal claims about finetuning and retrieval-aware training improving performance. Table 2 provides controlled ablation comparing training with vs. without retrieval, which is an adequate single-variable manipulation." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": false, 124 "justification": "The title claims 'Large Language Model Connected with Massive APIs' but the evaluation only covers ML model hubs (TorchHub, HuggingFace, TensorHub). The paper acknowledges in Section 6 that they 'chose ML APIs' but the title and framing suggest broader applicability than tested." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": false, 129 "justification": "No discussion of alternative explanations. For example, the paper doesn't consider whether Gorilla's advantage is due to memorizing the specific API database used for both training and testing, or whether synthetic instruction generation introduces biases." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": false, 134 "justification": "The paper measures AST sub-tree matching accuracy as a proxy for 'ability to use tools via API calls' but does not discuss this gap. AST matching checks syntactic correctness of API calls but doesn't verify whether the API call actually accomplishes the user's task. The paper acknowledges they have an execution system but 'that is not a focus of this paper.'" 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Specific model checkpoints are given: 'gpt-4-0314', 'gpt-3.5-turbo-0301', 'claude-v1', 'LLaMA-7B', and retriever 'text-davinci-003' (Section 4)." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Figure 8 provides full example prompts including the retrieval augmentation format. The paper specifies the exact concatenation template: '<user_prompt> Use this API documentation for reference: <retrieved_API_doc_JSON>'." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Table 4 lists training hyperparameters: learning rate 2e-5, batch size 64, 5 epochs, warmup ratio 0.03, weight decay 0, max seq length 2048." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. Gorilla is a single-pass finetuned model that optionally has a retriever prepended to the input." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 3.1 describes the full data collection pipeline: scraping model hubs, filtering criteria (top 20 per domain for HuggingFace, filtering models with poor documentation), converting to JSON format, and generating 10 instructions per API via GPT-4 self-instruct with 3 in-context examples." 162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 6 'Limitations & Social Impacts' discusses limitations including the ML-domain focus and potential bias from ML APIs." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": false, 173 "justification": "The limitations section only discusses the ML domain focus and generic concerns about biased ML predictions. It does not address specific threats like the synthetic evaluation setup, potential data leakage, or the limited scope of AST-based evaluation." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": false, 178 "justification": "The limitations section mentions ML APIs as a limitation but does not explicitly state what the results do NOT show. No specific exclusions or non-claims are articulated." 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "The paper states that code, model, data, and demo are available at gorilla.cs.berkeley.edu, and the dataset of 16,450 instruction-API pairs is released." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 3.1 describes data collection in detail: scraping model hubs, filtering criteria, converting model cards to JSON with specific fields, and generating instructions via GPT-4 self-instruct." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. The data sources are public API hubs (HuggingFace, TorchHub, TensorHub), which are standard public resources." 196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "The pipeline is documented: 1,645 APIs scraped → converted to JSON → 10 instructions generated per API via self-instruct → 16,450 pairs → split into train/test. Filtering counts are given (925 HuggingFace, 626 TensorHub after filtering, 95 TorchHub)." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Section 7 states: 'This research is supported in part by gifts to UC Berkeley Sky Computing Lab from Astronomer, Google, IBM, Intel, Lacework, Microsoft, Nexla, Samsung SDS, Uber, and VMware.'" 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are listed: UC Berkeley and Microsoft Research. One author (Xin Wang) is from Microsoft Research." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": false, 217 "justification": "Microsoft is both a funder (listed in acknowledgments) and has an author on the paper (Xin Wang, Microsoft Research). Microsoft has commercial interest in LLM tool use. Google, also a funder, competes in LLM space. The funders are not independent of the outcome." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is provided. Given funding from multiple major tech companies and an author from Microsoft Research, a competing interests declaration is warranted but absent." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "No training data cutoff date is stated for GPT-4, GPT-3.5, Claude, or the base LLaMA model used for Gorilla." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": false, 234 "justification": "The evaluation uses APIBench which is constructed from public model hubs. These API descriptions may appear in LLM training data, but no overlap analysis is performed. Additionally, the self-instruct data was generated by GPT-4, and GPT-4 is also a baseline — this creates potential contamination that is not discussed." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "The API documentation from HuggingFace, TorchHub, and TensorHub is publicly available and likely in GPT-4 and GPT-3.5 training data. This contamination risk is not addressed." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference cost, latency, or token consumption is reported for any model. The paper does not mention API costs for GPT-4/GPT-3.5 calls or inference time for Gorilla." 284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "Training hardware is mentioned (8xA100 40G) but total GPU hours, training time, or API costs for generating the self-instruct data are not stated." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "No multi-seed results reported. All results appear to be from single training runs." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": false, 300 "justification": "The number of experimental runs is never stated. Results appear to be single-run." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search budget is reported. The hyperparameters in Table 4 appear fixed with no mention of search." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": false, 310 "justification": "No discussion of how the final configuration was selected or whether multiple configurations were tried." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "No statistical tests are performed at all, let alone corrections for multiple comparisons across the many model × retriever × dataset comparisons." 316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors evaluate their own system (Gorilla) against baselines without acknowledging author-evaluation bias. They control the benchmark, the evaluation metric, and the training pipeline." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "Gorilla is a finetuned 7B model compared against much larger models (GPT-4) but the compute difference is not discussed. GPT-4 uses zero-shot inference while Gorilla was specifically finetuned on the target API data — this asymmetry is not addressed." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": false, 330 "justification": "AST sub-tree matching is used as the evaluation metric but the paper does not discuss whether matching API function signatures actually measures the ability to correctly use APIs. The paper acknowledges they have execution capability but chose not to use it." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": true, 334 "answer": false, 335 "justification": "Gorilla is finetuned on the target API data while baselines are prompted zero-shot or with retrieval. This is a confound between finetuning-on-task vs. general capability that is not addressed — the comparison is between a specialist model and general-purpose models." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "The benchmark uses API documentation from public model hubs that existed before the training data cutoff of GPT-4 and GPT-3.5. This temporal leakage is not discussed." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": false, 347 "justification": "The oracle retriever setting provides the ground-truth API documentation to the model, which is a form of answer leakage in evaluation (acknowledged as an upper bound but not analyzed as feature leakage)." 348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Training and test data are generated from the same APIs via the same self-instruct process. Different instruction variants of the same API appear in both train and test. This non-independence is not discussed." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection or prevention method is used." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "Gorilla surpasses GPT-4 on writing API calls, achieving 20.43% better accuracy in zero-shot setting.", 364 "evidence": "Table 1 shows Gorilla zero-shot accuracy of 59.13% on TorchHub vs GPT-4's 38.70%, 71.68% vs 19.80% on HuggingFace, and 83.79% vs 18.20% on TensorHub.", 365 "supported": "moderate" 366 }, 367 { 368 "claim": "Gorilla substantially mitigates hallucination compared to other LLMs.", 369 "evidence": "Table 1 shows Gorilla zero-shot hallucination rates of 6.98%, 10.95%, 5.40% across three hubs, compared to GPT-4's 36.55%, 37.16%, 78.65%.", 370 "supported": "moderate" 371 }, 372 { 373 "claim": "Retriever-aware training enables Gorilla to adapt to test-time API documentation changes.", 374 "evidence": "Figure 6 shows three qualitative examples where changing the retrieved document changes Gorilla's output API call accordingly. No quantitative evaluation of adaptation is provided.", 375 "supported": "weak" 376 }, 377 { 378 "claim": "Finetuning with retrieval achieves 12.37% better results than without retrieval on TorchHub and 23.46% better on HuggingFace (with oracle retriever).", 379 "evidence": "Table 2 compares Gorilla trained with vs. without oracle retriever, showing 67.20% vs 54.83% on TorchHub and 91.26% vs 45.58% on HuggingFace with oracle at inference.", 380 "supported": "moderate" 381 }, 382 { 383 "claim": "Adding a non-optimal retriever at test time can hurt performance.", 384 "evidence": "Section 4.1 reports performance drops when using BM25 or GPT-Index with a model finetuned without retrieval: 21.50% in TorchHub and 47.57% in HuggingFace.", 385 "supported": "strong" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "Circular evaluation design", 391 "detail": "Gorilla is finetuned on instruction-API pairs from the same APIBench dataset used for evaluation. The train/test split is from the same distribution (same APIs, same self-instruct process), giving it an inherent advantage over baselines that did not see this specific data format." 392 }, 393 { 394 "flag": "Unfair baseline comparison", 395 "detail": "Gorilla is finetuned specifically on API call data while baselines (GPT-4, Claude) are tested zero-shot or with simple retrieval. This tests domain-specific finetuning vs. general capability, not a fair model-to-model comparison." 396 }, 397 { 398 "flag": "No statistical rigor", 399 "detail": "All results are single-run point estimates with no error bars, confidence intervals, or significance tests. Claims of superiority are based on raw number comparisons." 400 }, 401 { 402 "flag": "Qualitative-only evidence for key claim", 403 "detail": "The test-time adaptation claim — a major selling point — is supported only by three cherry-picked qualitative examples (Figure 6) with no systematic quantitative evaluation." 404 }, 405 { 406 "flag": "Self-instruct evaluation bias", 407 "detail": "GPT-4 was used to generate the instruction data, and GPT-4 is also a baseline. The synthetic instructions may be biased toward patterns GPT-4 generates well or poorly in ways that affect the comparison." 408 }, 409 { 410 "flag": "HuggingFace evaluation relaxed for baselines", 411 "detail": "Section 4.1 notes: 'for HuggingFace, for all the models except Gorilla, we only check if they can provide the correct domain names.' This means the evaluation is easier for baselines on HuggingFace, yet Gorilla still uses full AST matching — making the comparison inconsistent." 412 } 413 ], 414 "cited_papers": [ 415 { 416 "title": "Toolformer: Language models can teach themselves to use tools", 417 "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"], 418 "year": 2023, 419 "arxiv_id": "2302.04761", 420 "relevance": "Seminal work on training LLMs to use tools, directly relevant to agentic AI capabilities." 421 }, 422 { 423 "title": "Evaluating large language models trained on code", 424 "authors": ["Mark Chen", "Jerry Tworek"], 425 "year": 2021, 426 "arxiv_id": "2107.03374", 427 "relevance": "Introduces Codex and HumanEval benchmark for LLM code generation evaluation." 428 }, 429 { 430 "title": "Self-instruct: Aligning language model with self generated instructions", 431 "authors": ["Yizhong Wang", "Yeganeh Kordi"], 432 "year": 2022, 433 "arxiv_id": "2212.10560", 434 "relevance": "Core methodology used for generating Gorilla's training data; key technique in LLM training pipelines." 435 }, 436 { 437 "title": "HuggingGPT: Solving AI tasks with ChatGPT and its friends in HuggingFace", 438 "authors": ["Yongliang Shen", "Kaitao Song"], 439 "year": 2023, 440 "arxiv_id": "2303.17580", 441 "relevance": "Concurrent work on connecting LLMs to APIs for task solving via HuggingFace models." 442 }, 443 { 444 "title": "TaskMatrix.AI: Completing tasks by connecting foundation models with millions of APIs", 445 "authors": ["Yaobo Liang", "Chenfei Wu"], 446 "year": 2023, 447 "arxiv_id": "2303.16434", 448 "relevance": "Proposes connecting foundation models with millions of APIs for task completion, directly related to tool-use scaling." 449 }, 450 { 451 "title": "Reflexion: an autonomous agent with dynamic memory and self-reflection", 452 "authors": ["Noah Shinn", "Beck Labash", "Ashwin Gopinath"], 453 "year": 2023, 454 "arxiv_id": "2303.11366", 455 "relevance": "Agentic AI framework with self-reflection, relevant to LLM agent capabilities." 456 }, 457 { 458 "title": "ReAct: Synergizing reasoning and acting in language models", 459 "authors": ["Shunyu Yao", "Jeffrey Zhao"], 460 "year": 2022, 461 "arxiv_id": "2210.03629", 462 "relevance": "Foundational work on combining reasoning and tool use in LLMs." 463 }, 464 { 465 "title": "Competition-level code generation with AlphaCode", 466 "authors": ["Yujia Li", "David Choi"], 467 "year": 2022, 468 "relevance": "Benchmark work on LLM code generation at competition level." 469 }, 470 { 471 "title": "PAL: Program-aided language models", 472 "authors": ["Luyu Gao", "Aman Madaan"], 473 "year": 2022, 474 "arxiv_id": "2211.10435", 475 "relevance": "Demonstrates LLMs using code as a tool for reasoning, related to tool-use capabilities." 476 }, 477 { 478 "title": "StarCoder: may the source be with you!", 479 "authors": ["Raymond Li", "Loubna Ben Allal"], 480 "year": 2023, 481 "arxiv_id": "2305.06161", 482 "relevance": "Open-source code LLM relevant to code generation capabilities." 483 } 484 ] 485 }