scan.json (27632B)
1 { 2 "scan_version": 2, 3 "active_modules": ["experimental_rigor", "data_leakage"], 4 "paper": { 5 "title": "Evaluating and Mitigating Errors in LLM-Generated Web API Integrations", 6 "authors": ["Daniel Maninger", "Leon Chemnitz", "Amir Molzam Sharifloo", "Tushar Lamba", "Jannis Brugger", "Mira Mezini"], 7 "year": 2025, 8 "venue": "ACM Transactions on Software Engineering and Methodology", 9 "arxiv_id": "2509.20172" 10 }, 11 "methodology_tags": ["benchmark-eval"], 12 "key_findings": "Open-source LLMs struggle significantly with generating correct web API invocation code, with the best open-source model (Code Llama 70B) achieving only 30% correctness on full completion tasks. Constrained decoding, which automatically translates OpenAPI specifications to regex-based constraints, eliminates all illegal URLs, methods, and arguments, improving overall correctness by 90-135% on average. The technique is particularly effective for argument completion (in-IDE) scenarios, making mid-size open-source models competitive with larger commercial models.", 13 "checklist": { 14 "artifacts": { 15 "code_released": { 16 "applies": true, 17 "answer": true, 18 "justification": "GitHub repository provided: https://github.com/stg-tud/WAPIIBench (Section 1, footnote 3). Zenodo artifact also provided: https://doi.org/10.5281/zenodo.13758414 (footnote 4)." 19 }, 20 "data_released": { 21 "applies": true, 22 "answer": true, 23 "justification": "The WAPIIBench dataset is available on GitHub and all model-generated codes are provided in the Zenodo artifact (Section 1, Appendix A)." 24 }, 25 "environment_specified": { 26 "applies": true, 27 "answer": false, 28 "justification": "Appendix D lists technologies used (Hugging Face Transformers, Axios, etc.) and Appendix E lists hyperparameters (fp16, 1 beam, temperature 0), but no requirements.txt, Dockerfile, or detailed environment setup with library versions is mentioned." 
29 }, 30 "reproduction_instructions": { 31 "applies": true, 32 "answer": false, 33 "justification": "No step-by-step reproduction instructions are described in the paper. The GitHub repo is referenced but no README or reproduction guide is mentioned." 34 } 35 }, 36 "statistical_methodology": { 37 "confidence_intervals_or_error_bars": { 38 "applies": true, 39 "answer": false, 40 "justification": "All results are reported as point estimates (e.g., '30% correctness', '0.75 precision') with no confidence intervals or error bars." 41 }, 42 "significance_tests": { 43 "applies": true, 44 "answer": false, 45 "justification": "The paper claims constrained decoding 'significantly improves' correctness but provides no statistical significance tests. Comparisons between models and between constrained/unconstrained are based solely on comparing numbers." 46 }, 47 "effect_sizes_reported": { 48 "applies": true, 49 "answer": true, 50 "justification": "Relative correctness gains are reported with baseline context: e.g., '+90% average gain' for full completion, '+135%' for argument completion, with specific per-model breakdowns from baseline to constrained performance (Figure 5, Tables 18a-d)." 51 }, 52 "sample_size_justified": { 53 "applies": true, 54 "answer": false, 55 "justification": "The dataset has 395 samples (one per endpoint across 4 APIs). No justification is given for why these 4 APIs or this sample size is sufficient." 56 }, 57 "variance_reported": { 58 "applies": true, 59 "answer": false, 60 "justification": "Greedy decoding (temperature=0) is used, producing deterministic single-run results. No variance across runs is reported because there is only one run per model. This means results could be sensitive to prompt wording or other factors without any measure of stability." 
61 } 62 }, 63 "evaluation_design": { 64 "baselines_included": { 65 "applies": true, 66 "answer": true, 67 "justification": "Unconstrained LLM generation serves as the baseline for evaluating constrained decoding. Multiple models across families are compared (Section 3, Tables 3-4)." 68 }, 69 "baselines_contemporary": { 70 "applies": true, 71 "answer": true, 72 "justification": "Models include recent families: DeepSeek-Coder-V2, Qwen2.5-Coder, Llama 3.1, GPT-4o, and GPT-4o mini, all contemporary at time of writing." 73 }, 74 "ablation_study": { 75 "applies": true, 76 "answer": false, 77 "justification": "No ablation study of the constraint components. The paper does not test, e.g., constraining only URLs vs. only arguments vs. all, to measure individual contribution of each constraint type." 78 }, 79 "multiple_metrics": { 80 "applies": true, 81 "answer": true, 82 "justification": "Multiple fine-grained metrics are used: correct implementations, correct URLs, illegal URLs, correct methods, argument precision, argument recall, argument value conditional accuracy, and more (Table 2)." 83 }, 84 "human_evaluation": { 85 "applies": true, 86 "answer": false, 87 "justification": "Evaluation is entirely automated via the mock execution pipeline. No human evaluation of generated code quality beyond the initial dataset curation." 88 }, 89 "held_out_test_set": { 90 "applies": true, 91 "answer": false, 92 "justification": "The entire dataset of 395 samples is used for evaluation. There is no train/dev/test split since the models are not fine-tuned, but the same dataset is used for both analysis and reporting, with no held-out portion." 93 }, 94 "per_category_breakdown": { 95 "applies": true, 96 "answer": true, 97 "justification": "Results are broken down by API (Tables 12-17), by model family, and by completion setup (full vs. argument). Per-model-family analysis is provided." 
98 }, 99 "failure_cases_discussed": { 100 "applies": true, 101 "answer": true, 102 "justification": "Section 3.2 discusses specific failure patterns: Qwen2.5-Coder refusing to continue starter code, Llama 3.1 skipping method parts, hallucinated endpoints and arguments. An error taxonomy is described in the answer to RQ1." 103 }, 104 "negative_results_reported": { 105 "applies": true, 106 "answer": true, 107 "justification": "The paper reports that larger models are not always better within model families (Section 3.2), that constrained decoding slightly reduces executability rates (Section 5.2, Limitation 6), and that some models performed worse than expected." 108 } 109 }, 110 "claims_and_evidence": { 111 "abstract_claims_supported": { 112 "applies": true, 113 "answer": true, 114 "justification": "The abstract's claim that 'none of the evaluated open-source models was able to solve more than 40% of the tasks' is supported by Tables 3-4. The claimed gains of '+90% and +135%' are supported by Figure 5 and Tables 18a-d." 115 }, 116 "causal_claims_justified": { 117 "applies": true, 118 "answer": true, 119 "justification": "The causal claim that constrained decoding improves correctness is justified by controlled comparison: same models, same dataset, same prompts, with only the decoding strategy changed. This is a valid single-variable manipulation." 120 }, 121 "generalization_bounded": { 122 "applies": true, 123 "answer": true, 124 "justification": "Section 1 explicitly states scope: JavaScript, Axios, OpenAPI-compliant APIs. Section 6 (Limitations) discusses that results may not generalize to other APIs given the limited number of APIs, and that prompt engineering could improve individual model results." 125 }, 126 "alternative_explanations_discussed": { 127 "applies": true, 128 "answer": true, 129 "justification": "Section 6 discusses multiple alternative explanations: API specification quality issues, prompt sensitivity, dataset quality, variable value limitations of constraints. 
Section 3.2 discusses training data prevalence as explanation for per-API performance differences." 130 }, 131 "proxy_outcome_distinction": { 132 "applies": true, 133 "answer": true, 134 "justification": "The paper measures request configuration correctness (URL, method, arguments matching ground truth) and clearly frames this as correctness of API invocation code, not broader code quality or developer productivity. The measurements match the claims." 135 } 136 }, 137 "setup_transparency": { 138 "model_versions_specified": { 139 "applies": true, 140 "answer": true, 141 "justification": "Appendix E provides exact HuggingFace model identifiers for all open-source models (e.g., 'bigcode/starcoder2-15b', 'deepseek-ai/deepseek-coder-6.7b-base'). GPT-4o and GPT-4o mini are identified by marketing names with footnote links but no snapshot dates." 142 }, 143 "prompts_provided": { 144 "applies": true, 145 "answer": true, 146 "justification": "Full prompts are provided: dataset generation prompt in Listing 4 (Appendix F) and code generation prompt in Listing 5 (Appendix F)." 147 }, 148 "hyperparameters_reported": { 149 "applies": true, 150 "answer": true, 151 "justification": "Appendix E states: fp16, 1 beam, temperature 0.0. Section 3 states greedy decoding for all experiments." 152 }, 153 "scaffolding_described": { 154 "applies": false, 155 "answer": false, 156 "justification": "No agentic scaffolding is used. The approach is direct code generation with optional constrained decoding." 157 }, 158 "data_preprocessing_documented": { 159 "applies": true, 160 "answer": true, 161 "justification": "Section 2.1 describes the full dataset creation pipeline: Gemini 1.5 Pro generation, automated consistency checks (9 samples failed), manual review of all 395 samples (58 samples had issues and were fixed), with specific criteria for what was checked." 
162 } 163 }, 164 "limitations_and_scope": { 165 "limitations_section_present": { 166 "applies": true, 167 "answer": true, 168 "justification": "Section 6 'Limitations and Threats to Validity' contains 6 numbered, substantive limitation discussions." 169 }, 170 "threats_to_validity_specific": { 171 "applies": true, 172 "answer": true, 173 "justification": "Limitations are specific: dataset may contain faulty samples, Gemini-generated tasks use optional parameters sparingly, constraints act only locally and cannot handle variable values, executability rate slightly lower with constrained models due to token misalignment (Section 6, items 1-6)." 174 }, 175 "scope_boundaries_stated": { 176 "applies": true, 177 "answer": true, 178 "justification": "Section 1 explicitly states scope: OpenAPI-compliant web APIs, JavaScript/Axios, code completion tasks. Section 6 item 5 states the work is 'not intended for highly exploratory development scenarios.' Limitation 3 notes results 'may not generalize to other APIs.'" 179 } 180 }, 181 "data_integrity": { 182 "raw_data_available": { 183 "applies": true, 184 "answer": true, 185 "justification": "All model-generated codes are provided in the Zenodo artifact (Appendix A), and the dataset is on GitHub. This enables independent verification of the results." 186 }, 187 "data_collection_described": { 188 "applies": true, 189 "answer": true, 190 "justification": "Section 2.1 details dataset creation: 4 APIs selected (Asana, Google Calendar, Google Sheets, Slack), one task per endpoint (395 total), generated with Gemini 1.5 Pro with full specification and detailed prompt." 191 }, 192 "recruitment_methods_described": { 193 "applies": false, 194 "answer": false, 195 "justification": "No human participants. Data source is synthetic dataset generation from API specifications." 
196 }, 197 "data_pipeline_documented": { 198 "applies": true, 199 "answer": true, 200 "justification": "Section 2.1 documents: Gemini generation → automated consistency checks (9 failures) → manual inspection of all 395 samples → 58 samples fixed. Section 2.3 describes the execution pipeline including mock environment and code truncation heuristics." 201 } 202 }, 203 "conflicts_of_interest": { 204 "funding_disclosed": { 205 "applies": true, 206 "answer": true, 207 "justification": "Acknowledgments section lists funding from Hessian Ministry of Higher Education (3AI cluster), National Research Center ATHENE, and LOEWE initiative." 208 }, 209 "affiliations_disclosed": { 210 "applies": true, 211 "answer": true, 212 "justification": "Author affiliations are clearly listed: TU Darmstadt, hessian.AI, Pariton AI, and ATHENE. Leon Chemnitz's affiliation with Pariton AI (a company) is disclosed." 213 }, 214 "funder_independent_of_outcome": { 215 "applies": true, 216 "answer": true, 217 "justification": "Funding is from public research institutions (Hessian Ministry, LOEWE, ATHENE) with no apparent financial interest in any specific LLM's performance." 218 }, 219 "financial_interests_declared": { 220 "applies": true, 221 "answer": false, 222 "justification": "No competing interests statement is present. Leon Chemnitz is affiliated with Pariton AI but no financial interests declaration is made." 223 } 224 }, 225 "contamination": { 226 "training_cutoff_stated": { 227 "applies": true, 228 "answer": false, 229 "justification": "The paper evaluates models' memorized knowledge of API specifications but does not state training cutoff dates for any of the evaluated models." 230 }, 231 "train_test_overlap_discussed": { 232 "applies": true, 233 "answer": true, 234 "justification": "Section 2.2 explicitly discusses that models rely on 'memorized knowledge about the APIs' from training data. 
Section 3.2 notes models 'have had training exposure to API specifications or examples from which they memorized usage patterns.' The synthetic nature of the dataset mitigates direct overlap." 235 }, 236 "benchmark_contamination_addressed": { 237 "applies": true, 238 "answer": false, 239 "justification": "While the tasks are synthetically generated (reducing contamination risk), the paper does not discuss whether the underlying API specifications or similar API usage examples existed in training data. The APIs used (Slack, Google Calendar, etc.) are extremely well-documented online." 240 } 241 }, 242 "human_studies": { 243 "pre_registered": { 244 "applies": false, 245 "answer": false, 246 "justification": "No human participants in this study." 247 }, 248 "irb_or_ethics_approval": { 249 "applies": false, 250 "answer": false, 251 "justification": "No human participants in this study." 252 }, 253 "demographics_reported": { 254 "applies": false, 255 "answer": false, 256 "justification": "No human participants in this study." 257 }, 258 "inclusion_exclusion_criteria": { 259 "applies": false, 260 "answer": false, 261 "justification": "No human participants in this study." 262 }, 263 "randomization_described": { 264 "applies": false, 265 "answer": false, 266 "justification": "No human participants in this study." 267 }, 268 "blinding_described": { 269 "applies": false, 270 "answer": false, 271 "justification": "No human participants in this study." 272 }, 273 "attrition_reported": { 274 "applies": false, 275 "answer": false, 276 "justification": "No human participants in this study." 277 } 278 }, 279 "cost_and_practicality": { 280 "inference_cost_reported": { 281 "applies": true, 282 "answer": false, 283 "justification": "No inference cost, latency, or token consumption is reported for any of the 21 models evaluated, despite constrained decoding having significant computational overhead." 
284 }, 285 "compute_budget_stated": { 286 "applies": true, 287 "answer": false, 288 "justification": "No total compute budget, GPU hours, or hardware specifications are stated. Running 21 models on 395 tasks in two setups (constrained and unconstrained) represents significant compute that is not quantified." 289 } 290 }, 291 "experimental_rigor": { 292 "seed_sensitivity_reported": { 293 "applies": true, 294 "answer": false, 295 "justification": "Greedy decoding (temperature=0) produces deterministic results, so seed sensitivity is not applicable in the traditional sense. However, the paper does not test sensitivity to prompt variations or other sources of variance." 296 }, 297 "number_of_runs_stated": { 298 "applies": true, 299 "answer": true, 300 "justification": "Implicitly stated: greedy decoding with temperature=0 means each experiment is a single deterministic run. This is clear from Appendix E hyperparameters." 301 }, 302 "hyperparameter_search_budget": { 303 "applies": true, 304 "answer": false, 305 "justification": "No hyperparameter search was conducted. The paper uses a single prompt and greedy decoding for all models. While this is a deliberate design choice for fairness, the paper acknowledges in Limitation 3 and Section 3.2 that prompt engineering could change results." 306 }, 307 "best_config_selection_justified": { 308 "applies": true, 309 "answer": true, 310 "justification": "The paper uses identical configuration for all models (same prompt, greedy decoding) to ensure fair comparison. No configuration selection was performed." 311 }, 312 "multiple_comparison_correction": { 313 "applies": true, 314 "answer": false, 315 "justification": "No statistical tests are performed at all, so multiple comparison correction does not arise. However, the paper makes many implicit comparisons across 21 models without any statistical framework." 
316 }, 317 "self_comparison_bias_addressed": { 318 "applies": true, 319 "answer": false, 320 "justification": "The authors propose constrained decoding and evaluate it themselves. No independent evaluation or acknowledgment of author-evaluation bias is provided." 321 }, 322 "compute_budget_vs_performance": { 323 "applies": true, 324 "answer": false, 325 "justification": "Constrained decoding adds computational overhead (token-by-token constraint checking, timeouts noted in Tables 10-11) but performance is not reported as a function of compute. No cost comparison between constrained and unconstrained generation." 326 }, 327 "benchmark_construct_validity": { 328 "applies": true, 329 "answer": true, 330 "justification": "Section 2 discusses what WAPIIBench measures (functional correctness of API invocations via mock execution) vs. limitations (Section 6 items 1-3: simplified program context, no response handling, synthetic tasks). The paper is honest about what the benchmark does and doesn't capture." 331 }, 332 "scaffold_confound_addressed": { 333 "applies": false, 334 "answer": false, 335 "justification": "No scaffolding is involved. Models generate code directly via completion." 336 } 337 }, 338 "data_leakage": { 339 "temporal_leakage_addressed": { 340 "applies": true, 341 "answer": false, 342 "justification": "The APIs used (Slack, Google Calendar, etc.) have extensive documentation online predating all models' training. The paper does not discuss whether models saw API documentation or usage examples during training, despite this being central to the evaluation." 343 }, 344 "feature_leakage_addressed": { 345 "applies": true, 346 "answer": true, 347 "justification": "The evaluation explicitly does NOT provide API specifications to models under test (Section 2.2): models must rely on memorized knowledge. The paper clearly separates what information models receive (task description only) from what they do not (specifications)." 
348 }, 349 "non_independence_addressed": { 350 "applies": true, 351 "answer": false, 352 "justification": "Tasks within the same API share structural patterns (same base URL, similar argument types). No discussion of whether per-API correlation affects aggregate results." 353 }, 354 "leakage_detection_method": { 355 "applies": true, 356 "answer": false, 357 "justification": "No leakage detection method is applied. The paper does not check whether the specific task formulations or API usage patterns appeared in model training data." 358 } 359 } 360 }, 361 "claims": [ 362 { 363 "claim": "None of the evaluated open-source models was able to solve more than 40% of the web API invocation tasks.", 364 "evidence": "Tables 3-4 show Code Llama 70B achieving 30% (full completion) and 40% (argument completion) as the best open-source model.", 365 "supported": "strong" 366 }, 367 { 368 "claim": "Constrained decoding increases overall correctness by 90% (full completion) and 135% (argument completion) on average.", 369 "evidence": "Figure 5 and Tables 18a-d show per-model relative gains, with averages calculated across all models with non-zero baselines.", 370 "supported": "strong" 371 }, 372 { 373 "claim": "Constrained decoding eliminates all illegal URLs, HTTP methods, and arguments.", 374 "evidence": "Tables 5-6 and 10-11 show 0% illegal URLs, methods, and arguments across all models with constrained decoding.", 375 "supported": "strong" 376 }, 377 { 378 "claim": "Code Llama 70B with constrained decoding matches the performance of GPT-4o mini.", 379 "evidence": "Table 4 shows GPT-4o mini at 63% argument completion correctness; Table 6 shows Code Llama 70B constrained at 63%. 
Full completion: GPT-4o mini at 39%, Code Llama 70B constrained at 46% (Table 5).", 380 "supported": "strong" 381 }, 382 { 383 "claim": "LLMs frequently hallucinate endpoint URLs and parameter names in web API invocations.", 384 "evidence": "Tables 3-4 show 15-39% illegal URLs and 6-31% illegal arguments across models. Section 3.2 discusses hallucination patterns.", 385 "supported": "strong" 386 } 387 ], 388 "red_flags": [ 389 { 390 "flag": "No statistical tests despite comparative claims", 391 "detail": "The paper claims constrained decoding 'significantly improves' correctness but provides no statistical significance tests. All comparisons are based on point estimates from single deterministic runs." 392 }, 393 { 394 "flag": "Synthetic benchmark may not reflect real-world API usage", 395 "detail": "Section 6 acknowledges that Gemini-generated tasks 'use optional parameters rather sparingly and often use placeholder values instead of realistic example values, limiting the transferability of findings.' Tasks have limited program context and evaluate only outgoing requests." 396 }, 397 { 398 "flag": "No cost reporting for constrained decoding", 399 "detail": "Constrained decoding adds significant computational overhead (Tables 10-11 show timeouts and unsatisfiable constraints), but no latency or cost comparison with unconstrained generation is provided. This is important for practical applicability." 400 }, 401 { 402 "flag": "Only 4 APIs tested", 403 "detail": "Results are based on only 4 APIs (Asana, Google Calendar, Google Sheets, Slack). The paper acknowledges this in Limitation 3 but the small number of APIs limits generalizability claims about web API invocations broadly." 
404 } 405 ], 406 "cited_papers": [ 407 { 408 "title": "Evaluating Large Language Models Trained on Code", 409 "authors": ["Mark Chen", "Jerry Tworek"], 410 "year": 2021, 411 "arxiv_id": "2107.03374", 412 "relevance": "Introduces HumanEval benchmark and functional correctness evaluation for code generation — foundational to the evaluation methodology used in this paper." 413 }, 414 { 415 "title": "What's Wrong with Your Code Generated by Large Language Models? An Extensive Study", 416 "authors": ["Shihan Dou"], 417 "year": 2024, 418 "arxiv_id": "2407.06153", 419 "relevance": "Studies errors in LLM-generated code including hallucinated function names and arguments, directly related to API hallucination findings." 420 }, 421 { 422 "title": "Monitor-Guided Decoding of Code LMs with Static Analysis of Repository Context", 423 "authors": ["Lakshya A. Agrawal"], 424 "year": 2023, 425 "relevance": "Uses constrained decoding with static analysis to prevent method hallucination in Java — directly comparable approach for local function APIs." 426 }, 427 { 428 "title": "Guiding LLMs The Right Way: Fast, Non-Invasive Constrained Generation", 429 "authors": ["Luca Beurer-Kellner"], 430 "year": 2024, 431 "relevance": "Proposes minimally invasive constrained decoding framework that could address executability issues noted in this paper's limitations." 432 }, 433 { 434 "title": "Gorilla: Large Language Model Connected with Massive APIs", 435 "authors": ["Shishir G. Patil"], 436 "year": 2024, 437 "relevance": "Studies LLM API hallucination and uses fine-tuning and RAG to reduce it — complementary approach to the constrained decoding proposed here." 
438 }, 439 { 440 "title": "Bugs in large language models generated code: an empirical study", 441 "authors": ["Florian Tambon"], 442 "year": 2025, 443 "doi": "10.1007/S10664-025-10614-4", 444 "relevance": "Empirical study of bugs in LLM-generated code showing syntax errors are a small part of errors — motivates need for semantic constraints." 445 }, 446 { 447 "title": "Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair", 448 "authors": ["Yuxiang Wei"], 449 "year": 2023, 450 "relevance": "Uses constrained decoding with completion engines (Repilot) to prevent hallucinations in automated program repair." 451 }, 452 { 453 "title": "Asleep at the Keyboard? Assessing the Security of GitHub Copilot's Code Contributions", 454 "authors": ["Hammond Pearce"], 455 "year": 2022, 456 "doi": "10.1109/SP46214.2022.9833571", 457 "relevance": "Evaluates security vulnerabilities in AI-generated code, relevant to concerns about correctness in LLM code generation." 458 }, 459 { 460 "title": "The effects of generative AI on high-skilled work: Evidence from three field experiments with software developers", 461 "authors": ["Zheyuan Kevin Cui"], 462 "year": 2025, 463 "relevance": "Field experiments measuring LLM impact on developer productivity — provides context for the practical importance of correct API code generation." 464 }, 465 { 466 "title": "On Mitigating Code LLM Hallucinations with API Documentation", 467 "authors": ["Nihal Jain"], 468 "year": 2025, 469 "relevance": "Studies SDK-wrapped web API hallucination mitigation via RAG with CloudAPIBench — directly comparable approach for web API invocations." 
470 }, 471 { 472 "title": "Towards Mitigating API Hallucination in Code Generated by LLMs with Hierarchical Dependency Aware", 473 "authors": ["Yujia Chen"], 474 "year": 2025, 475 "relevance": "MARIN system uses constrained decoding and RAG for Java API hallucination prevention — related constrained decoding approach for local APIs." 476 }, 477 { 478 "title": "Applying RLAIF for Code Generation with API-usage in Lightweight LLMs", 479 "authors": ["Sujan Dutta"], 480 "year": 2024, 481 "arxiv_id": "2406.20060", 482 "relevance": "Uses RLAIF fine-tuning to improve API usage in code generation — complementary approach to constrained decoding." 483 } 484 ] 485 }