scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (32806B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating and Mitigating Errors in LLM-Generated Web API Integrations",
      6     "authors": [
      7       "Daniel Maninger",
      8       "Leon Chemnitz",
      9       "Amir Molzam Sharifloo",
     10       "Tushar Lamba",
     11       "Jannis Brugger",
     12       "Mira Mezini"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv (accepted to ACM TOSEM)",
     16     "arxiv_id": "2509.20172",
     17     "doi": null
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "The abstract's claim that 'none of the evaluated open-source models was able to solve more than 40% of the tasks' is supported by Tables 3-4. The claimed gains of '+90% and +135%' are supported by Figure 5 and Tables 18a-d.",
     25         "source": "opus"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The causal claim that constrained decoding improves correctness is justified by controlled comparison: same models, same dataset, same prompts, with only the decoding strategy changed. This is a valid single-variable manipulation.",
     31         "source": "opus"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Section 1 explicitly states scope: JavaScript, Axios, OpenAPI-compliant APIs. Section 6 (Limitations) discusses that results may not generalize to other APIs given the limited number of APIs, and that prompt engineering could improve individual model results.",
     37         "source": "opus"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Section 6 discusses multiple alternative explanations: API specification quality issues, prompt sensitivity, dataset quality, variable value limitations of constraints. Section 3.2 discusses training data prevalence as explanation for per-API performance differences.",
     43         "source": "opus"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper measures request configuration correctness (URL, method, arguments matching ground truth) and clearly frames this as correctness of API invocation code, not broader code quality or developer productivity. The measurements match the claims.",
     49         "source": "opus"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 6 'Limitations and Threats to Validity' contains 6 numbered, substantive limitation discussions.",
     57         "source": "opus"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Limitations are specific: dataset may contain faulty samples, Gemini-generated tasks use optional parameters sparingly, constraints act only locally and cannot handle variable values, executability rate slightly lower with constrained models due to token misalignment (Section 6, items 1-6).",
     63         "source": "opus"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Section 1 explicitly states scope: OpenAPI-compliant web APIs, JavaScript/Axios, code completion tasks. Section 6 item 5 states the work is 'not intended for highly exploratory development scenarios.' Limitation 3 notes results 'may not generalize to other APIs.'",
     69         "source": "opus"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgments section lists funding from Hessian Ministry of Higher Education (3AI cluster), National Research Center ATHENE, and LOEWE initiative.",
     77         "source": "opus"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Author affiliations are clearly listed: TU Darmstadt, hessian.AI, Pariton AI, and ATHENE. Leon Chemnitz's affiliation with Pariton AI (a company) is disclosed.",
     83         "source": "opus"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "Funding is from public research institutions (Hessian Ministry, LOEWE, ATHENE) with no apparent financial interest in any specific LLM's performance.",
     89         "source": "opus"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "No competing interests statement is present. Leon Chemnitz is affiliated with Pariton AI but no financial interests declaration is made.",
     95         "source": "opus"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper defines 'endpoint' (URL + HTTP method combination), 'constrained decoding,' and the two evaluation setups (full vs. argument completion); it also provides a taxonomy distinguishing five API invocation categories in Section 7.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Four explicit contributions are listed: WAPIIBench dataset, evaluation pipeline, constraint generator translating OpenAPI specs to regex constraints, and empirical insights on LLM performance with and without constrained decoding.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 7 systematically distinguishes this work from five categories of related work (domain-specific APIs, SDK-wrapped APIs, local function APIs, tool APIs, constrained decoding for code), explaining specific differences rather than just listing citations.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "GitHub repository provided: https://github.com/stg-tud/WAPIIBench (Section 1, footnote 3). Zenodo artifact also provided: https://doi.org/10.5281/zenodo.13758414 (footnote 4).",
    126           "source": "opus"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The WAPIIBench dataset is available on GitHub and all model-generated code is provided in the Zenodo artifact (Section 1, Appendix A).",
    132           "source": "opus"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Appendix D lists technologies used (Hugging Face Transformers, Axios, etc.) and Appendix E lists hyperparameters (fp16, 1 beam, temperature 0), but no requirements.txt, Dockerfile, or detailed environment setup with library versions is mentioned.",
    138           "source": "opus"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No step-by-step reproduction instructions are described in the paper. The GitHub repo is referenced but no README or reproduction guide is mentioned.",
    144           "source": "opus"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results are reported as point estimates (e.g., '30% correctness', '0.75 precision') with no confidence intervals or error bars.",
    152           "source": "opus"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The paper claims constrained decoding 'significantly improves' correctness but provides no statistical significance tests. Comparisons between models and between constrained/unconstrained are based solely on comparing numbers.",
    158           "source": "opus"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Relative correctness gains are reported with baseline context: e.g., '+90% average gain' for full completion, '+135%' for argument completion, with specific per-model breakdowns from baseline to constrained performance (Figure 5, Tables 18a-d).",
    164           "source": "opus"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The dataset has 395 samples (one per endpoint across 4 APIs). No justification is given for why these 4 APIs or this sample size is sufficient.",
    170           "source": "opus"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Greedy decoding (temperature=0) is used, producing deterministic single-run results. No variance across runs is reported because there is only one run per model. This means results could be sensitive to prompt wording or other factors without any measure of stability.",
    176           "source": "opus"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Unconstrained LLM generation serves as the baseline for evaluating constrained decoding. Multiple models across families are compared (Section 3, Tables 3-4).",
    184           "source": "opus"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Models include recent families: DeepSeek-Coder-V2, Qwen2.5-Coder, Llama 3.1, GPT-4o, and GPT-4o mini, all contemporary at time of writing.",
    190           "source": "opus"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "No ablation study of the constraint components. The paper does not test, e.g., constraining only URLs vs. only arguments vs. all, to measure individual contribution of each constraint type.",
    196           "source": "opus"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Multiple fine-grained metrics are used: correct implementations, correct URLs, illegal URLs, correct methods, argument precision, argument recall, argument value conditional accuracy, and more (Table 2).",
    202           "source": "opus"
    203         },
    204         "human_evaluation": {
    205           "applies": true,
    206           "answer": false,
    207           "justification": "Evaluation is entirely automated via the mock execution pipeline. No human evaluation of generated code quality beyond the initial dataset curation.",
    208           "source": "opus"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": false,
    213           "justification": "The entire dataset of 395 samples is used for evaluation. There is no train/dev/test split since the models are not fine-tuned, but the same dataset is used for both analysis and reporting, with no held-out portion.",
    214           "source": "opus"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results are broken down by API (Tables 12-17), by model family, and by completion setup (full vs. argument). Per-model-family analysis is provided.",
    220           "source": "opus"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 3.2 discusses specific failure patterns: Qwen2.5-Coder refusing to continue starter code, Llama 3.1 skipping method parts, hallucinated endpoints and arguments. Error taxonomy is described in RQ1 answer.",
    226           "source": "opus"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper reports that larger models are not always better within model families (Section 3.2), that constrained decoding slightly reduces executability rates (Section 5.2, Limitation 6), and that some models performed worse than expected.",
    232           "source": "opus"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Appendix E provides exact HuggingFace model identifiers for all open-source models (e.g., 'bigcode/starcoder2-15b', 'deepseek-ai/deepseek-coder-6.7b-base'). GPT-4o and GPT-4o mini are identified by marketing names with footnote links but no snapshot dates.",
    240           "source": "opus"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Full prompts are provided: dataset generation prompt in Listing 4 (Appendix F) and code generation prompt in Listing 5 (Appendix F).",
    246           "source": "opus"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Appendix E states: fp16, 1 beam, temperature 0.0. Section 3 states greedy decoding for all experiments.",
    252           "source": "opus"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "No agentic scaffolding is used. The approach is direct code generation with optional constrained decoding.",
    258           "source": "opus"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Section 2.1 describes the full dataset creation pipeline: Gemini 1.5 Pro generation, automated consistency checks (9 samples failed), manual review of all 395 samples (58 samples had issues and were fixed), with specific criteria for what was checked.",
    264           "source": "opus"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "All model-generated code is provided in the Zenodo artifact (Appendix A), and the dataset is on GitHub. This enables independent verification of the results.",
    272           "source": "opus"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 2.1 details dataset creation: 4 APIs selected (Asana, Google Calendar, Google Sheets, Slack), one task per endpoint (395 total), generated with Gemini 1.5 Pro with full specification and detailed prompt.",
    278           "source": "opus"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants. Data source is synthetic dataset generation from API specifications.",
    284           "source": "opus"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Section 2.1 documents: Gemini generation → automated consistency checks (9 failures) → manual inspection of all 395 samples → 58 samples fixed. Section 2.3 describes the execution pipeline including mock environment and code truncation heuristics.",
    290           "source": "opus"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "The paper evaluates models' memorized knowledge of API specifications but does not state training cutoff dates for any of the evaluated models.",
    298           "source": "opus"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": true,
    303           "justification": "Section 2.2 explicitly discusses that models rely on 'memorized knowledge about the APIs' from training data. Section 3.2 notes models 'have had training exposure to API specifications or examples from which they memorized usage patterns.' The synthetic nature of the dataset mitigates direct overlap.",
    304           "source": "opus"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "While the tasks are synthetically generated (reducing contamination risk), the paper only acknowledges that models rely on memorized API knowledge; it does not assess or control for whether the underlying API specifications or similar API usage examples existed in training data. The APIs used (Slack, Google Calendar, etc.) are extremely well-documented online.",
    310           "source": "opus"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants in this study.",
    318           "source": "opus"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants in this study.",
    324           "source": "opus"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants in this study.",
    330           "source": "opus"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants in this study.",
    336           "source": "opus"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants in this study.",
    342           "source": "opus"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants in this study.",
    348           "source": "opus"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants in this study.",
    354           "source": "opus"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference cost, latency, or tokens consumed is reported for any of the 21 models evaluated, despite constrained decoding having significant computational overhead.",
    362           "source": "opus"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No total compute budget, GPU hours, or hardware specifications are stated. Running 21 models on 395 tasks in two setups (constrained and unconstrained) represents significant compute that is not quantified.",
    368           "source": "opus"
    369         }
    370       },
    371       "experimental_rigor": {
    372         "seed_sensitivity_reported": {
    373           "applies": true,
    374           "answer": false,
    375           "justification": "Greedy decoding (temperature=0) produces deterministic results, so seed sensitivity is not applicable in the traditional sense. However, the paper does not test sensitivity to prompt variations or other sources of variance.",
    376           "source": "opus"
    377         },
    378         "number_of_runs_stated": {
    379           "applies": true,
    380           "answer": true,
    381           "justification": "Implicitly stated: greedy decoding with temperature=0 means each experiment is a single deterministic run. This is clear from Appendix E hyperparameters.",
    382           "source": "opus"
    383         },
    384         "hyperparameter_search_budget": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "No hyperparameter search was conducted. The paper uses a single prompt and greedy decoding for all models. While this is a deliberate design choice for fairness, the paper acknowledges in Limitation 3 and Section 3.2 that prompt engineering could change results.",
    388           "source": "opus"
    389         },
    390         "best_config_selection_justified": {
    391           "applies": true,
    392           "answer": true,
    393           "justification": "The paper uses identical configuration for all models (same prompt, greedy decoding) to ensure fair comparison. No configuration selection was performed.",
    394           "source": "opus"
    395         },
    396         "multiple_comparison_correction": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "No statistical tests are performed at all, so multiple comparison correction does not arise. However, the paper makes many implicit comparisons across 21 models without any statistical framework.",
    400           "source": "opus"
    401         },
    402         "self_comparison_bias_addressed": {
    403           "applies": true,
    404           "answer": false,
    405           "justification": "The authors propose constrained decoding and evaluate it themselves. No independent evaluation or acknowledgment of author-evaluation bias is provided.",
    406           "source": "opus"
    407         },
    408         "compute_budget_vs_performance": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "Constrained decoding adds computational overhead (token-by-token constraint checking, timeouts noted in Tables 10-11) but performance is not reported as a function of compute. No cost comparison between constrained and unconstrained generation.",
    412           "source": "opus"
    413         },
    414         "benchmark_construct_validity": {
    415           "applies": true,
    416           "answer": true,
    417           "justification": "Section 2 discusses what WAPIIBench measures (functional correctness of API invocations via mock execution) vs. limitations (Section 6 items 1-3: simplified program context, no response handling, synthetic tasks). The paper is honest about what the benchmark does and doesn't capture.",
    418           "source": "opus"
    419         },
    420         "scaffold_confound_addressed": {
    421           "applies": false,
    422           "answer": false,
    423           "justification": "No scaffolding is involved. Models generate code directly via completion.",
    424           "source": "opus"
    425         }
    426       },
    427       "data_leakage": {
    428         "temporal_leakage_addressed": {
    429           "applies": true,
    430           "answer": false,
    431           "justification": "The APIs used (Slack, Google Calendar, etc.) have extensive documentation online predating all models' training. Beyond noting that models rely on memorized API knowledge, the paper does not analyze whether or when models saw API documentation or usage examples during training, despite this being central to the evaluation.",
    432           "source": "opus"
    433         },
    434         "feature_leakage_addressed": {
    435           "applies": true,
    436           "answer": true,
    437           "justification": "The evaluation explicitly does NOT provide API specifications to models under test (Section 2.2): models must rely on memorized knowledge. The paper clearly separates what information models receive (task description only) from what they do not (specifications).",
    438           "source": "opus"
    439         },
    440         "non_independence_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "Tasks within the same API share structural patterns (same base URL, similar argument types). No discussion of whether per-API correlation affects aggregate results.",
    444           "source": "opus"
    445         },
    446         "leakage_detection_method": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No leakage detection method is applied. The paper does not check whether the specific task formulations or API usage patterns appeared in model training data.",
    450           "source": "opus"
    451         }
    452       }
    453     }
    454   },
    455   "claims": [
    456     {
    457       "claim": "No evaluated open-source LLM solves more than 40% of web API invocation tasks (full completion setup)",
    458       "evidence": "Table 3: best open-source model Code Llama 70B achieves 30% correct implementations (t); GPT-4o achieves 60%",
    459       "supported": "strong"
    460     },
    461     {
    462       "claim": "Constrained decoding improves overall correctness by ~90% (full completion) and ~135% (argument completion) on average",
    463       "evidence": "Table 18a shows average unconstrained 9% → constrained 22% (+90%); Table 18c shows 22% → 41% (+135%), excluding models with 0% baseline",
    464       "supported": "strong"
    465     },
    466     {
    467       "claim": "Constrained decoding eliminates all illegal URLs, HTTP methods, and arguments (reduces to 0%)",
    468       "evidence": "Tables 5 and 6 show illegal URLs (e) = 0.00 and illegal methods (e) = 0.00 for all constrained models; Table 11 shows illegal arguments (e) = 0.00",
    469       "supported": "strong"
    470     },
    471     {
    472       "claim": "LLMs frequently hallucinate endpoint URLs and parameter names rather than failing to understand task structure",
    473       "evidence": "Section 3.2: 14-39% of URLs are illegal (e) in full completion; 6-31% of arguments are illegal (e) in argument completion; models show higher method accuracy (84-91%) than URL accuracy (32-72%)",
    474       "supported": "strong"
    475     },
    476     {
    477       "claim": "Larger models are not always better for web API code generation",
    478       "evidence": "Section 3.2 observes that medium-sized variants in DeepSeek-Coder, Qwen2.5-Coder, and Code Llama families perform worse than both smaller and larger variants",
    479       "supported": "moderate"
    480     },
    481     {
    482       "claim": "Constrained decoding makes mid-size open-source models competitive with larger or commercial models",
    483       "evidence": "Section 5.2: Code Llama 70B with constrained decoding matches GPT-4o mini performance",
    484       "supported": "moderate"
    485     },
    486     {
    487       "claim": "Constrained decoding slightly reduces executability rates compared to unconstrained generation",
    488       "evidence": "Comparing Tables 8 vs 10 and 9 vs 11: unconstrained executability typically 94-99%, constrained 80-94%, due to token-constraint misalignment",
    489       "supported": "strong"
    490     }
    491   ],
    492   "methodology_tags": [
    493     "benchmark-eval"
    494   ],
    495   "key_findings": "WAPIIBench reveals that state-of-the-art open-source LLMs struggle severely with web API invocation code generation, with the best open-source model achieving only 30% correct implementations in the full completion setting, primarily due to hallucinated endpoint URLs and illegal argument names. Constrained decoding using regex-based constraints automatically derived from OpenAPI specifications eliminates all hallucinated (illegal) endpoints and arguments by construction, yielding average relative correctness gains of 90% (full completion) and 135% (argument completion) across models. The technique is especially effective for IDE-style code completion where partial code context is available, and makes mid-size open-source models competitive with GPT-4o mini. The approach provides hard guarantees on specification compliance rather than stochastic improvements from RAG or fine-tuning, at the cost of a small reduction in code executability.",
    496   "red_flags": [
    497     {
    498       "flag": "Training data contamination unaddressed",
    499       "detail": "The paper evaluates models on 4 well-known, publicly documented APIs (Asana, Google Calendar, Slack, Google Sheets) whose OpenAPI specifications are likely present in training data. The paper itself notes models appear to have memorized API specs, yet never discusses training cutoffs or the possibility that results reflect recall rather than generalization."
    500     },
    501     {
    502       "flag": "No statistical significance testing",
    503       "detail": "All performance comparisons between models and conditions are presented as point estimates without confidence intervals or significance tests, making it impossible to determine whether observed differences are meaningful or due to chance."
    504     },
    505     {
    506       "flag": "Benchmark generalizability limited",
    507       "detail": "Only 4 APIs are included, and the limitations section notes results may not generalize to other APIs; per-API breakdowns (Tables 12-17) show large within-model variance (e.g., StarCoder2: 46% on Google Calendar vs 13% on Slack for full completion)."
    508     },
    509     {
    510       "flag": "Synthetic benchmark circularity risk",
    511       "detail": "The dataset was generated by Gemini 1.5 Pro and then manually reviewed; Gemini is excluded from evaluation 'as it generated the dataset.' This is handled correctly but the synthetic nature limits ecological validity — tasks use placeholder values and sparse optional parameters."
    512     },
    513     {
    514       "flag": "+inf% gain inflation",
    515       "detail": "Several models with a 0% unconstrained baseline show '+inf%' gains under constrained decoding; these are excluded from the average but represent a substantial fraction (8/21 models for full completion), so the reported average relative gain describes only the remaining models, which may inflate the practical significance of the approach."
    516     }
    517   ],
    518   "cited_papers": [
    519     {
    520       "title": "Evaluating Large Language Models Trained on Code (HumanEval)",
    521       "relevance": "Foundational benchmark for LLM code generation; establishes the execution-based evaluation paradigm this work extends to web API invocations"
    522     },
    523     {
    524       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    525       "relevance": "One of the primary evaluated open-source code models; baseline for comparison"
    526     },
    527     {
    528       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    529       "relevance": "Key evaluated model family; represents code-specialized LLMs"
    530     },
    531     {
    532       "title": "Qwen2.5-Coder Technical Report",
    533       "relevance": "Key evaluated model family with unexpectedly poor full-completion performance"
    534     },
    535     {
    536       "title": "Gorilla: Large Language Model Connected with Massive APIs",
    537       "relevance": "Related work on LLM API usage combining RAG and fine-tuning; direct point of comparison for the alternative approaches discussed"
    538     },
    539     {
    540       "title": "Monitor-Guided Decoding of Code LMs with Static Analysis of Repository Context",
    541       "relevance": "Most closely related constrained decoding work for code generation; differs by targeting local method calls rather than web API invocations"
    542     },
    543     {
    544       "title": "Guiding LLMs The Right Way: Fast, Non-Invasive Constrained Generation",
    545       "relevance": "Core constrained decoding framework referenced as compatible alternative engine for the constraint implementation"
    546     },
    547     {
    548       "title": "PICARD: Parsing Incrementally for Constrained Auto-Regressive Decoding from Language Models",
    549       "relevance": "Directly analogous constrained decoding approach applied to SQL generation; establishes precedent for the approach"
    550     },
    551     {
    552       "title": "What's Wrong with Your Code Generated by Large Language Models? An Extensive Study",
    553       "relevance": "Prior characterization of LLM code errors including hallucinations; confirms findings extend to web API domain"
    554     },
    555     {
    556       "title": "Out of the BLEU: How should we assess quality of the Code Generation models?",
    557       "relevance": "Motivation for functional evaluation over syntactic similarity metrics used in this benchmark"
    558     }
    559   ],
    560   "engagement_factors": {
    561     "practical_relevance": {
    562       "score": 3,
    563       "justification": "Directly addresses a common developer pain point (LLMs generating wrong API calls) with a deployable, implementation-agnostic solution that works in IDE code completion workflows."
    564     },
    565     "surprise_contrarian": {
    566       "score": 1,
    567       "justification": "The finding that LLMs struggle with API code is expected; the magnitude of constrained decoding improvement (+90-135%) is notable but not dramatically counterintuitive."
    568     },
    569     "fear_safety": {
    570       "score": 1,
    571       "justification": "Paper briefly notes that directly executing LLM-generated API code is unsafe as it could send arbitrary requests to external servers, motivating the sandboxed evaluation environment."
    572     },
    573     "drama_conflict": {
    574       "score": 0,
    575       "justification": "No controversial findings or conflict with established results; straightforward benchmark paper with constructive improvement approach."
    576     },
    577     "demo_ability": {
    578       "score": 2,
    579       "justification": "WAPIIBench and constraint generator are publicly available on GitHub; practitioners can apply constrained decoding to their own OpenAPI specs using the released code."
    580     },
    581     "brand_recognition": {
    582       "score": 1,
    583       "justification": "TU Darmstadt and Hessian Center for AI are recognized academic institutions but not top-tier industry labs; no famous product affiliation."
    584     }
    585   },
    586   "hn_data": {
    587     "threads": [
    588       {
    589         "hn_id": "45381333",
    590         "title": "Federation of Agents: Semantics-Aware, Large-Scale Communication Fabric",
    591         "points": 3,
    592         "comments": 0,
    593         "url": "https://news.ycombinator.com/item?id=45381333",
    594         "created_at": "2025-09-26T01:02:53Z"
    595       }
    596     ],
    597     "top_points": 3,
    598     "total_points": 3,
    599     "total_comments": 0
    600   }
    601 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs