scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (27760B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Evaluating and Mitigating Errors in LLM-Generated Web API Integrations",
      6     "authors": [
      7       "Daniel Maninger",
      8       "Leon Chemnitz",
      9       "Amir Molzam Sharifloo",
     10       "Tushar Lamba",
     11       "Jannis Brugger",
     12       "Mira Mezini"
     13     ],
     14     "year": 2025,
     15     "venue": "arXiv / ACM Trans. Softw. Eng. Methodol.",
     16     "arxiv_id": "2509.20172",
     17     "doi": "XXXXXXX.XXXXXXX"
     18   },
     19   "checklist": {
     20     "claims_and_evidence": {
     21       "abstract_claims_supported": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "All abstract claims are backed by results: <40% for open-source models (Table 3), hallucination patterns quantified (15-39% illegal URLs), and the 90%/135% improvement figures come directly from Table 18a/c averages.",
     25         "source": "haiku"
     26       },
     27       "causal_claims_justified": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "The causal claim that constrained decoding improves correctness is supported by controlled comparisons on the same models and benchmark (constrained vs. unconstrained), which is adequate for this type of intervention study.",
     31         "source": "haiku"
     32       },
     33       "generalization_bounded": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Scope is explicitly restricted throughout: JavaScript/Axios, OpenAPI specifications, 4 specific APIs, base (non-instruction-tuned) models, zero-shot prompting. Section 6 and the conclusion explicitly list what does not transfer.",
     37         "source": "haiku"
     38       },
     39       "alternative_explanations_discussed": {
     40         "applies": true,
     41         "answer": false,
     42         "justification": "The paper notes 'it remains to be investigated why the quantitative benefit of constrained decoding is so model-dependent' but does not systematically discuss alternative explanations for the main findings (e.g., whether prompt engineering could close the gap, or whether API prevalence in training data fully explains performance differences).",
     43         "source": "haiku"
     44       },
     45       "proxy_outcome_distinction": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper carefully distinguishes what is measured (functional request configuration match) from what is claimed, explicitly separating executable vs. total metrics and noting that syntactic similarity is insufficient (Section 2.3).",
     49         "source": "haiku"
     50       }
     51     },
     52     "limitations_and_scope": {
     53       "limitations_section_present": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 6 'Limitations and Threats to Validity' lists 6 numbered, substantive limitations.",
     57         "source": "haiku"
     58       },
     59       "threats_to_validity_specific": {
     60         "applies": true,
     61         "answer": true,
     62         "justification": "Threats are specific: synthetic tasks use sparse optional parameters, constraints only act locally (variables used as parameter values are uncontrolled), constraints miss restrictions expressed only in free-text descriptions, 4-API coverage may not generalize, lower executability under constrained decoding due to token/constraint misalignment.",
     63         "source": "haiku"
     64       },
     65       "scope_boundaries_stated": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Explicit scope: JavaScript + Axios only, OpenAPI standard, 4 real-world APIs, base models only (instruction-tuned excluded), zero-shot prompting; conclusion explicitly states evaluation pipeline is specialized and cannot directly transfer.",
     69         "source": "haiku"
     70       }
     71     },
     72     "conflicts_of_interest": {
     73       "funding_disclosed": {
     74         "applies": true,
     75         "answer": true,
     76         "justification": "Acknowledgments name three funders: Hessian Ministry (3AI cluster), ATHENE (Foundational Models for Secure Software Development), and LOEWE initiative with grant number.",
     77         "source": "haiku"
     78       },
     79       "affiliations_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "All author affiliations are listed on the first page, including Leon Chemnitz's commercial affiliation with Pariton AI.",
     83         "source": "haiku"
     84       },
     85       "funder_independent_of_outcome": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "All funders are government or academic bodies (Hessian Ministry, LOEWE, ATHENE); none have a stake in the benchmarked models or the constrained decoding approach evaluated.",
     89         "source": "haiku"
     90       },
     91       "financial_interests_declared": {
     92         "applies": true,
     93         "answer": false,
     94         "justification": "There is no competing interests or financial interests declaration. One author is at a commercial AI company (Pariton AI) but this is not addressed beyond the affiliation listing.",
     95         "source": "haiku"
     96       }
     97     },
     98     "scope_and_framing": {
     99       "key_terms_defined": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "'Endpoint' is explicitly defined as 'unique combination of URL and HTTP method'; 'full completion' and 'argument completion' setups are defined; 'constrained decoding' is introduced with a formal description; request configuration components are specified.",
    103         "source": "haiku"
    104       },
    105       "intended_contribution_clear": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Four contributions are explicitly bulleted in the introduction: WAPIIBench dataset, open-source evaluation pipeline, OpenAPI-to-regex constraint generator, and novel empirical insights on correctness with and without constrained decoding.",
    109         "source": "haiku"
    110       },
    111       "engagement_with_prior_work": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "Section 7 systematically distinguishes the work from five categories of related API work (general/domain-specific/SDK/local/tool APIs), contrasts with existing constrained decoding approaches (MGD, PICARD, Synchromesh, ToolDec), and explains why prior methods are unsuitable for this setting.",
    115         "source": "haiku"
    116       }
    117     }
    118   },
    119   "type_checklist": {
    120     "empirical": {
    121       "artifacts": {
    122         "code_released": {
    123           "applies": true,
    124           "answer": true,
    125           "justification": "WAPIIBench is publicly available on GitHub (github.com/stg-tud/WAPIIBench) and all model-generated code is available on Zenodo (doi:10.5281/zenodo.13758414).",
    126           "source": "haiku"
    127         },
    128         "data_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "The 395-sample dataset and all model-generated outputs are available via GitHub and Zenodo respectively.",
    132           "source": "haiku"
    133         },
    134         "environment_specified": {
    135           "applies": true,
    136           "answer": false,
    137           "justification": "Appendix D lists technology names (Hugging Face Transformers, Axios, axios-mock-adapter, regex) and hyperparameters (16-bit precision, temperature=0.0), but no requirements.txt, Dockerfile, or versioned dependency specification is provided.",
    138           "source": "haiku"
    139         },
    140         "reproduction_instructions": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "The paper describes the pipeline conceptually in detail but contains no step-by-step instructions for rerunning the evaluation; these would presumably be in the GitHub repo README.",
    144           "source": "haiku"
    145         }
    146       },
    147       "statistical_methodology": {
    148         "confidence_intervals_or_error_bars": {
    149           "applies": true,
    150           "answer": false,
    151           "justification": "All results are reported as point estimates. Greedy decoding (temperature=0.0) is used, so no CIs are reported; there is no uncertainty quantification over the 395-sample test set.",
    152           "source": "haiku"
    153         },
    154         "significance_tests": {
    155           "applies": true,
    156           "answer": false,
    157           "justification": "The paper makes comparative claims (e.g., 'constrained decoding significantly improves') without any statistical significance tests despite comparing across up to 21 models.",
    158           "source": "haiku"
    159         },
    160         "effect_sizes_reported": {
    161           "applies": true,
    162           "answer": true,
    163           "justification": "Relative percentage gains are reported throughout (e.g., '+90% average full completion', '+135% average argument completion') with absolute baseline values in Table 18.",
    164           "source": "haiku"
    165         },
    166         "sample_size_justified": {
    167           "applies": true,
    168           "answer": false,
    169           "justification": "The sample size of 395 (one per API endpoint across 4 APIs) is justified pragmatically by coverage, but no power analysis or other statistical justification for this sample size is provided.",
    170           "source": "haiku"
    171         },
    172         "variance_reported": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "Greedy decoding eliminates run-to-run variance, but no variance across APIs, seeds, or other sources is reported; per-API breakdowns are only shown for two models.",
    176           "source": "haiku"
    177         }
    178       },
    179       "evaluation_design": {
    180         "baselines_included": {
    181           "applies": true,
    182           "answer": true,
    183           "justification": "Unconstrained models serve as baselines for constrained decoding comparisons; GPT-4o is included as an upper bound; models are compared against each other across two evaluation setups.",
    184           "source": "haiku"
    185         },
    186         "baselines_contemporary": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Baselines include GPT-4o, Qwen2.5-Coder, DeepSeek-Coder-V2, and Code Llama — all current models at time of writing, selected from coding leaderboards.",
    190           "source": "haiku"
    191         },
    192         "ablation_study": {
    193           "applies": true,
    194           "answer": false,
    195           "justification": "The paper compares constrained vs. unconstrained and full vs. argument completion, but no ablation of constraint components (e.g., URL constraints only, argument constraints only) is performed.",
    196           "source": "haiku"
    197         },
    198         "multiple_metrics": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "Table 2 and Table 7 define 19+ metrics: correct/illegal implementations, correct/illegal URLs, correct/illegal methods, argument precision, recall, Jaccard index, value conditional accuracy, executability.",
    202           "source": "haiku"
    203         },
    204         "human_evaluation": {
    205           "applies": false,
    206           "answer": false,
    207           "justification": "Human review was used for dataset construction (all 395 samples), not for evaluating LLM output quality. Functional correctness via automated execution is the evaluation method.",
    208           "source": "haiku"
    209         },
    210         "held_out_test_set": {
    211           "applies": true,
    212           "answer": false,
    213           "justification": "All 395 samples are used as the full evaluation set with no held-out portion; the dataset was constructed by the authors, creating a risk that the constrained decoding approach could be inadvertently tuned to it.",
    214           "source": "haiku"
    215         },
    216         "per_category_breakdown": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Results broken down by API (Asana, Google Calendar, Google Sheets, Slack) in Appendix Tables 12-17 for StarCoder2 and GPT-4o, and by metric type throughout.",
    220           "source": "haiku"
    221         },
    222         "failure_cases_discussed": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Section 3.2 discusses specific failure modes: Qwen2.5-Coder refused to continue starter code, Llama 3.1 skipped the method part, URL hallucinations are quantified at 15-39%, and authorization argument inflation is discussed.",
    226           "source": "haiku"
    227         },
    228         "negative_results_reported": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "Multiple models (6 Qwen2.5-Coder variants, 2 Llama 3.1 variants) achieved 0% in full completion. Constrained decoding produces slightly lower executability rates. The model-size-performance relationship is non-monotonic.",
    232           "source": "haiku"
    233         }
    234       },
    235       "setup_transparency": {
    236         "model_versions_specified": {
    237           "applies": true,
    238           "answer": true,
    239           "justification": "Appendix E provides exact Hugging Face model IDs for all 24 evaluated models (e.g., 'bigcode/starcoder2-15b', 'deepseek-ai/deepseek-coder-6.7b-base', 'openai/gpt-4o').",
    240           "source": "haiku"
    241         },
    242         "prompts_provided": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Appendix F provides full prompts for both dataset generation (Listing 4, Gemini 1.5 Pro) and model evaluation (Listing 5, all evaluated models), with all fill-value placeholders described.",
    246           "source": "haiku"
    247         },
    248         "hyperparameters_reported": {
    249           "applies": true,
    250           "answer": true,
    251           "justification": "Appendix E specifies: 16-bit floating point precision, 1 beam (greedy decoding), temperature=0.0.",
    252           "source": "haiku"
    253         },
    254         "scaffolding_described": {
    255           "applies": false,
    256           "answer": false,
    257           "justification": "There is no agentic scaffolding; models generate single completions. The constrained decoding framework is described in detail, but this is the intervention, not scaffolding.",
    258           "source": "haiku"
    259         },
    260         "data_preprocessing_documented": {
    261           "applies": true,
    262           "answer": true,
    263           "justification": "Dataset creation is thoroughly documented: Gemini 1.5 Pro generation, automated consistency checks (9 failures manually corrected), manual review of all 395 samples with 58 corrections, and specific criteria for task validity.",
    264           "source": "haiku"
    265         }
    266       },
    267       "data_integrity": {
    268         "raw_data_available": {
    269           "applies": true,
    270           "answer": true,
    271           "justification": "All model-generated code and evaluation results are available via Zenodo (doi:10.5281/zenodo.13758414).",
    272           "source": "haiku"
    273         },
    274         "data_collection_described": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "Section 2.1 thoroughly describes dataset creation: API selection rationale, use of Gemini 1.5 Pro with full specifications, automated checks, and 3-criterion manual validation process.",
    278           "source": "haiku"
    279         },
    280         "recruitment_methods_described": {
    281           "applies": false,
    282           "answer": false,
    283           "justification": "No human participants were recruited; the dataset was synthetically generated and curated by the authors.",
    284           "source": "haiku"
    285         },
    286         "data_pipeline_documented": {
    287           "applies": true,
    288           "answer": true,
    289           "justification": "Figure 1 shows the complete 4-stage pipeline (dataset creation → code generation → code execution → correctness analysis), and each stage is described in Sections 2.1-2.4 with the mock execution environment explained.",
    290           "source": "haiku"
    291         }
    292       },
    293       "contamination": {
    294         "training_cutoff_stated": {
    295           "applies": true,
    296           "answer": false,
    297           "justification": "Model training cutoffs are not stated for any of the evaluated models, despite the evaluation relying on memorized API knowledge where cutoff is directly relevant.",
    298           "source": "haiku"
    299         },
    300         "train_test_overlap_discussed": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The paper acknowledges that models 'rely solely on memorized knowledge' and that performance varies by API prevalence in training data, but does not formally discuss train/test overlap as a contamination concern.",
    304           "source": "haiku"
    305         },
    306         "benchmark_contamination_addressed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "WAPIIBench is new, but the underlying API specifications are public and likely in training data; the paper does not address whether synthetic tasks based on publicly available specs could be partially memorized.",
    310           "source": "haiku"
    311         }
    312       },
    313       "human_studies": {
    314         "pre_registered": {
    315           "applies": false,
    316           "answer": false,
    317           "justification": "No human participants.",
    318           "source": "haiku"
    319         },
    320         "irb_or_ethics_approval": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants.",
    324           "source": "haiku"
    325         },
    326         "demographics_reported": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants.",
    330           "source": "haiku"
    331         },
    332         "inclusion_exclusion_criteria": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants.",
    336           "source": "haiku"
    337         },
    338         "randomization_described": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants.",
    342           "source": "haiku"
    343         },
    344         "blinding_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants.",
    348           "source": "haiku"
    349         },
    350         "attrition_reported": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants.",
    354           "source": "haiku"
    355         }
    356       },
    357       "cost_and_practicality": {
    358         "inference_cost_reported": {
    359           "applies": true,
    360           "answer": false,
    361           "justification": "No inference latency or cost overhead for constrained decoding vs. unconstrained is reported; the paper notes performance optimizations are 'out of scope' but gives no runtime measurements.",
    362           "source": "haiku"
    363         },
    364         "compute_budget_stated": {
    365           "applies": true,
    366           "answer": false,
    367           "justification": "No total computational budget (GPU hours, cost) is stated for running 21+ models across 395 samples in 4 configurations.",
    368           "source": "haiku"
    369         }
    370       }
    371     }
    372   },
    373   "claims": [
    374     {
    375       "claim": "No evaluated open-source model solved more than 40% of full completion tasks; best is Code Llama (70B) at 30%.",
    376       "evidence": "Table 3 shows Code Llama (70B) at 0.30 correct implementations (t) for full completion; 6 model variants achieve 0%.",
    377       "supported": "strong"
    378     },
    379     {
    380       "claim": "Constrained decoding increases average correctness by ~90% (full completion) and ~135% (argument completion) relative to unconstrained baselines.",
    381       "evidence": "Table 18a shows average gain of +90% for full completion (t); Table 18c shows +135% for argument completion (t), excluding zero-baseline models.",
    382       "supported": "strong"
    383     },
    384     {
    385       "claim": "Constrained decoding eliminates all illegal URLs, HTTP methods, and arguments.",
    386       "evidence": "Tables 5, 6, 10, 11 all show 0.00 for illegal URLs, illegal methods, and illegal arguments under constrained decoding for all models.",
    387       "supported": "strong"
    388     },
    389     {
    390       "claim": "LLMs hallucinate endpoints and arguments: 15-39% illegal URLs and 6-31% illegal arguments in unconstrained evaluation.",
    391       "evidence": "Tables 3 and 4 show illegal URL rates (e) of 0.15-0.39 and illegal argument rates (e) of 0.06-0.25 across models.",
    392       "supported": "strong"
    393     },
    394     {
    395       "claim": "GPT-4o substantially outperforms open-source models (60% vs. 30% full completion correctness).",
    396       "evidence": "Table 3: GPT-4o achieves 0.60 correct implementations (t) vs. best open-source Code Llama (70B) at 0.30.",
    397       "supported": "strong"
    398     },
    399     {
    400       "claim": "Larger models within a family are not consistently better; medium-sized variants sometimes underperform both smaller and larger variants.",
    401       "evidence": "Section 3.2 notes this in DeepSeek-Coder, Qwen2.5-Coder, and Code Llama families, visible in Tables 8-9.",
    402       "supported": "moderate"
    403     },
    404     {
    405       "claim": "Constrained decoding makes open-source Code Llama (70B) competitive with GPT-4o mini (46% vs. 39% full completion).",
    406       "evidence": "Table 5: Code Llama (70B) constrained = 0.46; Table 3: GPT-4o mini = 0.39 unconstrained. Direct comparison is valid since GPT-4o mini cannot be constrained.",
    407       "supported": "moderate"
    408     }
    409   ],
    410   "methodology_tags": [
    411     "benchmark-eval",
    412     "empirical"
    413   ],
    414   "key_findings": "LLMs struggle significantly with web API invocation code generation: the best open-source model (Code Llama 70B) achieves only 30% correctness on full completion tasks, while GPT-4o reaches 60%. Primary failure modes are endpoint URL hallucination (15-39% illegal URLs) and incorrect argument usage (6-31% illegal arguments). Constrained decoding derived automatically from OpenAPI specifications eliminates all illegal outputs and yields average correctness gains of ~90% (full completion) and ~135% (argument completion) across 21 open-source models, enabling mid-size models to approach commercial model performance without retraining or RAG.",
    415   "red_flags": [
    416     {
    417       "flag": "Fully synthetic benchmark",
    418       "detail": "All 395 tasks were generated by Gemini 1.5 Pro from API specifications; the paper acknowledges that synthetic tasks use sparse optional parameters and placeholder values, limiting real-world transferability."
    419     },
    420     {
    421       "flag": "Only 4 APIs",
    422       "detail": "Coverage limited to Asana, Google Calendar, Google Sheets, and Slack. Per-API breakdowns (Tables 12-17) reveal high variance in model performance (e.g., Google Calendar consistently best), but generalization to other APIs is unvalidated."
    423     },
    424     {
    425       "flag": "No statistical significance tests",
    426       "detail": "Comparative claims ('significantly improves') are made without hypothesis tests. With 395 samples and small absolute differences between some models, some comparisons may not be statistically distinguishable."
    427     },
    428     {
    429       "flag": "Average gain excludes zero-baseline models",
    430       "detail": "The '90%' average gain explicitly excludes models that achieved 0% unconstrained (Qwen2.5-Coder, Llama 3.1 variants shown as '+inf%'). Including these would inflate the reported average; the selection criterion biases the headline number."
    431     },
    432     {
    433       "flag": "Base models only",
    434       "detail": "Instruction-tuned models (the typical practitioner choice) are excluded due to parsing difficulties with their outputs. Results may substantially underestimate capability of deployed models."
    435     },
    436     {
    437       "flag": "No latency or cost reporting",
    438       "detail": "Constrained decoding has real overhead (timeout errors appear in Tables 10-11), but no inference time or cost comparison is provided, which is critical for practical adoption assessment."
    439     },
    440     {
    441       "flag": "Training cutoffs not stated",
    442       "detail": "API specifications are public and likely in training data; without stating training cutoffs, the degree of memorization vs. generalization cannot be assessed."
    443     }
    444   ],
    445   "cited_papers": [
    446     {
    447       "title": "Monitor-Guided Decoding of Code LMs with Static Analysis of Repository Context",
    448       "relevance": "Key related constrained decoding approach for local method APIs; paper explicitly contrasts its web API approach against MGD's scope and capabilities."
    449     },
    450     {
    451       "title": "Evaluating Large Language Models Trained on Code (Codex/HumanEval)",
    452       "relevance": "Foundational benchmark paper for code LLM evaluation; cited for functional testing evaluation methodology."
    453     },
    454     {
    455       "title": "StarCoder 2 and The Stack v2: The Next Generation",
    456       "relevance": "One of the primary open-source code models evaluated in WAPIIBench."
    457     },
    458     {
    459       "title": "DeepSeek-Coder: When the Large Language Model Meets Programming",
    460       "relevance": "Key open-source code model family evaluated; best open-source full-completion performance comes from Code Llama, but DeepSeek is extensively compared."
    461     },
    462     {
    463       "title": "Gorilla: Large Language Model Connected with Massive APIs",
    464       "relevance": "Closely related work on LLMs using APIs with RAG; the paper positions constrained decoding as providing correctness guarantees that RAG cannot."
    465     },
    466     {
    467       "title": "API-Bank: A Comprehensive Benchmark for Tool-Augmented LLMs",
    468       "relevance": "Related tool-use benchmark; contrasted with WAPIIBench's focus on REST API invocation code rather than tool-calling interfaces."
    469     },
    470     {
    471       "title": "Berkeley Function Calling Leaderboard",
    472       "relevance": "Contemporary benchmark for API/function calling; cited as related evaluation infrastructure with different scope (tool APIs vs. REST integration code)."
    473     },
    474     {
    475       "title": "Efficient Guided Generation for Large Language Models",
    476       "relevance": "Key constrained decoding framework; cited as production-quality alternative to authors' custom implementation."
    477     },
    478     {
    479       "title": "Bugs in large language models generated code: an empirical study",
    480       "relevance": "Empirical evidence for hallucination in LLM code generation; corroborates WAPIIBench findings about function/argument hallucination."
    481     },
    482     {
    483       "title": "Qwen2.5-Coder Technical Report",
    484       "relevance": "Open-source code model family evaluated; showed unexpected 0% performance in full completion despite strong leaderboard results."
    485     }
    486   ],
    487   "engagement_factors": {
    488     "practical_relevance": {
    489       "score": 3,
    490       "justification": "API integration is ubiquitous in software development; the benchmark (GitHub) and constraint generator (OpenAPI → regex) are directly usable by practitioners building coding assistants."
    491     },
    492     "surprise_contrarian": {
    493       "score": 2,
    494       "justification": "Multiple results are surprising: some Qwen/Llama models achieve 0% despite leaderboard rankings; larger models aren't consistently better; constrained decoding yields 90-135% gains without any model modification."
    495     },
    496     "fear_safety": {
    497       "score": 1,
    498       "justification": "Security of LLM-generated code is briefly mentioned in the introduction, but the paper doesn't focus on security risks of hallucinated API calls."
    499     },
    500     "drama_conflict": {
    501       "score": 1,
    502       "justification": "The finding that popular models (Qwen2.5-Coder, Llama 3.1) fail completely on full completion despite strong benchmarks is mildly provocative but not framed confrontationally."
    503     },
    504     "demo_ability": {
    505       "score": 3,
    506       "justification": "WAPIIBench is available on GitHub and can be run against any Hugging Face model; practitioners can immediately test their models against the benchmark."
    507     },
    508     "brand_recognition": {
    509       "score": 1,
    510       "justification": "TU Darmstadt and hessian.AI are not globally famous labs; GPT-4o is a famous product being evaluated but not created here."
    511     }
    512   },
    513   "hn_data": {
    514     "threads": [
    515       {
    516         "hn_id": "45381333",
    517         "title": "Federation of Agents: Semantics-Aware, Large-Scale Communication Fabric",
    518         "points": 3,
    519         "comments": 0,
    520         "url": "https://news.ycombinator.com/item?id=45381333",
    521         "created_at": "2025-09-26T01:02:53Z"
    522       }
    523     ],
    524     "top_points": 3,
    525     "total_points": 3,
    526     "total_comments": 0
    527   }
    528 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs