scan.json (24048B)
1 { 2 "paper": { 3 "title": "Play by the Type Rules: Inferring Constraints for LLM Functions in Declarative Programs", 4 "authors": ["Parker Glenn", "Alfy Samuel", "Daben Liu"], 5 "year": 2025, 6 "venue": "arXiv", 7 "arxiv_id": "2509.20208", 8 "doi": "10.48550/arXiv.2509.20208" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "Type-constrained decoding for LLM functions in SQL-like query languages improves accuracy by up to 7% on HybridQA multi-hop QA with 53% latency improvement over comparable systems. Small language models (3b) can approach larger model (8b) performance as function executors when given appropriate type constraints. BlendSQL's DB-first approach reduces latency from 1.7s to 0.76s versus LOTUS on TAG-Bench.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "GitHub repository provided: https://github.com/parkervg/blendsql. Specific version blendsql==0.0.48 mentioned in Appendix B." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "Uses publicly available datasets: HybridQA (Chen et al., 2020) and TAG-Bench (Biswal et al., 2024). No proprietary data collected." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": true, 29 "justification": "Appendix B specifies blendsql==0.0.48, llama-cpp-python 0.3.16, specific llama.cpp commit hash, lotus-ai==1.1.3, ollama 0.6.7, specific model quants (Q4_K_M), GPU hardware (RTX 5080, A10, A100)." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "While code is released and environment details provided, no step-by-step reproduction instructions or scripts to replicate the main experiments are described. Appendix B mentions specific files (tag_queries.py, hand_written.py) but no README or reproduction guide." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Table 2 reports ± for latency (0.76 +/- 0.002) but the main accuracy results in Table 3 and Figure 3 report only point estimates with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "No statistical significance tests are reported. Claims like '7% accuracy improvement' and comparisons between typing policies are based on raw number differences without any test." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "Absolute improvements with baselines are provided throughout: e.g., denotation accuracy rises by 6.6 points with type constraints (Section 5.1), 53% latency improvement (Table 2), and full accuracy/F1/denotation tables (Table 3) give baseline context." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "Uses first 1,000 examples from HybridQA validation set and 60 TAG-Bench questions without justifying why these sizes or subsets were chosen." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Latency results in Table 2 are averaged over 5 runs with std dev, but the main accuracy results (Table 3, Figure 3) appear to be single-run with no variance reported." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "Multiple baselines: No Context, All Context, RAG, and LOTUS comparison (Table 2, Table 3). Also compares three typing policies (No Type Hints, Type Hints, Type Hints + Constrained Decoding)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "LOTUS (Patel et al., 2024) is contemporary. Models used include Llama 3 series (2024) and gemma-3-12b-it (2025). 
TAG-Bench and HybridQA are established benchmarks." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Figure 3 ablates the three typing policies. Figure 6 ablates documentation vs. few-shot only. Figure 7 ablates CFG-constrained generation. Figure 8 shows hyperparameter sweeps for k values." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "Reports Accuracy, F1, and Denotation Accuracy on HybridQA (Table 3). Reports latency and average tokens per program on TAG-Bench (Table 2)." 84 }, 85 "human_evaluation": { 86 "applies": true, 87 "answer": false, 88 "justification": "No human evaluation is included. All evaluation is automated via exact match, F1, and denotation accuracy metrics." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": false, 93 "justification": "Results are reported on the HybridQA validation set, not a held-out test set. The paper states 'first 1,000 examples from the HybridQA validation set.'" 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Figure 3 provides per-model-size breakdowns across all parsing/execution model combinations. Table 4 categorizes execution errors. Figure 9 shows per-sample latency." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Table 4 categorizes execution errors (Empty LLMQA Context, Generic SQLite Syntax, Hallucinated Column, etc.). Section 5.2 discusses 102/1000 execution failures. Appendix A.1 explores syntax errors." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "Figure 7 shows CFG-constrained generation actually harms 70b model performance. Figure 6 shows removing documentation can moderately improve smaller models. The 1b model shows steep performance drop-off." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "Abstract claims '7% accuracy improvement' and '53% improvement in latency' are supported by Table 3 (type constraints lift) and Table 2 (1.7s to 0.76s). 'Small language models can excel as function executors' is supported by Table 3." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "Causal claims about type constraints improving performance are supported by controlled ablation (Figure 3) varying only the typing policy while holding other factors constant. The claim is about the mechanism (constrained decoding), and the ablation isolates this variable." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The conclusion states the approach 'can be extended to any typed declarative programming language' but evaluation is only on SQLite-based BlendSQL with two datasets (HybridQA and TAG-Bench). The title and framing are broader than the tested setting." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for observed improvements. For example, no discussion of whether constrained decoding simply eliminates formatting errors vs. actually improving reasoning, or whether improvements are dataset-specific." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "Claims are stated at the granularity of the measurements: denotation accuracy on HybridQA, latency on TAG-Bench. No broader framing like 'reasoning ability' without qualification." 
136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "Specific model versions given: Llama-3.2-1B-Instruct, Llama-3.2-3B-Instruct, Llama-3.1-8B-Instruct, Llama-3.3-70B-Instruct, gemma-3-12b-it. Specific quant (Q4_K_M) and commit hashes provided." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Full prompt templates for LLMQA (Figure 4) and LLMMAP (Figure 5) are provided with complete text including all template variables and formatting." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": false, 152 "justification": "Max context length of 8000 is stated. Retrieval k values reported. However, LLM generation hyperparameters (temperature, top-p, max tokens) are not reported for the model inference calls." 153 }, 154 "scaffolding_described": { 155 "applies": true, 156 "answer": true, 157 "justification": "The BlendSQL execution pipeline is described in detail: AST parsing, depth-first traversal, function execution flow (Algorithm 1, Figure 2), temporary table creation, and query compilation to SQL." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "Section 4.2 describes the evaluation setup: 1,000 HybridQA validation examples, article text split into sentences for search index, lowercase normalization, k values for retrieval. Table 2 setup is described in Appendix B." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "No dedicated limitations or threats-to-validity section is present in the paper." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No threats to validity are discussed. The paper does not address specific limitations of the study design." 
175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "No explicit scope boundaries are stated. The conclusion broadens claims ('can be extended to any typed declarative programming language') without stating what was NOT tested." 180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental outputs, model predictions, or per-example results are released. Only aggregate numbers in tables and figures." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "Data sources are clearly described: HybridQA validation set (Chen et al., 2020), TAG-Bench from BIRD-SQL (Li et al., 2023). Evaluation procedure is documented." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data comes from standard public benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The execution pipeline from question to answer is documented: query parsing → AST traversal → function execution → type constraint inference → constrained decoding → SQL compilation (Algorithm 1, Figure 2)." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": false, 208 "justification": "No funding sources or acknowledgments section is present. Authors are from Capital One, a financial company." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "Author affiliations with Capital One are clearly stated in the header." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": false, 218 "justification": "Capital One is a financial services company. 
While not directly selling the evaluated tool, the work is corporate-sponsored research with no funding independence discussion." 219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests or financial interests statement is present in the paper." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "No training data cutoff dates are stated for any of the Llama or Gemma models used. HybridQA (2020) and TAG-Bench could potentially be in training data." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether HybridQA or TAG-Bench data appeared in the training sets of the Llama 3 or Gemma models." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "HybridQA was published in 2020. The Llama 3 models (2024) were likely trained on data that could include HybridQA. This is not addressed. The 'No Context' baseline showing some accuracy (up to 6.6% for 70b) suggests parametric knowledge overlap." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 
268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study." 273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "Latency is reported: 0.76s per sample for BlendSQL vs 1.7s for LOTUS (Table 2). Average tokens per program (76 vs 127) also reported." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": true, 289 "justification": "Hardware specified: 4x24GB A10 GPUs for smaller models, 4x80GB A100 for 70b, RTX 5080 for latency experiments (Appendix B). Max context length 8000." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "Latency results averaged over 5 runs, but accuracy results appear to be single-run with no seed sensitivity analysis." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": false, 301 "justification": "Table 2 states 'average runtime across 5 runs' for latency, but the main accuracy experiments (Table 3, Figure 3) do not state the number of runs." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "Figure 8 shows hyperparameter sweeps for k values but no total search budget or method is described for other hyperparameters." 307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": false, 311 "justification": "The selection of k=1 for LLMSEARCHMAP and k=10 for LLMQA is shown in Figure 8 but the selection criterion (which split was used) is not described." 
312 }, 313 "multiple_comparison_correction": { 314 "applies": true, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, so no multiple comparison correction is applied, even though many comparisons are made across model sizes and typing policies." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "Authors compare their BlendSQL system against LOTUS using their own implementations. No acknowledgment of potential bias in re-implementing baselines." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "Table 2 explicitly compares performance at matched hardware (same RTX 5080, same quantized model) and across different hardware/model configurations. Latency and tokens per program allow compute-aware comparison." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether HybridQA or TAG-Bench actually measure the capabilities claimed (multi-hop reasoning, type alignment). The benchmarks are taken at face value." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": true, 335 "answer": false, 336 "justification": "BlendSQL and LOTUS use different scaffolding (SQL compilation vs Pandas API). When comparing systems (Table 2), the scaffold difference is not addressed — performance differences could stem from the scaffold rather than the typing approach." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "HybridQA was published in 2020, Llama 3 models trained in 2024. No discussion of whether models saw HybridQA data during training. The 'No Context' baseline achieving up to 6.6% accuracy suggests some memorization." 
344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the evaluation setup leaks information, e.g., whether type hints or constrained decoding provide answer-leaking signal." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between HybridQA examples or potential overlap with training data." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention method is used despite the temporal gap between benchmark creation (2020) and model training (2024)." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Type Hints + Constrained Decoding improves denotation accuracy by up to 7% over unconstrained generation on HybridQA", 365 "evidence": "Figure 3 shows consistent improvements across all model size combinations. Biggest lift: 3b executor with 70b parser, 33.3→45.3 (+12 points) from No Type Hints to Type Hints + Constrained Decoding.", 366 "supported": "strong" 367 }, 368 { 369 "claim": "BlendSQL achieves 53% latency improvement over LOTUS on TAG-Bench", 370 "evidence": "Table 2: 0.76s vs 1.7s on same hardware (RTX 5080) with same model (Llama-3.1-8b-Instruct Q4_K_M), averaged over 5 runs.", 371 "supported": "strong" 372 }, 373 { 374 "claim": "Small language models (3b) can approach larger model (8b) performance as function executors when given program decompositions", 375 "evidence": "Table 3: 3b Program Execution achieves 45.3 denotation accuracy vs 8b RAG at 45.6, though the 3b uses 70b-generated programs while the 8b does its own end-to-end reasoning.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "The approach can be extended to any typed declarative programming language", 380 "evidence": "Section 7 conclusion claim. Only tested on SQLite-based BlendSQL. 
No evidence for other languages.", 381 "supported": "unsupported" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "No contamination analysis despite temporal gap", 387 "detail": "HybridQA was published in 2020 and models trained in 2024. The 'No Context' baseline shows 6.6% accuracy for the 70b model, suggesting parametric knowledge of the dataset. No contamination analysis is performed." 388 }, 389 { 390 "flag": "No limitations section", 391 "detail": "The paper lacks any discussion of limitations, threats to validity, or scope boundaries." 392 }, 393 { 394 "flag": "Company evaluating its own system", 395 "detail": "Capital One authors evaluating BlendSQL (created by first author Glenn et al., 2024) without acknowledging self-evaluation bias." 396 }, 397 { 398 "flag": "No statistical tests for accuracy claims", 399 "detail": "All accuracy comparisons are raw number differences with no significance tests, confidence intervals, or multi-run variance." 400 }, 401 { 402 "flag": "Validation set used as test set", 403 "detail": "Main results are reported on HybridQA validation set, not a held-out test set. Hyperparameter sweeps (Figure 8) appear to use the same data." 404 } 405 ], 406 "cited_papers": [ 407 { 408 "title": "BlendSQL: A Scalable Dialect for Unifying Hybrid Question Answering in Relational Algebra", 409 "authors": ["Parker Glenn", "Parag Dakle", "Liang Wang", "Preethi Raghavan"], 410 "year": 2024, 411 "relevance": "Predecessor system by same authors; foundational work on integrating LLM functions into SQL queries." 412 }, 413 { 414 "title": "Semantic Operators: A Declarative Model for Rich, AI-based Analytics over Text Data", 415 "authors": ["Liana Patel", "Siddharth Jha", "Parth Asawa"], 416 "year": 2024, 417 "arxiv_id": "2407.11418", 418 "relevance": "LOTUS system compared against in efficiency experiments; key baseline for LLM-DBMS integration." 
419 }, 420 { 421 "title": "Type-Constrained Code Generation with Language Models", 422 "authors": ["Niels Mündler", "Jingxuan He", "Hao Wang", "Koushik Sen", "Dawn Song", "Martin Vechev"], 423 "year": 2025, 424 "relevance": "Most related prior work on enforcing type safety in LLM code generation." 425 }, 426 { 427 "title": "Efficient Guided Generation for Large Language Models", 428 "authors": ["Brandon T Willard", "Rémi Louf"], 429 "year": 2023, 430 "arxiv_id": "2307.09702", 431 "relevance": "Foundational work on constrained decoding for LLMs used in the approach." 432 }, 433 { 434 "title": "Text2SQL is Not Enough: Unifying AI and Databases with TAG", 435 "authors": ["Asim Biswal", "Liana Patel", "Siddharth Jha"], 436 "year": 2024, 437 "arxiv_id": "2408.14717", 438 "relevance": "TAG-Bench dataset used for efficiency evaluation; demonstrates need for LLM-database integration." 439 }, 440 { 441 "title": "The Llama 3 Herd of Models", 442 "authors": ["Abhimanyu Dubey"], 443 "year": 2024, 444 "relevance": "Primary model family used across all experiments (1b, 3b, 8b, 70b variants)." 445 }, 446 { 447 "title": "Binding Language Models in Symbolic Languages", 448 "authors": ["Zhoujun Cheng", "Tianbao Xie", "Peng Shi"], 449 "year": 2023, 450 "relevance": "Denotation accuracy metric and approach to combining LLMs with symbolic languages." 451 }, 452 { 453 "title": "XGrammar: Flexible and Efficient Structured Generation Engine for Large Language Models", 454 "authors": ["Yixin Dong", "Charlie F Ruan"], 455 "year": 2024, 456 "arxiv_id": "2411.15100", 457 "relevance": "Optimized constrained decoding engine relevant to the grammar-constrained generation approach." 458 }, 459 { 460 "title": "Beyond Quacking: Deep Integration of Language Models and RAG into DuckDB", 461 "authors": ["Anas Dorbani", "Sunny Yasser", "Jimmy Lin", "Amine Mhedhbi"], 462 "year": 2025, 463 "relevance": "Alternative approach to integrating LLMs with DBMS (DuckDB UDFs)." 464 } 465 ] 466 }