scan.json (23564B)
1 { 2 "paper": { 3 "title": "Guiding LLMs The Right Way: Fast, Non-Invasive Constrained Generation", 4 "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"], 5 "year": 2024, 6 "venue": "International Conference on Machine Learning", 7 "arxiv_id": "2403.06988", 8 "doi": "10.48550/arXiv.2403.06988" 9 }, 10 "scan_version": 2, 11 "active_modules": ["experimental_rigor", "data_leakage"], 12 "methodology_tags": ["benchmark-eval"], 13 "key_findings": "DOMINO, a minimally-invasive constrained decoding algorithm, enforces context-free grammar constraints during LLM generation without accuracy loss and often with speedup (up to 2.71x) over unconstrained decoding. Naive constrained decoding methods cause token misalignment that can reduce task accuracy by up to 11 percentage points (GSM8K with GUIDANCE). The paper introduces speculative decoding for constrained generation and pre-computed subterminal trees for efficient vocabulary-grammar alignment.", 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "No repository URL or code archive is provided in the paper. No mention of code release." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": true, 24 "justification": "The paper uses publicly available benchmarks: GSM8K and CoNLL-2003, which are standard public datasets. No proprietary data was collected." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper mentions NVIDIA A100 40GB and H100 80GB GPUs and the transformers and llama.cpp backends, but does not provide a requirements.txt, Dockerfile, or detailed library versions." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "No step-by-step reproduction instructions, README, or scripts are provided." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "Results in Tables 2-4 report only point estimates (e.g., accuracy 0.415, throughput 1.77x) with no confidence intervals or error bars." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper claims DOMINO outperforms baselines but provides no statistical significance tests — comparisons are made by directly comparing numbers." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports relative performance (e.g., '1.77× throughput', accuracy changes from 0.415 to 0.308) with baseline context, allowing the reader to gauge effect magnitude." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "400 test samples are used from each dataset and 100 repetitions for throughput experiments, but no justification for these choices is provided." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "No standard deviations or variance across runs are reported. Throughput experiments use 100 repetitions but only report aggregate numbers without spread measures." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares against unconstrained generation, GUIDANCE (template and CFG variants), and llama.cpp grammars across all experiments (Tables 2, 3)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "GUIDANCE, llama.cpp, and SYNCHROMESH are contemporary constrained decoding methods. Also references GCD (2023) and PICARD (2021). Table 1 provides a comprehensive comparison." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Table 4 ablates the lookahead parameter k. 
Figure 5 studies the number of speculative tokens s. The paper also compares opportunistic masking vs speculative decoding modes." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper reports task accuracy, well-formedness, perplexity, and throughput (performance impact) in Table 2. Additional throughput comparisons in Table 3." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "Human evaluation is not relevant here — the claims are about format validity, task accuracy on benchmarks, and throughput, all of which are objectively measurable." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "The paper uses 400 test samples from the test splits of GSM8K and CoNLL-2003, with few-shot demonstrations from the training split (§4.1)." 94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down per dataset (GSM8K, CoNLL-2003), per model (Mistral 7B, Llama-2 13B), and per grammar type (JSON, JSON with schema, C, XML, Fixed Template) in Tables 2-3." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper discusses failure cases: Figure 1 shows how naive constraining distorts tokenization, Figure 2 shows template-induced perplexity explosion, Table 4 shows k=0 and k=1 cause accuracy drops, and the paper notes C grammar is hardest for speculative decoding." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper reports that C code generation incurs overhead (0.78x) and speculative decoding is not effective for free-form JSON (§4.3). Lower k values cause severe accuracy degradation (Table 4)." 
109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims (1) existing methods impair accuracy, (2) DOMINO enforces constraints with subword alignment, (3) virtually no overhead and up to 2x speedup. All are supported by Tables 2-4 and Figure 5." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper makes causal claims about token misalignment causing accuracy drops, supported by controlled ablations: same prompts, same models, same datasets, varying only the constraining method. The lookahead ablation (Table 4) provides additional controlled evidence." 121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The paper tests on only two LLMs (Mistral 7B, Llama-2 13B) and two task-accuracy datasets (GSM8K, CoNLL-2003), but the title and abstract make general claims about constrained generation for LLMs without bounding to these specific models or tasks." 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": false, 130 "justification": "No discussion of alternative explanations for accuracy differences (e.g., could the accuracy changes be due to prompt sensitivity rather than tokenization misalignment specifically?). No threats-to-validity or robustness checks." 131 }, 132 "proxy_outcome_distinction": { 133 "applies": true, 134 "answer": true, 135 "justification": "The paper directly measures what it claims: task accuracy on GSM8K/CoNLL-2003, throughput in tokens/second, and format validity. No proxy gap exists — measurements match claims." 136 } 137 }, 138 "setup_transparency": { 139 "model_versions_specified": { 140 "applies": true, 141 "answer": true, 142 "justification": "The paper specifies Mistral 7B (citing Jiang et al., 2023) and Llama-2 13B (citing Touvron et al., 2023c). 
These are specific model versions with known architectures and weights." 143 }, 144 "prompts_provided": { 145 "applies": true, 146 "answer": true, 147 "justification": "Appendix C and D provide the actual grammars and prompts used for all experiments. Full prompt text and JSON format examples are included." 148 }, 149 "hyperparameters_reported": { 150 "applies": true, 151 "answer": true, 152 "justification": "Temperature 1.0 is stated for throughput experiments (§4.3), max tokens 128, 5-shot prompting for accuracy experiments. The speculative decoding parameter s is studied in Figure 5." 153 }, 154 "scaffolding_described": { 155 "applies": false, 156 "answer": false, 157 "justification": "No agentic scaffolding is used. DOMINO is a decoding-time algorithm, not an agent." 158 }, 159 "data_preprocessing_documented": { 160 "applies": true, 161 "answer": true, 162 "justification": "The paper describes how few-shot demonstrations were constructed (manually creating JSON responses from training split examples, §4.1), the 400 test sample subset, and the prompt format (Appendix D)." 163 } 164 }, 165 "limitations_and_scope": { 166 "limitations_section_present": { 167 "applies": true, 168 "answer": false, 169 "justification": "There is no dedicated limitations or threats-to-validity section. The paper has only a brief Impact Statement that does not discuss limitations." 170 }, 171 "threats_to_validity_specific": { 172 "applies": true, 173 "answer": false, 174 "justification": "No specific threats to validity are discussed anywhere in the paper." 175 }, 176 "scope_boundaries_stated": { 177 "applies": true, 178 "answer": false, 179 "justification": "The paper does not explicitly state what settings or models are NOT covered. It evaluates on only 2 models and 2 accuracy benchmarks but does not acknowledge these scope limitations." 
180 } 181 }, 182 "data_integrity": { 183 "raw_data_available": { 184 "applies": true, 185 "answer": false, 186 "justification": "No raw experimental data (generated outputs, token-level logs, per-sample results) are provided. Only aggregate tables." 187 }, 188 "data_collection_described": { 189 "applies": true, 190 "answer": true, 191 "justification": "The paper describes the data sources (GSM8K test split, CoNLL-2003 test split), the sampling procedure (400 test samples, 5 prompts per workload), and the experimental protocol (100 repetitions, 10 warmup)." 192 }, 193 "recruitment_methods_described": { 194 "applies": false, 195 "answer": false, 196 "justification": "No human participants. Data sources are standard public benchmarks." 197 }, 198 "data_pipeline_documented": { 199 "applies": true, 200 "answer": true, 201 "justification": "The pipeline is documented: prompts are constructed with 5-shot demonstrations from training split, JSON format responses are manually created, models generate output under various constraints, outputs are evaluated for accuracy, validity, and perplexity." 202 } 203 }, 204 "conflicts_of_interest": { 205 "funding_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "The Acknowledgements section states: 'This work has received funding from the Swiss State Secretariat for Education, Research and Innovation (SERI)' under the SAFEAI grant." 209 }, 210 "affiliations_disclosed": { 211 "applies": true, 212 "answer": true, 213 "justification": "All authors are listed as affiliated with ETH Zurich, Department of Computer Science. No commercial product is being evaluated." 214 }, 215 "funder_independent_of_outcome": { 216 "applies": true, 217 "answer": true, 218 "justification": "The funder is SERI (Swiss government research agency) via an ERC-style grant for AI safety research. The funder has no financial stake in DOMINO's performance." 
219 }, 220 "financial_interests_declared": { 221 "applies": true, 222 "answer": false, 223 "justification": "No competing interests statement is present. All three authors are also authors of LMQL, one of the tools compared against, which is not disclosed as a potential conflict." 224 } 225 }, 226 "contamination": { 227 "training_cutoff_stated": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper does not state the training data cutoff dates for Mistral 7B or Llama-2 13B." 231 }, 232 "train_test_overlap_discussed": { 233 "applies": true, 234 "answer": false, 235 "justification": "No discussion of whether GSM8K or CoNLL-2003 examples appeared in the models' training data. Both benchmarks predate these models." 236 }, 237 "benchmark_contamination_addressed": { 238 "applies": true, 239 "answer": false, 240 "justification": "GSM8K (2021) and CoNLL-2003 (2003) were both available online before Mistral and Llama-2 were trained, creating contamination risk. This is not discussed." 241 } 242 }, 243 "human_studies": { 244 "pre_registered": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants in this study." 248 }, 249 "irb_or_ethics_approval": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants in this study." 253 }, 254 "demographics_reported": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants in this study." 258 }, 259 "inclusion_exclusion_criteria": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants in this study." 263 }, 264 "randomization_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants in this study." 268 }, 269 "blinding_described": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants in this study."
273 }, 274 "attrition_reported": { 275 "applies": false, 276 "answer": false, 277 "justification": "No human participants in this study." 278 } 279 }, 280 "cost_and_practicality": { 281 "inference_cost_reported": { 282 "applies": true, 283 "answer": true, 284 "justification": "The paper extensively reports throughput (tokens/second) and relative performance overhead for all methods. Figure 5 shows tokens/second for different configurations. Table 3 reports relative throughput vs unconstrained baseline." 285 }, 286 "compute_budget_stated": { 287 "applies": true, 288 "answer": false, 289 "justification": "The paper mentions GPU types (A100, H100) but does not state total GPU hours, wall-clock time for experiments, or total computational budget. Precomputation time (1-5s, C at ~20s) is noted but total experiment budget is not." 290 } 291 }, 292 "experimental_rigor": { 293 "seed_sensitivity_reported": { 294 "applies": true, 295 "answer": false, 296 "justification": "No reporting of results across multiple random seeds. The paper uses temperature 1.0 for throughput but does not report seed sensitivity for accuracy experiments." 297 }, 298 "number_of_runs_stated": { 299 "applies": true, 300 "answer": true, 301 "justification": "The paper states 100 repetitions per configuration for throughput experiments (§4.3) and 10 warmup repetitions. For accuracy, 400 test samples are evaluated but the number of runs is not explicitly stated (appears to be single-run with greedy/sampled decoding)." 302 }, 303 "hyperparameter_search_budget": { 304 "applies": true, 305 "answer": false, 306 "justification": "No hyperparameter search budget is reported. The lookahead k and speculative tokens s are studied but the selection process for other settings is not documented." 
307 }, 308 "best_config_selection_justified": { 309 "applies": true, 310 "answer": true, 311 "justification": "The paper systematically studies the key parameters (k in Table 4, s in Figure 5) rather than just reporting the best configuration. The choice of k=∞ is justified by the ablation showing lower values degrade accuracy." 312 }, 313 "multiple_comparison_correction": { 314 "applies": false, 315 "answer": false, 316 "justification": "No statistical tests are performed at all, so multiple comparison correction is not applicable." 317 }, 318 "self_comparison_bias_addressed": { 319 "applies": true, 320 "answer": false, 321 "justification": "The authors created DOMINO and are also authors of LMQL (one comparison point). They implement GUIDANCE programs and llama.cpp baselines themselves. No acknowledgment of self-comparison bias." 322 }, 323 "compute_budget_vs_performance": { 324 "applies": true, 325 "answer": true, 326 "justification": "The paper's core contribution is precisely about the compute-performance tradeoff. Table 3 compares throughput across methods. Figure 5 shows performance as a function of speculative tokens. The precomputation cost is noted (1-5s, C at 20s)." 327 }, 328 "benchmark_construct_validity": { 329 "applies": true, 330 "answer": false, 331 "justification": "No discussion of whether GSM8K math reasoning and CoNLL-2003 NER are appropriate benchmarks for measuring the impact of constrained decoding on general task accuracy. These are relatively narrow task types." 332 }, 333 "scaffold_confound_addressed": { 334 "applies": false, 335 "answer": false, 336 "justification": "No scaffolding is involved in this work. DOMINO is a decoding-time algorithm applied directly to LLM inference." 337 } 338 }, 339 "data_leakage": { 340 "temporal_leakage_addressed": { 341 "applies": true, 342 "answer": false, 343 "justification": "GSM8K (2021) and CoNLL-2003 (2003) predate the models used.
No discussion of whether models may have seen these benchmarks during training." 344 }, 345 "feature_leakage_addressed": { 346 "applies": true, 347 "answer": false, 348 "justification": "No discussion of whether the 5-shot demonstrations or prompting format leaks information about expected answers." 349 }, 350 "non_independence_addressed": { 351 "applies": true, 352 "answer": false, 353 "justification": "No discussion of independence between training data and test examples for the models used." 354 }, 355 "leakage_detection_method": { 356 "applies": true, 357 "answer": false, 358 "justification": "No leakage detection or prevention methods are used or discussed." 359 } 360 } 361 }, 362 "claims": [ 363 { 364 "claim": "Naive constrained decoding methods significantly impair task accuracy due to token misalignment, reducing GSM8K accuracy from 41.5% to 30.8% (Mistral 7B with GUIDANCE).", 365 "evidence": "Table 2 shows accuracy drops for GUIDANCE across all model/dataset combinations. The largest drop is 11 percentage points on GSM8K with GUIDANCE on Llama-2 13B (0.262 → 0.152).", 366 "supported": "strong" 367 }, 368 { 369 "claim": "DOMINO achieves minimally-invasive constraining with no accuracy loss compared to unconstrained generation.", 370 "evidence": "Table 2 shows DOMINO matching or slightly exceeding unconstrained accuracy on all 4 model/dataset combinations (e.g., 0.418 vs 0.415 on GSM8K Mistral 7B).", 371 "supported": "moderate" 372 }, 373 { 374 "claim": "DOMINO achieves up to 2.71x throughput over unconstrained generation via speculative decoding.", 375 "evidence": "Table 2 reports 2.71x for CoNLL-2003 with Llama-2 13B. 
Table 3 shows up to 1.91x for fixed templates with Llama-2 13B.", 376 "supported": "moderate" 377 }, 378 { 379 "claim": "Template-based constrained generation causes perplexity explosion and different output compared to unconstrained generation.", 380 "evidence": "Figure 2 shows template-based output has perplexity 24.50-26.75 vs 4.17 for unconstrained, and naturalized template output reaches 49.39 perplexity.", 381 "supported": "moderate" 382 } 383 ], 384 "red_flags": [ 385 { 386 "flag": "No error bars or variance", 387 "detail": "All accuracy and throughput numbers are point estimates with no variance, confidence intervals, or statistical tests despite claiming superiority over baselines." 388 }, 389 { 390 "flag": "No limitations section", 391 "detail": "The paper has no limitations, threats to validity, or scope boundaries section despite making broad claims about constrained decoding for LLMs based on only 2 models." 392 }, 393 { 394 "flag": "Self-comparison bias", 395 "detail": "All three authors are also authors of LMQL. The authors implement all baselines (GUIDANCE programs, llama.cpp grammars) themselves without acknowledging potential bias in baseline implementation quality." 396 }, 397 { 398 "flag": "Very narrow model evaluation", 399 "detail": "Only two relatively small models (7B and 13B) are tested for accuracy. No evaluation on larger models or instruction-tuned variants where constrained decoding behavior may differ significantly." 400 }, 401 { 402 "flag": "Contamination not addressed", 403 "detail": "GSM8K and CoNLL-2003 predate both models, creating contamination risk that could affect absolute accuracy numbers (though relative comparisons between decoding methods are less affected)." 404 } 405 ], 406 "cited_papers": [ 407 { 408 "title": "Evaluating large language models trained on code", 409 "authors": ["Mark Chen", "Jerry Tworek"], 410 "year": 2021, 411 "relevance": "Codex/HumanEval benchmark foundational to LLM code generation evaluation."
412 }, 413 { 414 "title": "Prompting is programming: A query language for large language models", 415 "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"], 416 "year": 2023, 417 "relevance": "LMQL constrained generation framework, direct comparison baseline." 418 }, 419 { 420 "title": "Efficient guided generation for large language models", 421 "authors": ["Brandon T. Willard", "Rémi Louf"], 422 "year": 2023, 423 "relevance": "Introduces the Outlines framework for constrained generation with pre-computed regex checkers." 424 }, 425 { 426 "title": "Synchromesh: Reliable code generation from pre-trained language models", 427 "authors": ["Gabriel Poesia"], 428 "year": 2022, 429 "relevance": "Prior work on minimally invasive constrained decoding for code generation." 430 }, 431 { 432 "title": "Grammar-constrained decoding for structured NLP tasks without finetuning", 433 "authors": ["Saibo Geng"], 434 "year": 2023, 435 "relevance": "GCD constrained decoding method, direct comparison." 436 }, 437 { 438 "title": "PICARD: Parsing incrementally for constrained auto-regressive decoding from language models", 439 "authors": ["Torsten Scholak"], 440 "year": 2021, 441 "relevance": "Online parser-guided constrained decoding for SQL generation." 442 }, 443 { 444 "title": "Accelerating large language model decoding with speculative sampling", 445 "authors": ["Charlie Chen"], 446 "year": 2023, 447 "relevance": "Speculative decoding technique that DOMINO adapts for constrained generation." 448 }, 449 { 450 "title": "Training verifiers to solve math word problems", 451 "authors": ["Karl Cobbe"], 452 "year": 2021, 453 "relevance": "GSM8K benchmark used for evaluating constrained decoding accuracy impact." 454 } 455 ] 456 }