scan.json (24572B)
1 { 2 "paper": { 3 "title": "SynCode: LLM Generation with Grammar Augmentation", 4 "authors": ["Shubham Ugare", "Tarun Suresh", "Hangoo Kang", "Sasa Misailovic", "Gagandeep Singh"], 5 "year": 2024, 6 "venue": "Trans. Mach. Learn. Res.", 7 "arxiv_id": "2403.01632" 8 }, 9 "scan_version": 2, 10 "active_modules": ["experimental_rigor", "data_leakage"], 11 "checklist": { 12 "artifacts": { 13 "code_released": { 14 "applies": true, 15 "answer": true, 16 "justification": "GitHub link provided: https://github.com/uiuc-focal-lab/syncode (stated in the abstract and Section 1)." 17 }, 18 "data_released": { 19 "applies": true, 20 "answer": true, 21 "justification": "The paper uses publicly available datasets: JSON-Mode-Eval (HuggingFace), Spider, HumanEval, and MBXP. All are standard public benchmarks." 22 }, 23 "environment_specified": { 24 "applies": true, 25 "answer": false, 26 "justification": "The paper mentions PyTorch, HuggingFace transformers, and the Lark library, and states hardware (48-core Intel Xeon Silver 4214R, 2 NVIDIA RTX A5000 GPUs) but does not provide a requirements.txt, Dockerfile, or specific versions of these libraries." 27 }, 28 "reproduction_instructions": { 29 "applies": true, 30 "answer": false, 31 "justification": "No step-by-step reproduction instructions are provided in the paper. A GitHub link is given but the paper itself does not contain commands or a reproduction guide." 32 } 33 }, 34 "statistical_methodology": { 35 "confidence_intervals_or_error_bars": { 36 "applies": true, 37 "answer": false, 38 "justification": "Results in Tables 1-4 are reported as point estimates without confidence intervals or error bars." 39 }, 40 "significance_tests": { 41 "applies": true, 42 "answer": false, 43 "justification": "The paper claims SynCode 'outperforms' baselines and 'improves' accuracy but no statistical significance tests are reported." 
44 }, 45 "effect_sizes_reported": { 46 "applies": true, 47 "answer": true, 48 "justification": "The paper reports percentage reductions (e.g., '96.07% of syntax errors'), absolute differences, and provides baseline context (e.g., Table 3 shows standard vs SynCode counts with % reduction)." 49 }, 50 "sample_size_justified": { 51 "applies": true, 52 "answer": false, 53 "justification": "No justification for the choice of sample sizes (e.g., n=20 samples per problem for code generation, 100 JSON problems). No power analysis." 54 }, 55 "variance_reported": { 56 "applies": true, 57 "answer": false, 58 "justification": "No variance, standard deviation, or spread measures are reported across runs. Single-run results appear to be reported for most experiments." 59 } 60 }, 61 "evaluation_design": { 62 "baselines_included": { 63 "applies": true, 64 "answer": true, 65 "justification": "Multiple baselines compared: Outlines, guidance, llama.cpp, and GCD for JSON; standard generation for SQL and code. Table 6 provides a comprehensive comparison of constrained decoding methods." 66 }, 67 "baselines_contemporary": { 68 "applies": true, 69 "answer": true, 70 "justification": "Baselines include contemporary constrained decoding tools: Outlines v0.1.1, guidance v0.1.16, llama.cpp v0.3.1, GCD, and Domino (2024). These are state-of-the-art at time of publication." 71 }, 72 "ablation_study": { 73 "applies": true, 74 "answer": true, 75 "justification": "Appendix A.4 provides ablation studies on incremental parsing (showing 9x speedup) and max new tokens. Section 6.4 analyzes mask store overhead separately." 76 }, 77 "multiple_metrics": { 78 "applies": true, 79 "answer": true, 80 "justification": "Multiple metrics used: syntax errors, validation accuracy, generation time (JSON); accuracy, execution %, tokens, time (SQL); syntax errors, pass@1, pass@10 (code)." 
81 }, 82 "human_evaluation": { 83 "applies": true, 84 "answer": false, 85 "justification": "No human evaluation of output quality. All evaluation is automated (compilers for syntax, unit tests for functional correctness, schema validation for JSON)." 86 }, 87 "held_out_test_set": { 88 "applies": true, 89 "answer": true, 90 "justification": "Standard public benchmarks are used as test sets (HumanEval, MBXP, Spider, JSON-Mode-Eval). No fine-tuning is done on these sets — SynCode modifies decoding, not model weights." 91 }, 92 "per_category_breakdown": { 93 "applies": true, 94 "answer": true, 95 "justification": "SQL results broken down by difficulty level (easy/medium/hard/extra hard) in Table 2. Code results broken down by model, language, and dataset in Tables 3-4." 96 }, 97 "failure_cases_discussed": { 98 "applies": true, 99 "answer": true, 100 "justification": "Section 6.3 and Appendix A.6 discuss failure cases — remaining syntax errors are due to LLM failing to halt before max token limit. An example is shown in Figure 11." 101 }, 102 "negative_results_reported": { 103 "applies": true, 104 "answer": true, 105 "justification": "Table 4 shows that SynCode provides only marginal improvement in functional correctness (pass@k), honestly reporting that syntactic correction does not substantially improve logical correctness." 106 } 107 }, 108 "claims_and_evidence": { 109 "abstract_claims_supported": { 110 "applies": true, 111 "answer": true, 112 "justification": "Abstract claims are supported: 'eliminates all syntax errors' for JSON (Table 1 shows 0 errors), '96.07% of syntax errors' reduction for Python/Go (Table 3 shows >90% across all configurations)." 113 }, 114 "causal_claims_justified": { 115 "applies": true, 116 "answer": true, 117 "justification": "Causal claims ('SynCode improves/reduces') are justified through controlled single-variable manipulation — same models tested with and without SynCode masking, all else equal. 
Ablation studies further isolate component contributions." 118 }, 119 "generalization_bounded": { 120 "applies": true, 121 "answer": false, 122 "justification": "The title claims 'LLM Generation with Grammar Augmentation' broadly, but the paper only evaluates on a subset of Python/Go grammar (excluding features like lambda functions), specific models (mostly small, 350M-7B parameters), and 4 languages. These bounds are mentioned in passing but the title/abstract are broader than the evidence." 123 }, 124 "alternative_explanations_discussed": { 125 "applies": true, 126 "answer": false, 127 "justification": "No discussion of alternative explanations. For example, the paper doesn't consider whether the JSON validation improvements might be partly due to shorter/simpler outputs rather than just syntactic correctness." 128 }, 129 "proxy_outcome_distinction": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper's claims match the granularity of measurements: syntax error counts, compilation rates, pass@k. It does not overclaim — e.g., it explicitly notes that syntactic correctness provides only 'slight improvement in functional correctness' (Section 6.3)." 133 } 134 }, 135 "setup_transparency": { 136 "model_versions_specified": { 137 "applies": true, 138 "answer": true, 139 "justification": "Specific model versions are stated: Llama-2-7B-chat, Gemma2-2B-it, LLaMA-7B, WizardCoder-1B, CodeGen-350M, Llama-3.2-1B, Llama-3.2-3B. These are specific enough to identify exact models." 140 }, 141 "prompts_provided": { 142 "applies": true, 143 "answer": true, 144 "justification": "Full prompts are provided in Appendix A.7: JSON prompts (Listings 1-2), SQL prompt (Listing 3), Python and Go prompts from HumanEval (Listings 4-5)." 
145 }, 146 "hyperparameters_reported": { 147 "applies": true, 148 "answer": true, 149 "justification": "Key hyperparameters reported: temperature=0.2, top_p=0.95 for code generation (Section 6.3), max_new_tokens=400 for JSON (Section 6.1), n=20 and n=1 samples, greedy decoding for SQL." 150 }, 151 "scaffolding_described": { 152 "applies": false, 153 "answer": false, 154 "justification": "SynCode is a constrained decoding framework, not agentic scaffolding. No agent loops, tools, or multi-step workflows are involved." 155 }, 156 "data_preprocessing_documented": { 157 "applies": true, 158 "answer": true, 159 "justification": "Data preprocessing is described: grammars specified in Appendix A.8, explicit prompts shown, baseline version numbers given, warmup runs performed. For SQL, additional baseline (Standard+) extraction via regex is described." 160 } 161 }, 162 "limitations_and_scope": { 163 "limitations_section_present": { 164 "applies": true, 165 "answer": false, 166 "justification": "No dedicated limitations section. Some limitations are scattered (e.g., grammar subset, max token limit), but there is no structured limitations or threats-to-validity discussion." 167 }, 168 "threats_to_validity_specific": { 169 "applies": true, 170 "answer": false, 171 "justification": "No threats-to-validity section. Specific limitations like the grammar subset constraint and non-termination issue are mentioned but not framed as validity threats." 172 }, 173 "scope_boundaries_stated": { 174 "applies": true, 175 "answer": false, 176 "justification": "Scope boundaries are implicit but not explicitly stated. The paper acknowledges the grammar is a 'substantial subset' and that SynCode 'does not support enforcing semantic constraints' but does not have a structured statement of what the results do NOT show." 
177 } 178 }, 179 "data_integrity": { 180 "raw_data_available": { 181 "applies": true, 182 "answer": false, 183 "justification": "No raw experimental data (individual outputs, per-example results) is released. Only aggregate results in tables." 184 }, 185 "data_collection_described": { 186 "applies": true, 187 "answer": true, 188 "justification": "Data sources are well-described: JSON-Mode-Eval (100 zero-shot problems), Spider (1,034 problems broken down by difficulty), HumanEval (164 problems), MBXP (974 problems). Section 5 details datasets." 189 }, 190 "recruitment_methods_described": { 191 "applies": false, 192 "answer": false, 193 "justification": "No human participants. Data sources are standard public benchmarks." 194 }, 195 "data_pipeline_documented": { 196 "applies": true, 197 "answer": true, 198 "justification": "The pipeline is documented: grammar → DFA mask store (offline) → inference with constrained decoding → evaluation via compilers/unit tests. Section 5 describes the experimental setup and evaluation methodology." 199 } 200 }, 201 "conflicts_of_interest": { 202 "funding_disclosed": { 203 "applies": true, 204 "answer": false, 205 "justification": "No funding or acknowledgments section found in the paper. One author has VMware Research affiliation." 206 }, 207 "affiliations_disclosed": { 208 "applies": true, 209 "answer": true, 210 "justification": "Author affiliations clearly listed: University of Illinois Urbana-Champaign and VMware Research (for Gagandeep Singh)." 211 }, 212 "funder_independent_of_outcome": { 213 "applies": true, 214 "answer": false, 215 "justification": "No funding information disclosed, so independence cannot be assessed. One author is affiliated with VMware Research." 216 }, 217 "financial_interests_declared": { 218 "applies": true, 219 "answer": false, 220 "justification": "No competing interests or financial interests statement found in the paper." 
221 } 222 }, 223 "contamination": { 224 "training_cutoff_stated": { 225 "applies": true, 226 "answer": false, 227 "justification": "No training data cutoff dates stated for any of the evaluated models (Llama-2, Gemma2, CodeGen, WizardCoder, LLaMA)." 228 }, 229 "train_test_overlap_discussed": { 230 "applies": true, 231 "answer": false, 232 "justification": "No discussion of whether HumanEval, MBXP, or Spider problems appeared in the training data of the evaluated models." 233 }, 234 "benchmark_contamination_addressed": { 235 "applies": true, 236 "answer": false, 237 "justification": "HumanEval was published in 2021, and models like Llama-2 (2023) likely trained on data containing it. This contamination risk is not discussed." 238 } 239 }, 240 "human_studies": { 241 "pre_registered": { 242 "applies": false, 243 "answer": false, 244 "justification": "No human participants in this study." 245 }, 246 "irb_or_ethics_approval": { 247 "applies": false, 248 "answer": false, 249 "justification": "No human participants in this study." 250 }, 251 "demographics_reported": { 252 "applies": false, 253 "answer": false, 254 "justification": "No human participants in this study." 255 }, 256 "inclusion_exclusion_criteria": { 257 "applies": false, 258 "answer": false, 259 "justification": "No human participants in this study." 260 }, 261 "randomization_described": { 262 "applies": false, 263 "answer": false, 264 "justification": "No human participants in this study." 265 }, 266 "blinding_described": { 267 "applies": false, 268 "answer": false, 269 "justification": "No human participants in this study." 270 }, 271 "attrition_reported": { 272 "applies": false, 273 "answer": false, 274 "justification": "No human participants in this study." 275 } 276 }, 277 "cost_and_practicality": { 278 "inference_cost_reported": { 279 "applies": true, 280 "answer": true, 281 "justification": "Generation time per prompt is reported in Tables 1-2 (e.g., 3.07s for SynCode JSON). 
Table 5 reports DFA mask store creation time. Section 6.4 analyzes memory overhead." 282 }, 283 "compute_budget_stated": { 284 "applies": true, 285 "answer": true, 286 "justification": "Hardware specified (48-core Intel Xeon Silver 4214R, 2 NVIDIA RTX A5000 GPUs). Mask store creation times and memory usage reported in Table 5 (e.g., 602s, 1.87GB for CodeGen-350M Python)." 287 } 288 }, 289 "experimental_rigor": { 290 "seed_sensitivity_reported": { 291 "applies": true, 292 "answer": false, 293 "justification": "No seed sensitivity analysis. Code generation uses temperature sampling (temp=0.2) but results across multiple seeds are not reported." 294 }, 295 "number_of_runs_stated": { 296 "applies": true, 297 "answer": true, 298 "justification": "Number of samples stated: n=20 per problem for HumanEval code generation, n=1 for MBXP (Section 6.3). Greedy decoding for SQL (deterministic, single run)." 299 }, 300 "hyperparameter_search_budget": { 301 "applies": true, 302 "answer": false, 303 "justification": "No hyperparameter search budget reported. The choice of temperature=0.2 and top_p=0.95 is not justified." 304 }, 305 "best_config_selection_justified": { 306 "applies": true, 307 "answer": false, 308 "justification": "No justification for why specific hyperparameters or grammar subsets were chosen. No description of configuration selection process." 309 }, 310 "multiple_comparison_correction": { 311 "applies": false, 312 "answer": false, 313 "justification": "No statistical significance tests are performed, so multiple comparison correction is not applicable." 314 }, 315 "self_comparison_bias_addressed": { 316 "applies": true, 317 "answer": false, 318 "justification": "The authors implement SynCode and compare it against baselines (Outlines, guidance, llama.cpp) but do not acknowledge the bias of evaluating their own system." 
319 }, 320 "compute_budget_vs_performance": { 321 "applies": true, 322 "answer": true, 323 "justification": "Generation time is reported alongside performance for all methods in Tables 1-2, allowing compute-performance comparison. The paper explicitly discusses SynCode's efficiency advantage." 324 }, 325 "benchmark_construct_validity": { 326 "applies": true, 327 "answer": false, 328 "justification": "No discussion of whether HumanEval, MBXP, or JSON-Mode-Eval actually measure the capabilities claimed. The paper uses these benchmarks without questioning their construct validity." 329 }, 330 "scaffold_confound_addressed": { 331 "applies": false, 332 "answer": false, 333 "justification": "No scaffolding is involved. SynCode is a constrained decoding method applied directly to model outputs." 334 } 335 }, 336 "data_leakage": { 337 "temporal_leakage_addressed": { 338 "applies": true, 339 "answer": false, 340 "justification": "No discussion of temporal leakage. HumanEval (2021) and MBXP could have been in training data for models released in 2023-2024." 341 }, 342 "feature_leakage_addressed": { 343 "applies": true, 344 "answer": false, 345 "justification": "No discussion of whether evaluation setup leaks information. The few-shot prompting setup provides examples that could prime the model." 346 }, 347 "non_independence_addressed": { 348 "applies": true, 349 "answer": false, 350 "justification": "No discussion of whether train and test data share structural similarities or overlap." 351 }, 352 "leakage_detection_method": { 353 "applies": true, 354 "answer": false, 355 "justification": "No leakage detection or prevention method used." 
356 } 357 } 358 }, 359 "claims": [ 360 { 361 "claim": "SynCode eliminates all syntax errors for JSON generation.", 362 "evidence": "Table 1 shows 0 syntax errors for both Llama-2-7B-chat and Gemma2-2B-it with SynCode, compared to 98 and 59 errors for standard generation (Section 6.1).", 363 "supported": "strong" 364 }, 365 { 366 "claim": "SynCode reduces 96.07% of syntax errors in generated Python and Go code on average.", 367 "evidence": "Table 3 shows >90% reduction across all model-dataset combinations for both Python and Go. Average computed across 6 configurations (Section 6.3).", 368 "supported": "strong" 369 }, 370 { 371 "claim": "SynCode achieves 100% JSON schema validation accuracy with Gemma2-2B-it on explicit prompts.", 372 "evidence": "Table 1 shows 100% validation accuracy for Gemma2-2B-it with SynCode on explicit prompts (Section 6.1).", 373 "supported": "strong" 374 }, 375 { 376 "claim": "SynCode improves both compilation rate and execution accuracy for SQL generation.", 377 "evidence": "Table 2 shows improvements across all 4 models: e.g., Llama-3.2-3B execution rate from 67.4% to 81.4%, accuracy from 28.6% to 34.9% (Section 6.2).", 378 "supported": "strong" 379 }, 380 { 381 "claim": "SynCode is faster than all baseline grammar-guided generation methods for JSON with Llama-2-7B-chat.", 382 "evidence": "Table 1: SynCode 3.07s vs llama.cpp 21.91s, guidance 5.14s, Outlines 38.07s (Section 6.1).", 383 "supported": "strong" 384 }, 385 { 386 "claim": "SynCode provides only slight improvement in functional correctness (pass@k).", 387 "evidence": "Table 4 shows marginal pass@1 and pass@10 improvements (e.g., pass@10 for LLaMA-7B Python: 17.1% → 18.9%). 
Section 6.3 acknowledges this honestly.", 388 "supported": "strong" 389 } 390 ], 391 "methodology_tags": ["benchmark-eval"], 392 "key_findings": "SynCode is a grammar-guided constrained decoding framework that uses a precomputed DFA mask store to efficiently filter syntactically invalid tokens during LLM generation. It eliminates all JSON syntax errors and reduces Python/Go syntax errors by 96% on average across multiple models and benchmarks. SynCode is provably sound (retains all valid tokens) and faster than competing constrained decoding approaches for JSON. However, functional correctness improvements are marginal, suggesting syntactic correctness alone does not substantially improve code logic.", 393 "red_flags": [ 394 { 395 "flag": "No statistical significance tests", 396 "detail": "All comparative claims ('outperforms', 'improves') are based solely on comparing point estimates without any significance testing, despite using stochastic sampling (temperature=0.2) for code generation." 397 }, 398 { 399 "flag": "No contamination analysis", 400 "detail": "HumanEval (2021) and other benchmarks were likely in training data for models released in 2023-2024. The paper does not discuss this, though it matters less here since SynCode modifies decoding rather than model capability." 401 }, 402 { 403 "flag": "Incomplete grammar coverage", 404 "detail": "Python and Go grammars are 'substantial subsets' excluding features like lambda functions. The 96% error reduction claim is for this subset, but the paper's framing suggests general applicability." 405 }, 406 { 407 "flag": "No limitations section", 408 "detail": "The paper lacks a dedicated limitations or threats-to-validity section despite several known limitations (grammar subset, non-termination, no semantic constraints)." 409 } 410 ], 411 "cited_papers": [ 412 { 413 "title": "Efficient Guided Generation for Large Language Models", 414 "authors": ["Brandon T. 
Willard", "Rémi Louf"], 415 "year": 2023, 416 "arxiv_id": "2307.09702", 417 "relevance": "Core baseline for constrained decoding (Outlines), directly compared against SynCode." 418 }, 419 { 420 "title": "Evaluating Large Language Models Trained on Code", 421 "authors": ["Mark Chen", "Jerry Tworek"], 422 "year": 2021, 423 "arxiv_id": "2107.03374", 424 "relevance": "Introduces HumanEval benchmark used for code generation evaluation." 425 }, 426 { 427 "title": "PICARD: Parsing Incrementally for Constrained Auto-Regressive Decoding from Language Models", 428 "authors": ["Torsten Scholak", "Nathan Schucher", "Dzmitry Bahdanau"], 429 "year": 2021, 430 "doi": "10.18653/v1/2021.emnlp-main.779", 431 "relevance": "Prior constrained decoding approach for SQL generation using beam search." 432 }, 433 { 434 "title": "Synchromesh: Reliable Code Generation from Pre-trained Language Models", 435 "authors": ["Gabriel Poesia", "Alex Polozov"], 436 "year": 2022, 437 "relevance": "Prior CFG-guided generation framework addressing token misalignment problem." 438 }, 439 { 440 "title": "Guiding LLMs The Right Way: Fast, Non-Invasive Constrained Generation", 441 "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"], 442 "year": 2024, 443 "arxiv_id": "2403.06988", 444 "relevance": "Domino framework for CFG-guided generation with precomputed prefix trees, compared algorithmically to SynCode." 445 }, 446 { 447 "title": "Prompting Is Programming: A Query Language for Large Language Models", 448 "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"], 449 "year": 2023, 450 "doi": "10.1145/3591300", 451 "relevance": "LMQL constrained generation framework supporting regex constraints." 
452 }, 453 { 454 "title": "Copiloting the Copilots: Fusing Large Language Models with Completion Engines for Automated Program Repair", 455 "authors": ["Yuxiang Wei", "Chunqiu Steven Xia", "Lingming Zhang"], 456 "year": 2023, 457 "doi": "10.1145/3611643.3616271", 458 "relevance": "Uses language-server suggestions to enforce semantic constraints during LLM code generation." 459 }, 460 { 461 "title": "Multi-lingual Evaluation of Code Generation Models", 462 "authors": ["Ben Athiwaratkun"], 463 "year": 2023, 464 "arxiv_id": "2210.14868", 465 "relevance": "Introduces MBXP multilingual code generation benchmark used in evaluation." 466 }, 467 { 468 "title": "Holistic Evaluation of Language Models", 469 "authors": ["Percy Liang"], 470 "year": 2023, 471 "arxiv_id": "2211.09110", 472 "relevance": "HELM benchmark suite highlighting LLM reliability challenges that motivate constrained generation." 473 }, 474 { 475 "title": "Constrained Decoding for Fill-in-the-Middle Code Language Models via Efficient Left and Right Quotienting of Context-Sensitive Grammars", 476 "authors": ["Daniel Melcer", "Nathan Fulton"], 477 "year": 2024, 478 "arxiv_id": "2402.17988", 479 "relevance": "Extends constrained decoding to fill-in-the-middle code generation beyond left-to-right generation." 480 } 481 ] 482 }