scan.json (26275B)
1 { 2 "paper": { 3 "title": "ChopChop: A Programmable Framework for Semantically Constraining the Output of Language Models", 4 "authors": ["Shaan Nagy", "Timothy Zhou", "Nadia Polikarpova", "Loris D'Antoni"], 5 "year": 2026, 6 "venue": "Proc. ACM Program. Lang. (POPL)", 7 "arxiv_id": "2509.00360", 8 "doi": "10.1145/3776708" 9 }, 10 "checklist": { 11 "artifacts": { 12 "code_released": { 13 "applies": true, 14 "answer": true, 15 "justification": "Section 'Data-Availability Statement' states: 'The code for this paper is available on GitHub. An artifact containing a Docker image is available on Zenodo [33].' Reference [33] points to a Zenodo artifact with DOI 10.5281/zenodo.17337963." 16 }, 17 "data_released": { 18 "applies": true, 19 "answer": true, 20 "justification": "The benchmarks are fully enumerated in the appendix: 10 equivalence-guided decoding benchmarks are listed in Section A.4, and 72 MBPP TypeScript tasks from MultiPL-E (a public dataset) are listed by name in Section A.3.2. The Zenodo artifact also contains these." 21 }, 22 "environment_specified": { 23 "applies": true, 24 "answer": true, 25 "justification": "The Zenodo artifact [33] contains a Docker image, which provides a fully reproducible environment. Hardware is specified in Section 6.1: 'Supermicro SYS-4029GP-TRT with two Intel(R) Xeon(R) Gold 6230 CPUs, 384 GB RAM, a 4 TB SSD, and eight Nvidia Geforce RTX 2080Ti GPUs.'" 26 }, 27 "reproduction_instructions": { 28 "applies": true, 29 "answer": true, 30 "justification": "A Docker image artifact is provided on Zenodo [33], which is designed for reproducibility. The full set of prompts, benchmarks, and rewrite rules are provided in the appendix (Sections A.3.2, A.4)." 31 } 32 }, 33 "statistical_methodology": { 34 "confidence_intervals_or_error_bars": { 35 "applies": true, 36 "answer": false, 37 "justification": "Tables 1 and 2 report only point estimates (counts of successful generations and mean overhead in ms/token). No confidence intervals, error bars, or uncertainty measures are reported." 38 }, 39 "significance_tests": { 40 "applies": true, 41 "answer": false, 42 "justification": "The paper makes comparative claims (e.g., 'improving the performance of DeepSeek-Coder-6.7b, CodeLlama-7B, and CodeLlama-13B by an average of 28.0%, 10.1%, and 23.2%') but no statistical significance tests are reported." 43 }, 44 "effect_sizes_reported": { 45 "applies": true, 46 "answer": true, 47 "justification": "The paper reports percentage improvements with baseline context: '28.0%, 10.1%, and 23.2% across all temperatures' (Section 6.2.1). Table 1 provides raw counts for all conditions, allowing readers to compute exact effect sizes." 48 }, 49 "sample_size_justified": { 50 "applies": true, 51 "answer": false, 52 "justification": "The equivalence benchmark has only 10 tasks and the TypeScript benchmark uses 72/809 tasks from MBPP. No justification is given for why 10 equivalence tasks are sufficient, and the choice of 72 TypeScript tasks is driven by their language fragment, not by power analysis." 53 }, 54 "variance_reported": { 55 "applies": true, 56 "answer": false, 57 "justification": "Each benchmark/model/temperature combination appears to be a single run. No variance, standard deviation, or multiple-run results are reported. Table 1 shows single-run counts only." 58 } 59 }, 60 "evaluation_design": { 61 "baselines_included": { 62 "applies": true, 63 "answer": true, 64 "justification": "Section 6.1 defines two baselines: unconstrained decoding and grammar-constrained decoding (GCD). For TypeScript, the tool by Mündler et al. [32] is also compared." 65 }, 66 "baselines_contemporary": { 67 "applies": true, 68 "answer": true, 69 "justification": "Mündler et al. [32] (2025) is a recent contemporary baseline for type-constrained decoding. Grammar-constrained decoding is the standard approach in the field. These are appropriate and competitive baselines." 70 }, 71 "ablation_study": { 72 "applies": true, 73 "answer": true, 74 "justification": "The comparison of unconstrained vs. grammar-constrained vs. semantic-constrained decoding effectively serves as an ablation, showing the contribution of syntactic constraints alone vs. adding semantic constraints. The 'No Delimit' vs. 'Delimit' variants and the 'Rules + No Delimit' vs. 'Rules + Delimit' variants (Table 3) further isolate prompt effects." 75 }, 76 "multiple_metrics": { 77 "applies": true, 78 "answer": true, 79 "justification": "The paper reports both effectiveness (success rate/count of correct programs, Table 1) and overhead (ms/token latency, Table 2), as well as the distribution of tokens tried before finding a realizable one (Figure 12)." 80 }, 81 "human_evaluation": { 82 "applies": true, 83 "answer": true, 84 "justification": "Section 6.2.1 states: 'we manually check the unconstrained and syntax-constrained data for false negatives (programs that could be considered equivalent, but are not proved equivalent by our rewrite rules). We only found 8 such instances out of the 600 total programs.' Section 6.2.2 provides qualitative evaluation of generated outputs." 85 }, 86 "held_out_test_set": { 87 "applies": false, 88 "answer": false, 89 "justification": "This paper does not train or tune any model. It evaluates a decoding framework on fixed benchmarks, so train/test splitting is not applicable." 90 }, 91 "per_category_breakdown": { 92 "applies": true, 93 "answer": true, 94 "justification": "Table 1 breaks results down by model (3 models), temperature (5 settings), benchmark type (equivalence No Delimit, equivalence Delimit, TypeScript), and decoding strategy. Additional breakdowns appear in Table 3 (with rewrite rules in prompt)." 95 }, 96 "failure_cases_discussed": { 97 "applies": true, 98 "answer": true, 99 "justification": "Section 6.2.2 discusses failure modes: natural language parsed as valid grammar (Figure 11a), inequivalent programs (Figure 11b), const mutation errors (Figure 11c), and cases where 'the LM is typically unable to recover because it has already committed to a solution.' The paper also discusses timeouts from deeply nested arithmetic expressions." 100 }, 101 "negative_results_reported": { 102 "applies": true, 103 "answer": true, 104 "justification": "The paper reports that grammar-constrained decoding performs worse than unconstrained generation for TypeScript (Section 6.2.1). It also reports that Mündler et al. [32] outperforms ChopChop at medium and high temperatures for TypeScript due to supporting a larger language fragment. Performance degradation at high temperatures is also reported." 105 } 106 }, 107 "claims_and_evidence": { 108 "abstract_claims_supported": { 109 "applies": true, 110 "answer": true, 111 "justification": "The abstract claims ChopChop 'improves success rates while maintaining practical decoding latency,' which is supported by Table 1 (improved success rates) and Table 2 (overhead of tens to hundreds of ms/token, compared to 81 ms baseline for unconstrained CodeLlama-13B)." 112 }, 113 "causal_claims_justified": { 114 "applies": true, 115 "answer": true, 116 "justification": "The causal claims are of the form 'applying semantic constrained decoding improves success rates.' This is justified by controlled single-variable manipulation: the only difference between conditions is the decoding strategy, with the same model, temperature, benchmarks, and prompts held constant." 117 }, 118 "generalization_bounded": { 119 "applies": true, 120 "answer": true, 121 "justification": "The paper is careful to scope its claims. It specifies the exact language fragments (a toy functional language and a subset of TypeScript), the specific models tested (DeepSeek-Coder-6.7b, CodeLlama-7B, CodeLlama-13B), and acknowledges limitations such as the smaller TypeScript fragment compared to Mündler et al." 122 }, 123 "alternative_explanations_discussed": { 124 "applies": true, 125 "answer": true, 126 "justification": "Section 6.2.1 discusses alternative explanations for baselines' poor performance: the toy language being out-of-distribution, models emitting natural language, and the rewrite rules not capturing all equivalences (only 8 false negatives out of 600). Section 6.2.1 also discusses why GCD underperforms unconstrained decoding for TypeScript (language fragment limitation)." 127 } 128 }, 129 "setup_transparency": { 130 "model_versions_specified": { 131 "applies": true, 132 "answer": true, 133 "justification": "Section 6.1 specifies: 'DeepSeek-Coder-6.7b, CodeLlama-7B, and CodeLlama-13B' with 'instruction-tuned versions.' These are specific model names with sizes. The paper notes these are open-weight models run locally, so API versioning is not applicable." 134 }, 135 "prompts_provided": { 136 "applies": true, 137 "answer": true, 138 "justification": "Full prompt text is provided in Section A.3.2 (TypeScript context) and Section A.4 (equivalence context, including the code examples). The exact benchmark inputs (function signatures for TypeScript, programs for equivalence) are also listed." 139 }, 140 "hyperparameters_reported": { 141 "applies": true, 142 "answer": true, 143 "justification": "Section 6.1 specifies five sampling temperatures (0.01, 0.3, 0.5, 0.7, 1.0) and a 150-second timeout. The naive sampling strategy (backtrack by one token on rejection) is described. The paper states these are 'instruction-tuned versions' of the models." 144 }, 145 "scaffolding_described": { 146 "applies": false, 147 "answer": false, 148 "justification": "ChopChop is not an agentic scaffolding system. It is a constrained decoding framework that operates at the token generation level. There are no agents, tools, retry loops, or feedback mechanisms beyond the constrained decoding algorithm itself, which is thoroughly described." 149 }, 150 "data_preprocessing_documented": { 151 "applies": true, 152 "answer": true, 153 "justification": "Section 6.1 describes how benchmarks were selected: 72/809 MBPP tasks that can be solved in the TypeScript language fragment. The prompting strategy is documented. For equivalence, 10 benchmark tasks were manually created and listed in full." 154 } 155 }, 156 "limitations_and_scope": { 157 "limitations_section_present": { 158 "applies": true, 159 "answer": true, 160 "justification": "Section 8 (Conclusion) contains substantive discussion of limitations: users must write pruners as corecursive functions that return regular spaces, the implementation is 'relatively unoptimized, incurring noticeable per-token overhead,' and the type pruner uses bounded depth-3 types sacrificing soundness for deeply nested types (Section A.3.1)." 161 }, 162 "threats_to_validity_specific": { 163 "applies": true, 164 "answer": true, 165 "justification": "The paper discusses specific threats: the toy language being out-of-distribution (Section 6.2.1), the TypeScript fragment being smaller than Mündler et al.'s (Section 6.2.1), equivalence being approximate via rewrite rules (Section 6.2.1, 8 false negatives found), constrained decoding skewing LM token distributions (Section 6.2.2), and type bounding sacrificing soundness (Section A.3.1)." 166 }, 167 "scope_boundaries_stated": { 168 "applies": true, 169 "answer": true, 170 "justification": "The paper explicitly states it tests on small-to-medium models (6.7B-13B), uses only a subset of TypeScript, and its equivalence checker is approximate. Section 8 notes that 'users of ChopChop must directly define a pruner... as a corecursive function on program spaces' and that this function 'must return a space that is regular,' limiting applicability." 171 } 172 }, 173 "data_integrity": { 174 "raw_data_available": { 175 "applies": true, 176 "answer": true, 177 "justification": "The Zenodo artifact [33] contains a Docker image for reproducing the experiments. The full benchmark lists are in the appendix. This enables independent verification of results." 178 }, 179 "data_collection_described": { 180 "applies": true, 181 "answer": true, 182 "justification": "The equivalence benchmarks are 10 manually created tasks (listed in Section A.4). The TypeScript benchmarks are 72 tasks from MBPP/MultiPL-E, selected by filtering for tasks solvable in the supported language fragment (Section 6.1)." 183 }, 184 "recruitment_methods_described": { 185 "applies": false, 186 "answer": false, 187 "justification": "No human participants were involved. The data consists of standard benchmarks (MBPP/MultiPL-E) and manually created tasks." 188 }, 189 "data_pipeline_documented": { 190 "applies": true, 191 "answer": true, 192 "justification": "Section 6.1 describes the pipeline: for each benchmark/model/decoder/temperature, run constrained decoding until END token or 150s timeout; a run is successful if a complete program is emitted that satisfies semantic constraints. The TypeScript task selection (72/809 from MBPP) is documented." 193 } 194 }, 195 "conflicts_of_interest": { 196 "funding_disclosed": { 197 "applies": true, 198 "answer": true, 199 "justification": "The Acknowledgments section lists: Microsoft Faculty Fellowship, UCSD JSOE Scholarship, NSF grants CCF-2422214, CCF-2506134, CCF-2446711, and NSF Graduate Research Fellowship DGE-2038238." 200 }, 201 "affiliations_disclosed": { 202 "applies": true, 203 "answer": true, 204 "justification": "All authors are from UC San Diego. The paper explicitly discloses: 'Loris D'Antoni holds concurrent appointments as a Professor at the University of California San Diego and as an Amazon Scholar. This paper describes work performed at the University of California San Diego and is not associated with Amazon.'" 205 }, 206 "funder_independent_of_outcome": { 207 "applies": true, 208 "answer": true, 209 "justification": "Funding is from NSF and Microsoft Faculty Fellowship. The paper does not evaluate any Microsoft or NSF product. The Amazon affiliation is explicitly stated as not associated with this work. The funders have no stake in the specific results." 210 }, 211 "financial_interests_declared": { 212 "applies": true, 213 "answer": false, 214 "justification": "No competing interests or financial disclosure statement is present. The Amazon Scholar affiliation is disclosed but there is no explicit competing interests declaration." 215 } 216 }, 217 "contamination": { 218 "training_cutoff_stated": { 219 "applies": true, 220 "answer": false, 221 "justification": "The paper uses DeepSeek-Coder-6.7b, CodeLlama-7B, and CodeLlama-13B but does not state their training data cutoff dates. This is relevant because the MBPP benchmark and MultiPL-E could be in the training data." 222 }, 223 "train_test_overlap_discussed": { 224 "applies": true, 225 "answer": false, 226 "justification": "No discussion of whether the MBPP/MultiPL-E benchmark tasks appeared in the training data of the models used. MBPP was published in 2021, predating all models used." 227 }, 228 "benchmark_contamination_addressed": { 229 "applies": true, 230 "answer": false, 231 "justification": "MBPP (2021) and MultiPL-E (2022) were both published before the training cutoffs of all three models. The paper does not discuss contamination risk for the TypeScript benchmark tasks. The 10 equivalence tasks appear to be novel (manually created), which would mitigate contamination for that benchmark." 232 } 233 }, 234 "human_studies": { 235 "pre_registered": { 236 "applies": false, 237 "answer": false, 238 "justification": "No human participants in this study. It evaluates automated systems on benchmarks." 239 }, 240 "irb_or_ethics_approval": { 241 "applies": false, 242 "answer": false, 243 "justification": "No human participants in this study." 244 }, 245 "demographics_reported": { 246 "applies": false, 247 "answer": false, 248 "justification": "No human participants in this study." 249 }, 250 "inclusion_exclusion_criteria": { 251 "applies": false, 252 "answer": false, 253 "justification": "No human participants in this study." 254 }, 255 "randomization_described": { 256 "applies": false, 257 "answer": false, 258 "justification": "No human participants in this study." 259 }, 260 "blinding_described": { 261 "applies": false, 262 "answer": false, 263 "justification": "No human participants in this study." 264 }, 265 "attrition_reported": { 266 "applies": false, 267 "answer": false, 268 "justification": "No human participants in this study." 269 } 270 }, 271 "cost_and_practicality": { 272 "inference_cost_reported": { 273 "applies": true, 274 "answer": true, 275 "justification": "Table 2 reports overhead in milliseconds per produced token for all model/benchmark/temperature combinations. Section 6.3 also reports that 'CodeLlama-13B takes on average 81 ms to produce a token when unconstrained' for reference, and that '96.4% of the tokens produced during semantic constrained decoding for TypeScript pass on the first try.'" 276 }, 277 "compute_budget_stated": { 278 "applies": true, 279 "answer": true, 280 "justification": "Hardware is fully specified (Section 6.1): Supermicro SYS-4029GP-TRT with 2x Xeon Gold 6230 CPUs, 384GB RAM, 8x RTX 2080Ti GPUs. Per-token overhead is reported in Table 2. The 150-second timeout per run is stated." 281 } 282 } 283 }, 284 "claims": [ 285 { 286 "claim": "ChopChop improves success rates of language models for generating semantically valid programs compared to unconstrained and grammar-constrained decoding.", 287 "evidence": "Table 1 shows semantic constrained decoding consistently outperforms baselines across models and temperatures. For equivalence-guided decoding (No Delimit), ChopChop achieves 33-43 successes vs. 0 for baselines. For TypeScript, it improves by 28.0%, 10.1%, and 23.2% for the three models (Section 6.2.1).", 288 "supported": "strong" 289 }, 290 { 291 "claim": "ChopChop maintains practical decoding latency with small overhead per token.", 292 "evidence": "Table 2 reports overhead ranging from tens to a few hundred milliseconds per token. Section 6.3 notes CodeLlama-13B takes 81ms per token unconstrained, and 96.4% of tokens pass semantic checking on the first try.", 293 "supported": "strong" 294 }, 295 { 296 "claim": "Semantic constrained decoding is sound: any output program satisfies the user-provided semantic constraints.", 297 "evidence": "Theorem 3.3 proves soundness of semantic constrained decoding. The proof in Appendix A.1 shows that the algorithm only returns outputs that pass the constraint check on Line 6 of Algorithm 1.", 298 "supported": "strong" 299 }, 300 { 301 "claim": "Semantic constrained decoding is complete up to top-p and top-k sampling when the realizability checker is complete.", 302 "evidence": "Theorem 3.4 and proof in Appendix A.1 establish this formally.", 303 "supported": "strong" 304 }, 305 { 306 "claim": "Grammar-constrained decoding alone performs worse than unconstrained generation for TypeScript.", 307 "evidence": "Table 1 shows GCD yields 126, 289, 236 total successes vs. unconstrained's 214, 308, 259 for the three models. Section 6.2.1 attributes this to limiting the grammar to a small TypeScript fragment that prevents otherwise compilable programs.", 308 "supported": "strong" 309 }, 310 { 311 "claim": "Mündler et al. performs slightly better than ChopChop at medium temperatures and significantly better at high temperatures for TypeScript.", 312 "evidence": "Table 1 shows Mündler et al. achieving 337, 351, 347 total vs. ChopChop's 274, 339, 319 for the three models. Section 6.2.1 attributes this to Mündler et al. supporting a larger language fragment.", 313 "supported": "strong" 314 } 315 ], 316 "methodology_tags": ["benchmark-eval", "theoretical"], 317 "key_findings": "ChopChop introduces the first programmable framework for semantic constrained decoding of language models, allowing users to define constraints over abstract syntax trees rather than string representations. Across two case studies (equivalence-guided decoding and type-safe TypeScript generation) with three small-to-medium LMs at five temperatures, ChopChop consistently outperforms unconstrained and grammar-only baselines, with especially dramatic gains for equivalence-guided decoding where baselines score near zero. The overhead is practical (tens to hundreds of ms/token) and the theoretical framework provides formal soundness and completeness guarantees.", 318 "red_flags": [ 319 { 320 "flag": "Very small equivalence benchmark", 321 "detail": "The equivalence-guided decoding evaluation uses only 10 manually created benchmark tasks. While the dramatic improvement (0 vs. 33-45 successes) is convincing directionally, 10 tasks is very small for drawing robust quantitative conclusions." 322 }, 323 { 324 "flag": "No variance or repeated runs", 325 "detail": "Each benchmark/model/temperature combination appears to be evaluated once. For stochastic decoding (temperatures > 0.01), results could vary across runs. No standard deviations or confidence intervals are reported." 326 }, 327 { 328 "flag": "Benchmark contamination not addressed", 329 "detail": "The TypeScript benchmarks come from MBPP (2021) and MultiPL-E (2022), which predate all three models' training. Contamination could inflate baseline performance (making ChopChop's improvement look smaller) or affect the nature of errors. This is not discussed." 330 }, 331 { 332 "flag": "Only small/medium models tested", 333 "detail": "All models are 6.7B-13B parameters. It is unclear whether the improvements would hold for larger, more capable models that may already produce semantically valid code more frequently without constraints." 334 } 335 ], 336 "cited_papers": [ 337 { 338 "title": "Monitor-guided decoding of code LMs with static analysis of repository context", 339 "authors": ["Lakshya A Agrawal", "Aditya Kanade", "Navin Goyal", "Shuvendu K. Lahiri", "Sriram K. Rajamani"], 340 "year": 2023, 341 "doi": "10.5555/3666122.3667523", 342 "relevance": "Proposes the monitor-based framework for constrained decoding with static analysis, a direct predecessor to ChopChop's semantic constrained decoding." 343 }, 344 { 345 "title": "Type-Constrained Code Generation with Language Models", 346 "authors": ["Niels Mündler", "Jingxuan He", "Hao Wang", "Koushik Sen", "Dawn Song", "Martin Vechev"], 347 "year": 2025, 348 "doi": "10.1145/3729274", 349 "relevance": "Implements type-constrained decoding for TypeScript, serving as a direct baseline comparison for ChopChop's type-safe decoding case study." 350 }, 351 { 352 "title": "Grammar-aligned decoding", 353 "authors": ["Kanghee Park", "Jiayu Wang", "Taylor Berg-Kirkpatrick", "Nadia Polikarpova", "Loris D'Antoni"], 354 "year": 2025, 355 "doi": "10.52202/079017-0774", 356 "relevance": "Addresses the token distribution skewing problem in grammar-constrained decoding, directly relevant to improving constrained decoding quality." 357 }, 358 { 359 "title": "Flexible and Efficient Grammar-Constrained Decoding", 360 "authors": ["Kanghee Park", "Timothy Zhou", "Loris D'Antoni"], 361 "year": 2025, 362 "arxiv_id": "2502.05111", 363 "relevance": "Presents efficient grammar-constrained decoding techniques that could be integrated with ChopChop as a preprocessing step." 364 }, 365 { 366 "title": "SynCode: LLM Generation with Grammar Augmentation", 367 "authors": ["Shubham Ugare", "Tarun Suresh", "Hangoo Kang", "Sasa Misailovic", "Gagandeep Singh"], 368 "year": 2025, 369 "relevance": "Grammar-augmented LLM generation framework, representative of the grammar-constrained decoding baseline approach." 370 }, 371 { 372 "title": "Grammar-Constrained Decoding for Structured NLP Tasks without Finetuning", 373 "authors": ["Saibo Geng", "Martin Josifoski", "Maxime Peyrard", "Robert West"], 374 "year": 2023, 375 "doi": "10.18653/v1/2023.emnlp-main.674", 376 "relevance": "Early work on grammar-constrained decoding for structured outputs, foundational to the constrained decoding field." 377 }, 378 { 379 "title": "Program Synthesis with Large Language Models", 380 "authors": ["Jacob Austin", "Augustus Odena", "Maxwell Nye"], 381 "year": 2021, 382 "arxiv_id": "2108.07732", 383 "relevance": "Introduces the MBPP benchmark used in ChopChop's TypeScript evaluation." 384 }, 385 { 386 "title": "MultiPL-E: A Scalable and Extensible Approach to Benchmarking Neural Code Generation", 387 "authors": ["Federico Cassano", "John Gouwar", "Daniel Nguyen"], 388 "year": 2022, 389 "arxiv_id": "2208.08227", 390 "relevance": "Provides the TypeScript translations of MBPP tasks used as benchmarks in ChopChop's type-safe decoding evaluation." 391 }, 392 { 393 "title": "Efficient Guided Generation for Large Language Models", 394 "authors": ["Brandon T. Willard", "Rémi Louf"], 395 "year": 2023, 396 "arxiv_id": "2307.09702", 397 "relevance": "Foundational work on efficient constrained generation for LLMs, part of the grammar-constrained decoding literature." 398 }, 399 { 400 "title": "Synchromesh: Reliable code generation from pre-trained language models", 401 "authors": ["Gabriel Poesia", "Oleksandr Polozov", "Vu Le", "Ashish Tiwari", "Gustavo Soares", "Christopher Meek", "Sumit Gulwani"], 402 "year": 2022, 403 "arxiv_id": "2201.11227", 404 "relevance": "Proposes completion engines for reliable code generation from LLMs, a framework closely related to ChopChop's constrained decoding approach." 405 } 406 ] 407 }