scan.json (19172B)
1 { 2 "paper": { 3 "title": "Constrained Decoding for Fill-in-the-Middle Code Language Models via Efficient Left and Right Quotienting of Context-Sensitive Grammars", 4 "authors": ["Daniel Melcer", "Nathan Fulton", "Sanjay Krishna Gouda", "Haifeng Qian"], 5 "year": 2024, 6 "venue": "arXiv", 7 "arxiv_id": "2402.17988" 8 }, 9 "checklist": { 10 "artifacts": { 11 "code_released": { 12 "applies": true, 13 "answer": false, 14 "justification": "Footnote 5 states 'Link omitted for review; see supplemental material.' No working URL is provided in the paper." 15 }, 16 "data_released": { 17 "applies": true, 18 "answer": true, 19 "justification": "The paper uses the publicly available The Stack dataset ('the-stack-smol-xl') and states 'We include the code and random seeds necessary to exactly reproduce both datasets in our supplemental material' (Section VII-A)." 20 }, 21 "environment_specified": { 22 "applies": true, 23 "answer": false, 24 "justification": "No requirements.txt, Dockerfile, or detailed environment specification is provided. The paper mentions implementation is 'largely in Python, with selected subroutines written in Rust' but gives no version details." 25 }, 26 "reproduction_instructions": { 27 "applies": true, 28 "answer": false, 29 "justification": "While random seeds and dataset construction details are described, there are no step-by-step reproduction instructions, README, or scripts to replicate the experiments. The code link is omitted." 30 } 31 }, 32 "statistical_methodology": { 33 "confidence_intervals_or_error_bars": { 34 "applies": true, 35 "answer": false, 36 "justification": "Results in Tables I and II are reported as raw counts with no confidence intervals or error bars." 37 }, 38 "significance_tests": { 39 "applies": true, 40 "answer": false, 41 "justification": "The paper claims constrained generation 'performs significantly better' but provides no statistical significance tests — only raw count comparisons." 
42 }, 43 "effect_sizes_reported": { 44 "applies": true, 45 "answer": true, 46 "justification": "Tables I and II provide absolute counts and the full confusion matrix, allowing effect sizes to be computed (e.g., constrained succeeds on 90665/95390 vs unconstrained on 65353/95390 for STACK-BOUNDARY)." 47 }, 48 "sample_size_justified": { 49 "applies": true, 50 "answer": false, 51 "justification": "The sample sizes (95390 experiments from 9539 files, 10 per file) are described but not justified via power analysis or other rationale." 52 }, 53 "variance_reported": { 54 "applies": true, 55 "answer": false, 56 "justification": "No variance, standard deviation, or spread measures are reported. Results appear to be single-run experiments with greedy sampling." 57 } 58 }, 59 "evaluation_design": { 60 "baselines_included": { 61 "applies": true, 62 "answer": true, 63 "justification": "The paper compares against unconstrained generation and checked unconstrained generation as baselines (Tables I and II)." 64 }, 65 "baselines_contemporary": { 66 "applies": true, 67 "answer": false, 68 "justification": "The paper does not compare against other constrained decoding methods for code (e.g., Synchromesh, grammar-aligned decoding [31], or Outlines [11/18]). Only unconstrained generation baselines are used." 69 }, 70 "ablation_study": { 71 "applies": true, 72 "answer": false, 73 "justification": "No ablation study is provided to isolate the contribution of individual components (e.g., lexer branching, indentation handling, parentheses handling)." 74 }, 75 "multiple_metrics": { 76 "applies": true, 77 "answer": false, 78 "justification": "The only metric is syntactic correctness (whether ast.parse succeeds). No functional correctness, code quality, or other metrics are reported." 79 }, 80 "human_evaluation": { 81 "applies": true, 82 "answer": false, 83 "justification": "No human evaluation of generated code quality is included. Evaluation is entirely automated via ast.parse." 
84 }, 85 "held_out_test_set": { 86 "applies": true, 87 "answer": true, 88 "justification": "The evaluation datasets are constructed from 'the-stack-smol-xl', a subset of The Stack that is used here only for evaluation. Note that SantaCoder was itself trained on The Stack, so these files are not guaranteed to be disjoint from the model's training data; the paper does not discuss this overlap." 89 }, 90 "per_category_breakdown": { 91 "applies": true, 92 "answer": true, 93 "justification": "Results are broken down by dataset (STACK-BOUNDARY vs STACK-RANDSPAN) and failure cases are categorized (29 parser issues vs 4696 model failures in Section VII-C1)." 94 }, 95 "failure_cases_discussed": { 96 "applies": true, 97 "answer": true, 98 "justification": "Section VII-C1 provides detailed failure analysis distinguishing between parser-related failures (29 cases) and model failures to connect to right context (4696 cases)." 99 }, 100 "negative_results_reported": { 101 "applies": true, 102 "answer": true, 103 "justification": "The paper reports 4725 failure cases for STACK-BOUNDARY and discusses the soundness-completeness tradeoff (Section VI-B) and cases where the method accepts invalid programs (Figure 10)." 104 } 105 }, 106 "claims_and_evidence": { 107 "abstract_claims_supported": { 108 "applies": true, 109 "answer": true, 110 "justification": "The abstract claims constrained generation 'can significantly reduce the incidence of syntax errors,' which is supported by Tables I and II showing large reductions (e.g., from 30037 failures to 4725 on STACK-BOUNDARY)." 111 }, 112 "causal_claims_justified": { 113 "applies": true, 114 "answer": true, 115 "justification": "The causal claim is that constrained decoding reduces syntax errors. The experimental design (same model, same inputs, constrained vs unconstrained) is a controlled comparison adequate for this claim." 
116 }, 117 "generalization_bounded": { 118 "applies": true, 119 "answer": true, 120 "justification": "The paper bounds claims to Python 3 and SantaCoder, describing the implementation as a 'proof-of-concept' and noting it evaluates 'the particularly difficult case of FIM completion for Python 3.'" 121 }, 122 "alternative_explanations_discussed": { 123 "applies": true, 124 "answer": true, 125 "justification": "Section VII-C discusses why STACK-RANDSPAN performs better (less text removed despite harder contexts), and the failure analysis considers multiple causes for failures (parser limitations vs model limitations)." 126 } 127 }, 128 "setup_transparency": { 129 "model_versions_specified": { 130 "applies": true, 131 "answer": true, 132 "justification": "The paper specifies 'SantaCoder' [30] which is a specific, versioned open-source model with a clear reference." 133 }, 134 "prompts_provided": { 135 "applies": true, 136 "answer": true, 137 "justification": "The FIM prompt format is fully specified in Section I (FIM-PREFIX, FIM-SUFFIX, FIM-MIDDLE tokens with exact concatenation order)." 138 }, 139 "hyperparameters_reported": { 140 "applies": true, 141 "answer": true, 142 "justification": "Section VII-B reports greedy sampling, 500 token limit, and top-50 candidate fallback strategy." 143 }, 144 "scaffolding_described": { 145 "applies": false, 146 "answer": false, 147 "justification": "No agentic scaffolding is used. This is a single-pass constrained decoding system." 148 }, 149 "data_preprocessing_documented": { 150 "applies": true, 151 "answer": true, 152 "justification": "Section VII-A documents preprocessing: 459 files excluded for ast.parse errors, 2 files excluded for unimplemented features, leaving 9539 files. Dataset construction procedures are detailed for both STACK-BOUNDARY and STACK-RANDSPAN." 
153 } 154 }, 155 "limitations_and_scope": { 156 "limitations_section_present": { 157 "applies": true, 158 "answer": true, 159 "justification": "Section VIII (Future Work) and Section VI-B (Completeness-Soundness-Complexity Tradeoff) substantively discuss limitations. Section VII-D acknowledges the prototype nature of the implementation." 160 }, 161 "threats_to_validity_specific": { 162 "applies": true, 163 "answer": true, 164 "justification": "The paper discusses specific threats: the parser is not sound (Figure 10 shows concrete invalid programs accepted), implementation is a research prototype with Python performance limitations, and Python's grammar is specified as PEG so exact CFG conversion may be impossible (footnote 6)." 165 }, 166 "scope_boundaries_stated": { 167 "applies": true, 168 "answer": true, 169 "justification": "The paper explicitly states scope boundaries: 'more complete code generation systems, and evaluations for systems that include metrics of context escape, are out of scope for this paper' (Section VIII-A). It also notes this is a proof-of-concept for Python 3 only." 170 } 171 }, 172 "data_integrity": { 173 "raw_data_available": { 174 "applies": true, 175 "answer": true, 176 "justification": "The base dataset (The Stack) is publicly available, and the paper states random seeds for dataset construction are included in supplemental material." 177 }, 178 "data_collection_described": { 179 "applies": true, 180 "answer": true, 181 "justification": "Section VII-A describes the data source (the-stack-smol-xl, 10000 Python files from GitHub), filtering criteria, and how both synthetic datasets were constructed." 182 }, 183 "recruitment_methods_described": { 184 "applies": false, 185 "answer": false, 186 "justification": "No human participants. Data is from a standard public benchmark (The Stack)." 
187 }, 188 "data_pipeline_documented": { 189 "applies": true, 190 "answer": true, 191 "justification": "The pipeline is documented: 10000 files → exclude 459 parse errors → exclude 2 unimplemented features → 9539 files → 10 experiments per file → 95390 experiments. Both dataset construction methods are specified." 192 } 193 }, 194 "conflicts_of_interest": { 195 "funding_disclosed": { 196 "applies": true, 197 "answer": false, 198 "justification": "No funding or acknowledgments section is present in the paper." 199 }, 200 "affiliations_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Author affiliations are clearly listed: Northeastern University, MIT-IBM AI Lab, and AWS AI Labs. Two authors are from AWS, which produces Amazon Q Developer (mentioned in the introduction)." 204 }, 205 "funder_independent_of_outcome": { 206 "applies": true, 207 "answer": false, 208 "justification": "Two authors are from AWS AI Labs, and the paper's techniques are directly relevant to Amazon Q Developer (cited in the introduction). AWS has a financial interest in improved code completion." 209 }, 210 "financial_interests_declared": { 211 "applies": true, 212 "answer": false, 213 "justification": "No competing interests or financial interests statement is present in the paper." 214 } 215 }, 216 "contamination": { 217 "training_cutoff_stated": { 218 "applies": true, 219 "answer": false, 220 "justification": "The paper does not state SantaCoder's training data cutoff date." 221 }, 222 "train_test_overlap_discussed": { 223 "applies": true, 224 "answer": false, 225 "justification": "No discussion of whether the-stack-smol-xl files overlap with SantaCoder's training data, despite both being sourced from The Stack." 226 }, 227 "benchmark_contamination_addressed": { 228 "applies": true, 229 "answer": false, 230 "justification": "SantaCoder was trained on The Stack, and the evaluation uses a subset of The Stack. This potential contamination is not addressed." 
231 } 232 }, 233 "human_studies": { 234 "pre_registered": { 235 "applies": false, 236 "answer": false, 237 "justification": "No human participants." 238 }, 239 "irb_or_ethics_approval": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants." 243 }, 244 "demographics_reported": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants." 248 }, 249 "inclusion_exclusion_criteria": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants." 253 }, 254 "randomization_described": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants." 258 }, 259 "blinding_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants." 263 }, 264 "attrition_reported": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants." 268 } 269 }, 270 "cost_and_practicality": { 271 "inference_cost_reported": { 272 "applies": true, 273 "answer": true, 274 "justification": "Section VII-D reports per-token overhead timing (with regression equations) and one-time overhead for constrained generation, comparing against checked unconstrained generation." 275 }, 276 "compute_budget_stated": { 277 "applies": true, 278 "answer": false, 279 "justification": "No total compute budget, GPU hours, or hardware specifications are stated for running the experiments." 280 } 281 } 282 }, 283 "claims": [ 284 { 285 "claim": "Constrained generation significantly reduces syntax errors compared to unconstrained generation for FIM tasks.", 286 "evidence": "Table I: constrained succeeds on 90665/95390 (95.0%) vs unconstrained on 65353/95390 (68.5%) for STACK-BOUNDARY. 
Table II: 92085/95390 (96.5%) vs 68590/95390 (71.9%) for STACK-RANDSPAN.", 287 "supported": "strong" 288 }, 289 { 290 "claim": "The constrained generation method has near-constant per-token overhead independent of context size.", 291 "evidence": "Figure 11 (top) shows R² = 4.22×10⁻³ for constrained generation overhead vs context size, compared to R² = 0.544 for checked unconstrained generation.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "The method handles context-sensitive features of real programming languages including whitespace sensitivity and leftmost-longest lexing.", 296 "evidence": "Sections V and VI detail the algorithms. The evaluation on Python 3 (which has whitespace sensitivity) demonstrates practical handling, though Section VI-B acknowledges soundness tradeoffs.", 297 "supported": "moderate" 298 }, 299 { 300 "claim": "Checked unconstrained generation fixes some cases but constrained generation still outperforms it.", 301 "evidence": "Table I: checked unconstrained recovers 5110 additional cases beyond unconstrained, but constrained succeeds on 490 cases where checked unconstrained fails.", 302 "supported": "strong" 303 } 304 ], 305 "methodology_tags": ["benchmark-eval"], 306 "key_findings": "The paper extends the Earley parsing algorithm to support left and right quotienting for context-sensitive grammars, enabling constrained fill-in-the-middle code generation. On two synthetic Python datasets derived from The Stack, constrained generation with SantaCoder achieves 95-96.5% syntactic correctness compared to 68.5-71.9% for unconstrained generation. The method introduces near-constant per-token overhead independent of context size, unlike checked unconstrained generation which scales linearly. 
Most failures (4696/4725 on STACK-BOUNDARY) are due to the model failing to connect to the right context rather than parser limitations.", 307 "red_flags": [ 308 { 309 "flag": "Potential train-test contamination", 310 "detail": "SantaCoder was trained on The Stack, and the evaluation uses a subset of The Stack (the-stack-smol-xl). The paper does not discuss whether evaluation files appeared in training data." 311 }, 312 { 313 "flag": "No comparison with other constrained decoding methods", 314 "detail": "The paper only compares against unconstrained baselines. Other constrained decoding approaches for code (grammar-aligned decoding, Outlines/Synchromesh) are not compared against, making it unclear if the improvement comes from the quotienting approach specifically or constrained decoding generally." 315 }, 316 { 317 "flag": "AWS conflict of interest undisclosed", 318 "detail": "Two authors are from AWS AI Labs, and the paper's techniques are directly applicable to Amazon Q Developer (cited in the introduction). No conflict of interest statement is provided." 319 } 320 ], 321 "cited_papers": [ 322 { 323 "title": "StarCoder: May the source be with you!", 324 "authors": ["R. Li", "L. B. Allal"], 325 "year": 2023, 326 "relevance": "Major open-source code LLM used widely in code generation benchmarks." 327 }, 328 { 329 "title": "Evaluating Large Language Models Trained on Code", 330 "authors": ["M. Chen", "J. Tworek"], 331 "year": 2021, 332 "relevance": "Introduced Codex and HumanEval benchmark, foundational to LLM code generation evaluation." 333 }, 334 { 335 "title": "Code Llama: Open Foundation Models for Code", 336 "authors": ["B. Rozière", "J. Gehring"], 337 "year": 2024, 338 "arxiv_id": "2308.12950", 339 "relevance": "Major open-source code LLM family with FIM capabilities." 340 }, 341 { 342 "title": "Efficient Guided Generation for Large Language Models", 343 "authors": ["B. T. Willard", "R. 
Louf"], 344 "year": 2023, 345 "relevance": "Key prior work on constrained decoding for LLMs using grammar-based sampling." 346 }, 347 { 348 "title": "Syntax-Aware On-the-Fly Code Completion", 349 "authors": ["W. Takerngsaksiri", "C. Tantithamthavorn", "Y.-F. Li"], 350 "year": 2023, 351 "relevance": "Prior work on syntax-aware constrained code completion." 352 }, 353 { 354 "title": "SantaCoder: Don't reach for the stars!", 355 "authors": ["L. B. Allal", "R. Li"], 356 "year": 2023, 357 "relevance": "The code LLM used in this paper's experiments for FIM evaluation." 358 }, 359 { 360 "title": "Grammar-aligned decoding", 361 "authors": ["K. Park", "J. Wang", "T. Berg-Kirkpatrick", "N. Polikarpova", "L. D'Antoni"], 362 "year": 2024, 363 "relevance": "Complementary constrained decoding technique that could be combined with this work." 364 }, 365 { 366 "title": "Efficient Training of Language Models to Fill in the Middle", 367 "authors": ["M. Bavarian", "H. Jun"], 368 "year": 2022, 369 "relevance": "Foundational work on FIM training for code LLMs, defining the FIM task format used in this paper." 370 }, 371 { 372 "title": "Deepseek-coder-v2: Breaking the barrier of closed-source models in code intelligence", 373 "year": 2024, 374 "arxiv_id": "2406.11931", 375 "relevance": "Major code LLM relevant to code generation capability evaluation." 376 } 377 ] 378 }