scan.json (28092B)
1 { 2 "paper": { 3 "title": "AdapTrack: Constrained Decoding without Distorting LLM's Output Intent", 4 "authors": [ 5 "Yongmin Li", 6 "Jia Li", 7 "Ge Li", 8 "Zhi Jin" 9 ], 10 "year": 2025, 11 "venue": "arXiv preprint (Conference acronym 'XX' placeholder indicates unpublished/preprint)", 12 "arxiv_id": "2510.17376" 13 }, 14 "checklist": { 15 "artifacts": { 16 "code_released": { 17 "applies": true, 18 "answer": false, 19 "justification": "Section 6.3 states 'We open-source the modified package together with our source code' but NO repository URL, GitHub link, or archive link is provided anywhere in the paper. The schema requires 'a working URL or archive.' A claim of open-source release without a verifiable link is NO." 20 }, 21 "data_released": { 22 "applies": true, 23 "answer": false, 24 "justification": "The paper's primary novel datasets (TFv1 with 419 APIs, TFv1Real with 1,000 sampled files) are constructed by the authors but no download link or URL is provided. While HumanEval and MBPP are publicly available, the key custom datasets that represent the paper's contribution are not verifiably released." 25 }, 26 "environment_specified": { 27 "applies": true, 28 "answer": false, 29 "justification": "The paper names libraries (PyTorch, Transformers, transformers-CFG) and hardware (64-core Intel Xeon, 8 NVIDIA RTX A6000 GPUs) but provides no library version numbers, no requirements.txt, no Dockerfile, and no environment setup section with specific dependency versions. This is insufficient to recreate the experimental environment." 30 }, 31 "reproduction_instructions": { 32 "applies": true, 33 "answer": false, 34 "justification": "The paper describes the algorithm and mentions open-sourcing code, but does not provide step-by-step instructions for reproducing the experimental results. No README commands, scripts, or reproduction guide are described in the paper." 
35 } 36 }, 37 "statistical_methodology": { 38 "confidence_intervals_or_error_bars": { 39 "applies": true, 40 "answer": false, 41 "justification": "All results in Tables 1, 2, 3, and 4 are point estimates only (e.g., '80.19%', '49.30%'). No confidence intervals, error bars, or standard deviations are reported for any main result." 42 }, 43 "significance_tests": { 44 "applies": true, 45 "answer": false, 46 "justification": "The paper makes numerous comparative claims ('AdapTrack performs significantly better than constrained decoding') but provides no statistical significance tests (no p-values, t-tests, or similar). Comparisons are made by directly comparing point estimates." 47 }, 48 "effect_sizes_reported": { 49 "applies": true, 50 "answer": true, 51 "justification": "The paper reports percentage improvements with baseline context (e.g., 'improvement of 360.87% compared to constrained decoding, from 10.98% to 50.60%'), which provides enough context to assess effect magnitudes." 52 }, 53 "sample_size_justified": { 54 "applies": true, 55 "answer": false, 56 "justification": "The TFv1Real dataset uses 1,000 randomly picked files (from 12,883 available), and 20 random samples per problem are collected, but no justification is given for why these sample sizes are sufficient to support the claims. No power analysis is mentioned." 57 }, 58 "variance_reported": { 59 "applies": true, 60 "answer": false, 61 "justification": "Tables 1, 2, 3, and 4 report single-point results only. No standard deviation, variance, or confidence intervals are reported across runs. Table 5 reports avg/min/max for LM call counts but not for the performance metrics." 62 } 63 }, 64 "evaluation_design": { 65 "baselines_included": { 66 "applies": true, 67 "answer": true, 68 "justification": "The paper compares AdapTrack against two direct baselines: unconstrained decoding and constrained decoding (Section 6.1). 
For RQ4, it also compares against ASAp (Adaptive Sampling with Approximate Expected Futures)." 69 }, 70 "baselines_contemporary": { 71 "applies": true, 72 "answer": true, 73 "justification": "ASAp (Park et al., 2024) is a recent and directly competing method. The other baselines (unconstrained and constrained decoding) are appropriate ablation-style comparisons, not just old methods. The paper acknowledges ASAp as the closest prior work." 74 }, 75 "ablation_study": { 76 "applies": true, 77 "answer": true, 78 "justification": "Section 7.2.2 and Figure 7 conduct experiments with limited backtracking distances (0, 1, 2, 4, 8), showing how performance changes as a key component (backtrack distance) is varied. The robustness experiments on temperature and model size also serve as ablation-like analyses." 79 }, 80 "multiple_metrics": { 81 "applies": true, 82 "answer": true, 83 "justification": "The paper reports EM@k metrics (k=1,3,5,10,20) for API completion tasks and pass@k for general code generation, plus KL divergence for distribution alignment — at least three distinct metric types." 84 }, 85 "human_evaluation": { 86 "applies": false, 87 "answer": false, 88 "justification": "This is a decoding algorithm evaluated on automated code correctness benchmarks (exact match, pass@k). Human evaluation is not relevant to claims about whether generated code satisfies API constraints." 89 }, 90 "held_out_test_set": { 91 "applies": true, 92 "answer": true, 93 "justification": "TFv1Real was randomly sampled from 12,883 files (Section 6.2), and the paper uses established held-out benchmarks HumanEval and MBPP. The TFv1 dataset is synthetic but constructed specifically for evaluation, not for method tuning." 
94 }, 95 "per_category_breakdown": { 96 "applies": true, 97 "answer": true, 98 "justification": "Results are broken down by model (four different code LLMs), by dataset (TFv1 v1 setting, TFv1 v2 setting, TFv1Real, HumanEval, MBPP), and by DSL subdataset (SLIA, INV-BV, CP, binary). Table 4 provides per-dataset KL divergence." 99 }, 100 "failure_cases_discussed": { 101 "applies": true, 102 "answer": true, 103 "justification": "Section 6.3 discusses cases where AdapTrack performs worse than unconstrained decoding (e.g., 'pass@1 of StarCoder2 7B and CodeLlama 7B on MBPP'), attributing it to the constrainer's limitations. Section 6.4 discusses INV-BV where all methods perform similarly." 104 }, 105 "negative_results_reported": { 106 "applies": true, 107 "answer": true, 108 "justification": "The paper explicitly notes that AdapTrack sometimes performs worse than unconstrained decoding (Table 3, pass@1 for StarCoder2 7B and CodeLlama 7B on MBPP), and explains the failure case. The outlier in pass@20 of CodeLlama 7B on HumanEval is also noted." 109 } 110 }, 111 "claims_and_evidence": { 112 "abstract_claims_supported": { 113 "applies": true, 114 "answer": true, 115 "justification": "The abstract claims improvements of up to 360.87% on TFv1, 38.93% on TFv1Real, 7.84% on HumanEval, and 6.42% on MBPP over constrained decoding. These specific numbers are confirmed in Tables 1, 2, and 3. The theoretical proof is presented in Section 5." 116 }, 117 "causal_claims_justified": { 118 "applies": true, 119 "answer": true, 120 "justification": "The main causal claim is that AdapTrack improves performance by avoiding distribution distortion. The paper supports this causally via: (1) a mathematical proof that AdapTrack follows the model's true distribution; (2) controlled experiments where only the decoding method changes while all other variables are held constant; (3) distribution alignment experiments (RQ4) showing lower KL divergence." 
121 }, 122 "generalization_bounded": { 123 "applies": true, 124 "answer": false, 125 "justification": "The title 'Constrained Decoding without Distorting LLM's Output Intent' and abstract frame the contribution broadly for 'language model-based code generation.' However, experiments are conducted only on 7B base models (with 0.5B-32B robustness check), only in Python and TypeScript, and only with specific constrainer types. The paper does not explicitly bound its claims to these settings. The schema says 'If the paper tests on Python and claims results for code generation generally, NO.'" 126 }, 127 "alternative_explanations_discussed": { 128 "applies": true, 129 "answer": true, 130 "justification": "Section 7.3 discusses two specific threats to validity: (1) whether the problem is specific to deprecated APIs only (mitigated by RQ1 TFv1 v1-setting and RQ3 experiments); (2) whether the problem is model-capability-dependent (mitigated by using multiple models). These are specific alternative explanations, not generic boilerplate." 131 } 132 }, 133 "setup_transparency": { 134 "model_versions_specified": { 135 "applies": true, 136 "answer": true, 137 "justification": "Specific model versions are named: 'Qwen2.5 Coder 7B', 'DeepSeek Coder Base 6.7B', 'StarCoder2 7B', 'CodeLlama Python 7B', 'Mistral-7B' for RQ4. These are specific enough to identify the models used." 138 }, 139 "prompts_provided": { 140 "applies": true, 141 "answer": false, 142 "justification": "For the RQ4 DSL experiments, prompts are described as containing 'a grammar and specification' plus '3 in-context examples in the form of (Problem, Solution) pairs', but the actual prompt text is not provided. For other experiments, the code context format is shown (Figure 3) but the full prompt is not given." 143 }, 144 "hyperparameters_reported": { 145 "applies": true, 146 "answer": true, 147 "justification": "Temperature is specified as 1 for main experiments (Section 7.1.1). 
Section 7.1.1 also specifies the range of temperatures tested (0.1 to 1.0). The top-p variant used in RQ3 is described. A timeout of 60 seconds per token validity check is specified." 148 }, 149 "scaffolding_described": { 150 "applies": false, 151 "answer": false, 152 "justification": "AdapTrack is a decoding algorithm, not an agentic scaffold. There is no agentic scaffolding in this work — the method operates directly at the token generation level without tool use, memory management, or agent loops." 153 }, 154 "data_preprocessing_documented": { 155 "applies": true, 156 "answer": true, 157 "justification": "Section 6.2 describes the full data pipeline for TFv1Real with explicit steps and counts: 46,785 Python files collected, deduplicated and split, files containing tf.compat.v1 in prefix removed, non-API-call suffixes removed → 14,237 files; then filtered by context length >3,584 tokens → 12,883 files; then randomly sampled 1,000 files. Each filtering criterion is explicitly stated." 158 } 159 }, 160 "limitations_and_scope": { 161 "limitations_section_present": { 162 "applies": true, 163 "answer": true, 164 "justification": "Section 7.3 'Threats to validity' discusses limitations, including dataset selection threats (7.3.1) and model selection threats (7.3.2). This is a dedicated subsection with substantive discussion." 165 }, 166 "threats_to_validity_specific": { 167 "applies": true, 168 "answer": true, 169 "justification": "Section 7.3.1 specifically addresses whether the problem is only present in deprecated APIs, and 7.3.2 addresses whether the improvement is model-specific. These are specific threats to this study, not generic disclaimers. However, the threats section itself does not discuss efficiency costs (these are addressed elsewhere in the paper)." 170 }, 171 "scope_boundaries_stated": { 172 "applies": true, 173 "answer": false, 174 "justification": "While threats to validity are discussed, the paper does not explicitly state what the results do NOT show. 
For example, there is no discussion of whether results transfer to instruction-tuned models (only base models are tested), or to languages beyond Python/TypeScript. The discussion of limitations is focused on rebutting threats rather than clearly bounding what was not tested." 175 } 176 }, 177 "data_integrity": { 178 "raw_data_available": { 179 "applies": true, 180 "answer": false, 181 "justification": "The paper claims open-source release but provides no URL or link to datasets in the text. The raw TFv1Real dataset (12,883 filtered GitHub files, then 1,000 sampled) is not independently verifiable from the paper alone." 182 }, 183 "data_collection_described": { 184 "applies": true, 185 "answer": true, 186 "justification": "TFv1 dataset construction is described in detail (Section 6.1): collected all legal TensorFlow 2.16 APIs, filtered for tf.compat.v1 prefix, excluded APIs with v2 counterparts, retained only lowercase-starting APIs, removed prefix-able APIs, yielding 419 APIs. TFv1Real collection process is similarly detailed (Section 6.2)." 187 }, 188 "recruitment_methods_described": { 189 "applies": false, 190 "answer": false, 191 "justification": "This paper has no human participants. Data is collected from GitHub repositories (public data) and official TensorFlow API documentation, not from recruited participants." 192 }, 193 "data_pipeline_documented": { 194 "applies": true, 195 "answer": true, 196 "justification": "The full data pipeline from collection to final analysis is documented for both datasets (Section 6.1 and 6.2) with explicit counts and filtering criteria at each stage." 197 } 198 }, 199 "conflicts_of_interest": { 200 "funding_disclosed": { 201 "applies": true, 202 "answer": true, 203 "justification": "Acknowledgments section discloses funding: 'National Key R&D Program under Grant No. 2023YFB4503801, the National Natural Science Foundation of China under Grant No. 
62192733, 62192730, 62192731, and the Major Program (JD) of Hubei Province (No.2023BAA024).'" 204 }, 205 "affiliations_disclosed": { 206 "applies": true, 207 "answer": true, 208 "justification": "Author affiliations are listed on the first page: Yongmin Li and Ge Li and Zhi Jin are at Peking University (Key Lab of High Confidence Software Technology); Jia Li is at Tsinghua University College of AI." 209 }, 210 "funder_independent_of_outcome": { 211 "applies": true, 212 "answer": true, 213 "justification": "Funding is from Chinese government research programs (National Key R&D Program, NSFC) which have no financial stake in whether constrained decoding or AdapTrack performs better. Funders are independent of the outcome." 214 }, 215 "financial_interests_declared": { 216 "applies": true, 217 "answer": false, 218 "justification": "No competing interests statement is present in the paper. Absence of disclosure is not the same as absence of conflict; the paper does not include a standard competing interests declaration." 219 } 220 }, 221 "contamination": { 222 "training_cutoff_stated": { 223 "applies": true, 224 "answer": false, 225 "justification": "The paper uses HumanEval and MBPP benchmarks with base LLMs (Qwen2.5 Coder, DeepSeek Coder, StarCoder2, CodeLlama) but does not state the training data cutoff dates for any of these models." 226 }, 227 "train_test_overlap_discussed": { 228 "applies": true, 229 "answer": false, 230 "justification": "The paper uses HumanEval (2021) and MBPP (2021) with models trained after these benchmarks were published. No discussion of whether these benchmarks appear in the training data is provided." 231 }, 232 "benchmark_contamination_addressed": { 233 "applies": true, 234 "answer": false, 235 "justification": "HumanEval and MBPP were both published in 2021. All models used (Qwen2.5, DeepSeek, StarCoder2, CodeLlama) were trained after 2021 and may have seen these benchmarks in training. 
The paper does not address this contamination risk." 236 } 237 }, 238 "human_studies": { 239 "pre_registered": { 240 "applies": false, 241 "answer": false, 242 "justification": "No human participants are involved in this study. It is a benchmark evaluation of a decoding algorithm." 243 }, 244 "irb_or_ethics_approval": { 245 "applies": false, 246 "answer": false, 247 "justification": "No human participants are involved in this study. IRB approval is not applicable." 248 }, 249 "demographics_reported": { 250 "applies": false, 251 "answer": false, 252 "justification": "No human participants are involved in this study." 253 }, 254 "inclusion_exclusion_criteria": { 255 "applies": false, 256 "answer": false, 257 "justification": "No human participants are involved in this study." 258 }, 259 "randomization_described": { 260 "applies": false, 261 "answer": false, 262 "justification": "No human participants are involved in this study." 263 }, 264 "blinding_described": { 265 "applies": false, 266 "answer": false, 267 "justification": "No human participants are involved in this study." 268 }, 269 "attrition_reported": { 270 "applies": false, 271 "answer": false, 272 "justification": "No human participants are involved in this study." 273 } 274 }, 275 "cost_and_practicality": { 276 "inference_cost_reported": { 277 "applies": true, 278 "answer": true, 279 "justification": "Table 5 reports the number of language model calls per sample (avg, min, max) for AdapTrack vs. baselines, showing AdapTrack requires approximately 20-60% more LM calls. This is a meaningful proxy for inference cost." 280 }, 281 "compute_budget_stated": { 282 "applies": true, 283 "answer": false, 284 "justification": "The hardware (8 NVIDIA RTX A6000 GPUs) is specified but total GPU hours, wall-clock time for experiments, or total API cost is not reported. The paper notes efficiency concerns but does not quantify the total compute budget." 
285 } 286 } 287 }, 288 "claims": [ 289 { 290 "claim": "AdapTrack achieves up to 360.87% improvement over constrained decoding on the synthetic TFv1 API completion dataset.", 291 "evidence": "Table 1 (Section 6.1): DeepSeek Coder Base 6.7B EM@1 in the TensorFlow v2 setting improves from 10.98% (constrained decoding) to 50.60% (AdapTrack), which is a 360.87% relative improvement.", 292 "supported": "strong" 293 }, 294 { 295 "claim": "AdapTrack achieves up to 38.93% improvement over constrained decoding on the real-world TFv1Real API completion dataset.", 296 "evidence": "Table 2 (Section 6.2): StarCoder2 7B EM@1 improves from 29.80% to 41.40%, a relative improvement of 38.93%. Results shown for 4 models.", 297 "supported": "strong" 298 }, 299 { 300 "claim": "AdapTrack achieves up to 7.84% improvement on HumanEval and 6.42% on MBPP over constrained decoding.", 301 "evidence": "Table 3 (Section 6.3): Qwen2.5 Coder 7B pass@1 on HumanEval improves from 64.15% to 69.18% (7.84% relative improvement). The MBPP improvement found is from 66.67% to 70.77% (6.15% relative), which does not match the claimed 6.42%; the 6.42% figure presumably comes from another model/metric combination.", 302 "supported": "moderate" 303 }, 304 { 305 "claim": "AdapTrack's distribution aligns better with the oracle language model distribution than constrained decoding or ASAp, as measured by KL divergence.", 306 "evidence": "Table 4 (Section 6.4): AdapTrack achieves average KL divergence of 9.97 vs. 11.46 for constrained decoding and 11.90 for ASAp. 
Results shown across 4 DSL datasets using Mistral-7B.", 307 "supported": "strong" 308 }, 309 { 310 "claim": "AdapTrack is robust to temperature variations (0.1 to 1.0) and model sizes (0.5B to 32B), consistently outperforming constrained decoding.", 311 "evidence": "Figures 5 and 6 (Section 7.1): Temperature analysis with CodeLlama and Qwen2.5; model size analysis with Qwen2.5 Coder 0.5B-32B series on TFv1 and TFv1Real.", 312 "supported": "strong" 313 }, 314 { 315 "claim": "The distribution produced by AdapTrack is mathematically identical to the language model's distribution conditioned on constraints.", 316 "evidence": "Section 5 provides Theorems 5.1 and 5.2 with formal proofs. The proof shows that P_Q(s) = (1/Q[epsilon]) * P(s), meaning the algorithm samples in proportion to the unconstrained model probability.", 317 "supported": "strong" 318 } 319 ], 320 "methodology_tags": [ 321 "benchmark-eval", 322 "theoretical" 323 ], 324 "key_findings": "AdapTrack addresses a fundamental flaw in constrained decoding where greedily eliminating invalid tokens distorts the language model's probability distribution, causing it to generate suboptimal valid outputs instead of the intended ones. By incorporating rejection-sampling-based adaptive backtracking, AdapTrack maintains a distribution that is provably proportional to the unconstrained model distribution while satisfying constraints. 
Empirically, AdapTrack achieves dramatic improvements over naive constrained decoding on API completion tasks (up to 360.87% relative improvement) and more modest improvements on general code generation benchmarks (up to 7.84% on HumanEval), at a compute cost of approximately 20-60% more LM calls.", 325 "red_flags": [ 326 { 327 "flag": "No statistical significance testing", 328 "detail": "Despite making comparative claims ('AdapTrack performs significantly better') across multiple tables, the paper reports only point estimates with no confidence intervals, standard deviations, or significance tests. The word 'significantly' is used colloquially rather than statistically." 329 }, 330 { 331 "flag": "Benchmark contamination not addressed", 332 "detail": "HumanEval and MBPP were published in 2021. All four models evaluated (Qwen2.5 Coder, DeepSeek Coder, StarCoder2, CodeLlama) were trained after 2021 and likely saw these benchmarks in training. This is particularly important because the baseline (unconstrained decoding) could be inflated by memorization, affecting the relative improvement numbers." 333 }, 334 { 335 "flag": "No URL provided for open-source release", 336 "detail": "The paper claims to open-source code and the modified constrainer package, but no URL is provided in the paper text. This makes the claimed release unverifiable from the paper alone." 337 }, 338 { 339 "flag": "Venue placeholder in header", 340 "detail": "The paper header contains 'Conference acronym XX, June 03-05, 2018, Woodstock, NY' — a template placeholder indicating this is an unsubmitted or preprint version. As a result, there is no evidence that the paper has undergone peer review." 
341 } 342 ], 343 "cited_papers": [ 344 { 345 "title": "Evaluating large language models trained on code", 346 "authors": [ 347 "Mark Chen", 348 "Jerry Tworek", 349 "Heewoo Jun" 350 ], 351 "year": 2021, 352 "arxiv_id": "2107.03374", 353 "relevance": "Introduces HumanEval benchmark and pass@k metric, used as evaluation benchmark in this paper." 354 }, 355 { 356 "title": "Grammar-aligned decoding", 357 "authors": [ 358 "Kanghee Park", 359 "Jiayu Wang", 360 "Taylor Berg-Kirkpatrick", 361 "Nadia Polikarpova", 362 "Loris D'Antoni" 363 ], 364 "year": 2024, 365 "relevance": "Introduces ASAp (Adaptive Sampling with Approximate Expected Futures), the main competing method for fixing constrained decoding distribution distortion." 366 }, 367 { 368 "title": "Monitor-guided decoding of code lms with static analysis of repository context", 369 "authors": [ 370 "Lakshya A Agrawal", 371 "Aditya Kanade", 372 "Navin Goyal", 373 "Shuvendu Lahiri", 374 "Sriram Rajamani" 375 ], 376 "year": 2023, 377 "relevance": "Introduces constrained decoding using static analysis tools, directly related to the problem AdapTrack solves." 378 }, 379 { 380 "title": "Copiloting the copilots: Fusing large language models with completion engines for automated program repair", 381 "authors": [ 382 "Yuxiang Wei", 383 "Chunqiu Steven Xia", 384 "Lingming Zhang" 385 ], 386 "year": 2023, 387 "relevance": "Repilot introduces constrained decoding for general-purpose code generation using IDE language servers, closely related to this work." 388 }, 389 { 390 "title": "Type-Constrained Code Generation with Language Models", 391 "authors": [ 392 "Niels Mündler", 393 "Jingxuan He", 394 "Hao Wang", 395 "Koushik Sen", 396 "Dawn Song", 397 "Martin Vechev" 398 ], 399 "year": 2025, 400 "relevance": "Introduces type-system-based constrained decoding used as the constrainer in RQ3 experiments of AdapTrack." 
401 }, 402 { 403 "title": "SWE-bench: Can Language Models Resolve Real-world Github Issues?", 404 "authors": [ 405 "Carlos E Jimenez", 406 "John Yang", 407 "Alexander Wettig", 408 "Shunyu Yao", 409 "Kexin Pei", 410 "Ofir Press", 411 "Karthik R Narasimhan" 412 ], 413 "year": 2024, 414 "relevance": "Major benchmark for evaluating LLM coding capabilities on real software engineering tasks, cited as context for code generation tools." 415 }, 416 { 417 "title": "DeepSeek-Coder: When the Large Language Model Meets Programming", 418 "authors": [ 419 "Daya Guo", 420 "Qihao Zhu", 421 "Dejian Yang" 422 ], 423 "year": 2024, 424 "arxiv_id": "2401.14196", 425 "relevance": "One of the four code LLMs evaluated in the paper; relevant to AI code generation capability." 426 }, 427 { 428 "title": "Synchromesh: Reliable Code Generation from Pre-trained Language Models", 429 "authors": [ 430 "Gabriel Poesia", 431 "Alex Polozov", 432 "Vu Le", 433 "Ashish Tiwari", 434 "Gustavo Soares", 435 "Christopher Meek", 436 "Sumit Gulwani" 437 ], 438 "year": 2022, 439 "relevance": "Introduces incremental parsing into constrained code generation, a key baseline method." 440 }, 441 { 442 "title": "LLMs Meet Library Evolution: Evaluating Deprecated API Usage in LLM-based Code Completion", 443 "authors": [ 444 "Chong Wang", 445 "Kaifeng Huang", 446 "Jian Zhang", 447 "Yebo Feng", 448 "Lyuye Zhang", 449 "Yang Liu", 450 "Xin Peng" 451 ], 452 "year": 2025, 453 "arxiv_id": "2406.09834", 454 "relevance": "Directly evaluates the problem of deprecated API usage in LLM-based code completion, the core motivation for AdapTrack." 455 }, 456 { 457 "title": "VersiCode: Towards Version-controllable Code Generation", 458 "authors": [ 459 "Tongtong Wu", 460 "Weigang Wu", 461 "Xingyu Wang" 462 ], 463 "year": 2024, 464 "arxiv_id": "2406.07411", 465 "relevance": "Focuses on version-specific code completion and migration, directly related to the API versioning problem addressed by AdapTrack." 
466 }, 467 { 468 "title": "Program synthesis with large language models", 469 "authors": [ 470 "Jacob Austin", 471 "Augustus Odena", 472 "Maxwell Nye", 473 "Maarten Bosma" 474 ], 475 "year": 2021, 476 "arxiv_id": "2108.07732", 477 "relevance": "Introduces MBPP benchmark used in evaluation; relevant to LLM code generation capability assessment." 478 }, 479 { 480 "title": "ROCODE: Integrating Backtracking Mechanism and Program Analysis in Large Language Models for Code Generation", 481 "authors": [ 482 "Xue Jiang", 483 "Yihong Dong", 484 "Yongding Tao", 485 "Huanyu Liu", 486 "Zhi Jin", 487 "Ge Li" 488 ], 489 "year": 2025, 490 "relevance": "Related work on backtracking in LLM code generation using static analysis, distinct from AdapTrack's distribution-correction approach." 491 }, 492 { 493 "title": "Controllable Generation via Locally Constrained Resampling", 494 "authors": [ 495 "Kareem Ahmed", 496 "Kai-Wei Chang", 497 "Guy Van den Broeck" 498 ], 499 "year": 2025, 500 "relevance": "Gen-C models constraints with constraint circuits and resamples to fix greedy constrained decoding; directly related competing approach." 501 } 502 ] 503 }