scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (19835B)
      1 {
      2   "paper": {
      3     "title": "Correctness-Guaranteed Code Generation via Constrained Decoding",
      4     "authors": ["Lingxiao Li", "Salar Rahili", "Yiwei Zhao"],
      5     "year": 2025,
      6     "venue": "COLM 2025",
      7     "arxiv_id": "2508.15866"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": false,
     14         "justification": "No repository URL or code archive is provided in the paper. The paper does not mention releasing source code."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": false,
     19         "justification": "The 8 talent category prompts and handcrafted examples are described in appendices but no downloadable dataset or data archive is provided."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions using 8 A100 GPUs and VLLM but does not provide a requirements.txt, Dockerfile, or detailed environment setup with library versions."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions, README, or runnable scripts are provided."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": false,
     36         "justification": "Results in Figure 3 show histograms of score distributions across 80 data points per method but no confidence intervals or error bars are reported."
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": false,
     41         "justification": "The paper claims its method achieves the highest success rate and quality but provides no statistical significance tests comparing methods."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Figure 3 provides score distributions and counts per bin (e.g., 53 in 80-100 for Ours-Reflection vs 49 for Claude, and 33 vs 38 in 90-100). Table 1 reports tokens/sec (7.64 vs 2.04 and 1.47), giving the magnitude of the speedup."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "8 prompts x 10 runs = 80 data points per method. No justification for why 8 prompts or 10 runs is sufficient."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No standard deviation, IQR, or spread measures are reported. Only histogram distributions are shown."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Baselines include unconstrained generation with Claude-3.5-Sonnet and Qwen2.5-32B-Coder, both with and without reflection (Section 6.2)."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Claude-3.5-Sonnet and Qwen2.5-32B-Coder are contemporary strong coding LLMs. Table 1 also compares against Willard & Louf (2023) and Koo et al. (2024) for speed."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The reflection variant serves as a partial ablation. The comparison between constrained and unconstrained with the same model (Qwen2.5-32B-Coder) isolates the constrained decoding contribution."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The paper reports success rate (whether generation terminates and produces valid code), quality scores (1-100 via LLM judge, with failed generations counted as 0), and inference speed (tokens/sec in Table 1)."
     79       },
     80       "human_evaluation": {
     81         "applies": true,
     82         "answer": false,
     83         "justification": "Quality evaluation is done entirely by Claude-3.5-Sonnet as an LLM judge. No human evaluation of generated code quality is reported."
     84       },
     85       "held_out_test_set": {
     86         "applies": true,
     87         "answer": false,
     88         "justification": "The 8 talent category prompts are used for evaluation but there is no separation into dev/test sets. The same prompts are used for all methods."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Figure 3 provides score distributions broken down by score ranges (0-19, 20-39, etc.) for each method. Appendix C.8 details the 8 individual prompts."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Section 6.2 and Appendix C.9-C.10 discuss failure cases: the method fails when generation does not terminate due to distribution distortion causing indefinite repetition."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper honestly reports that Algorithm 1 is not guaranteed to terminate (Section 6.1), that constrained decoding can distort distributions (Appendix C.9), and that Claude-3.5-Sonnet outperforms in the 90-100 score range (38 vs 33 data points)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "The abstract claims semantic correctness guarantees (supported by Theorem 5.1, 6.1), runtime correctness validation in a roguelike game (supported by DCI experiments in Section 6), and the constrained decoding algorithm with non-extensible property (formally defined and proven)."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The main causal claim is that constrained decoding improves correctness. This is supported by controlled comparison: same model (Qwen2.5-32B-Coder) with and without constrained decoding, isolating the variable."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "The paper is explicit that results are demonstrated through sLua (a custom language) and the DCI game. It states the framework is 'language-agnostic' but demonstrates only on sLua, and acknowledges limitations of the language restrictions."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "The paper discusses distribution distortion as an alternative explanation for failures (Appendix C.9), and acknowledges that Claude-3.5-Sonnet may produce higher quality code in the top range due to being a stronger base model."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "The paper uses 'Claude-3.5-Sonnet' and 'Qwen2.5-32B-Coder' without specific version snapshots or API dates. No snapshot date for Claude-3.5-Sonnet is given."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Appendix C.6-C.8 provide the agent workflow, system prompts, and the 8 talent category prompts used for evaluation. In-context examples are described in detail."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": false,
    142         "justification": "No temperature, top-p, or sampling parameters are reported for the LLM generation. The 1500 token limit for termination is stated but standard hyperparameters are missing."
    143       },
    144       "scaffolding_described": {
    145         "applies": true,
    146         "answer": true,
    147         "justification": "Algorithm 1 fully describes the constrained decoding scaffolding. Appendix C.6 describes the simple LLM agent used to guide category generation. The ToP framework is described in detail in Section 4."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "The paper documents how talent categories are structured (4 talents + up to 4 effects), how effects are generated first then talents, and how generated scripts are registered in the environment for future prompts (Section 6.2)."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": true,
    159         "justification": "The paper discusses limitations throughout: Algorithm 1 may not terminate (Section 6.1), distribution distortion issues (Appendix C.9), and language feature restrictions for runtime guarantees."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": true,
    164         "justification": "Specific threats are discussed: nontermination of generation due to distribution distortion (Appendix C.9), the tradeoff of expressiveness for safety by restricting language features, and the limitation to sLua rather than general languages."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": true,
    169         "justification": "The paper explicitly states it demonstrates on sLua only, that dynamic data structures and nil pointers are not supported, and that the runtime guarantee requires specific API design choices and language restrictions."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "No raw experimental data (individual scores, generated scripts, timing measurements) is made available for independent verification."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Section 6.2 describes the setup: 8 prompts, 10 runs each, scores from a Claude-3.5-Sonnet judge, and failures counted as 0. Table 1 describes the single-run timing measurement setup."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data consists of automated benchmark runs on programmatic tasks."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline is documented: prompts -> generation (with/without constrained decoding) -> LLM-judge scoring -> histogram aggregation. Failure handling (unsuccessful = score 0, 1500 token termination limit) is stated."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding statement is present. All authors are from Netflix but no explicit funding acknowledgment is provided."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All three authors are listed with Netflix affiliation on the first page."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "All authors are Netflix employees. Netflix could benefit from demonstrating effective code generation for game mechanics. No statement about funder independence from outcomes."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests or financial interests statement is provided in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": false,
    219         "answer": false,
    220         "justification": "The paper does not evaluate pre-trained model capability on a standard benchmark. It evaluates a constrained decoding method on a custom task (DCI game mechanics) that could not be in training data."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": false,
    224         "answer": false,
    225         "justification": "The evaluation task (generating sLua scripts for a custom game DCI) is novel and created by the authors, so train/test overlap is not a concern."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": false,
    229         "answer": false,
    230         "justification": "No standard benchmark is used. The evaluation is on a custom game scripting task created by the authors."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Table 1 reports tokens/sec for each method (18.89 unconstrained, 7.64 ours, 2.04 and 1.47 for alternatives) and compilation overhead percentages."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": true,
    279         "justification": "The paper states experiments were run on 8 A100 GPUs (Table 1 caption). Inference speed is benchmarked."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "The constrained decoding method generates semantically correct sLua programs that conform to prescribed scripting APIs.",
    286       "evidence": "Theorem 5.1 proves linear-time parsing; Theorem 6.1 proves runtime error-free execution for DCI scripts. Section 6.2 shows 0% syntax/semantic errors in successful generations.",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "The method with reflection achieves the highest success rate and quality scores compared to unconstrained baselines.",
    291       "evidence": "Figure 3: Ours-Reflection has 53 scores in the 80-100 range out of 80 runs; unconstrained Qwen has 49 and Claude has 49. However, Claude has 38 in the 90-100 range vs 33 for ours.",
    292       "supported": "moderate"
    293     },
    294     {
    295       "claim": "Adaptive rejection sampling is more than three times faster than alternative regex-compilation approaches for constrained decoding.",
    296       "evidence": "Table 1: 7.64 tokens/sec (ours) vs 2.04 (Willard & Louf) and 1.47 (Koo et al.), with only 4.98% compilation overhead vs 73-79%.",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Generated scripts are guaranteed to terminate and execute without runtime errors in the game engine.",
    301       "evidence": "Theorem 6.1, proved in Appendix C.5, combines safe callback patterns, bounded loop iterations, lexical scope function references, and capped recursion depth.",
    302       "supported": "strong"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval", "theoretical"],
    306   "key_findings": "The paper presents a constrained decoding algorithm using a Tree of Parsers (ToP) that generates semantically correct programs by incorporating context-sensitive parsing during LM token generation. Demonstrated on sLua (a typed Lua variant) for a roguelike game, the method achieves provable runtime correctness guarantees (Theorem 6.1) while maintaining competitive code quality. Adaptive rejection sampling provides a 3x+ speedup over alternative constrained decoding approaches. The main limitation is that the algorithm itself may not terminate due to distribution distortion in autoregressive generation.",
    307   "red_flags": [
    308     {
    309       "flag": "LLM-as-judge evaluation",
    310       "detail": "Code quality is evaluated solely by Claude-3.5-Sonnet scoring 1-100. No human evaluation of generated code quality is performed, and the reliability of this LLM judge is not validated."
    311     },
    312     {
    313       "flag": "Small evaluation scale",
    314       "detail": "Only 8 talent category prompts are used for evaluation, each run 10 times. This is a narrow evaluation setting for claims about correctness-guaranteed code generation."
    315     },
    316     {
    317       "flag": "No code or data release",
    318       "detail": "Despite being a systems paper with a concrete implementation (sLua parser, DCI game, constrained decoding algorithm), no source code or data is released for reproduction."
    319     }
    320   ],
    321   "cited_papers": [
    322     {
    323       "title": "Monitor-guided decoding of code LMs with static analysis of repository context",
    324       "authors": ["Lakshya A Agrawal", "Aditya Kanade", "Navin Goyal", "Shuvendu Lahiri", "Sriram Rajamani"],
    325       "year": 2023,
    326       "relevance": "Prior work on constrained decoding for code with semantic correctness via static analysis."
    327     },
    328     {
    329       "title": "Synchromesh: Reliable code generation from pre-trained language models",
    330       "authors": ["Gabriel Poesia", "Oleksandr Polozov", "Vu Le", "Ashish Tiwari", "Gustavo Soares", "Christopher Meek", "Sumit Gulwani"],
    331       "year": 2022,
    332       "arxiv_id": "2201.11227",
    333       "relevance": "Pioneered completion engine approach for semantic constrained decoding in code generation."
    334     },
    335     {
    336       "title": "Type-constrained code generation with language models",
    337       "authors": ["Niels Mundler", "Jingxuan He", "Hao Wang", "Koushik Sen", "Dawn Song", "Martin Vechev"],
    338       "year": 2025,
    339       "relevance": "Concurrent work on type-constrained decoding using prefix automata to reduce compilation errors."
    340     },
    341     {
    342       "title": "Efficient guided generation for LLMs",
    343       "authors": ["Brandon T Willard", "Remi Louf"],
    344       "year": 2023,
    345       "arxiv_id": "2307.09702",
    346       "relevance": "Foundational work on regex/CFG-based constrained decoding for LLMs, used as speed baseline."
    347     },
    348     {
    349       "title": "Automata-based constraints for language model decoding",
    350       "authors": ["Terry Koo", "Frederick Liu", "Luheng He"],
    351       "year": 2024,
    352       "arxiv_id": "2407.08103",
    353       "relevance": "Automata-based constrained decoding approach using finite-state transducers, used as speed baseline."
    354     },
    355     {
    356       "title": "XGrammar: Flexible and efficient structured generation engine for large language models",
    357       "authors": ["Yixin Dong", "Charlie F Ruan", "Yaxing Cai", "Ruihang Lai", "Ziyi Xu", "Yilong Zhao", "Tianqi Chen"],
    358       "year": 2024,
    359       "arxiv_id": "2411.15100",
    360       "relevance": "Structured generation engine for LLMs supporting CFG constraints."
    361     },
    362     {
    363       "title": "Improving LLM code generation with grammar augmentation",
    364       "authors": ["Shubham Ugare", "Tarun Suresh", "Hangoo Kang", "Sasa Misailovic", "Gagandeep Singh"],
    365       "year": 2024,
    366       "relevance": "Grammar-augmented approach to improve LLM code generation correctness."
    367     },
    368     {
    369       "title": "Grammar-aligned decoding",
    370       "authors": ["Kanghee Park", "Jiayu Wang", "Taylor Berg-Kirkpatrick", "Nadia Polikarpova", "Loris D'Antoni"],
    371       "year": 2024,
    372       "arxiv_id": "2405.21047",
    373       "relevance": "Discusses distribution distortion in grammar-constrained decoding, a key challenge addressed in this paper."
    374     },
    375     {
    376       "title": "GitHub Copilot AI pair programmer: Asset or liability?",
    377       "authors": ["Arghavan Moradi Dakhel", "Vahid Majdinasab", "Amin Nikanjam", "Foutse Khomh", "Michel C Desmarais", "Zhen Ming Jack Jiang"],
    378       "year": 2023,
    379       "relevance": "Evaluation of AI-assisted code generation tools relevant to the survey scope."
    380     }
    381   ]
    382 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs