scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (20741B)
      1 {
      2   "paper": {
      3     "title": "Constrained Decoding of Diffusion LLMs with Context-Free Grammars",
      4     "authors": ["Niels Mündler", "Jasper Dekoninck", "Martin Vechev"],
      5     "year": 2025,
      6     "venue": "arXiv preprint",
      7     "arxiv_id": "2508.10111"
      8   },
      9   "checklist": {
     10     "artifacts": {
     11       "code_released": {
     12         "applies": true,
     13         "answer": true,
     14         "justification": "GitHub repository linked: https://github.com/eth-sri/constrained-diffusion and project page https://constrained-diffusion.ai provided in the paper header."
     15       },
     16       "data_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "The paper uses publicly available HumanEval-X dataset and describes synthetic benchmark generation procedures for JSON and SMILES. The code repository would contain these benchmarks. The base datasets (HumanEval-X, NousResearch json-mode-eval) are public."
     20       },
     21       "environment_specified": {
     22         "applies": true,
     23         "answer": false,
     24         "justification": "The paper mentions the implementation is ~7000 lines Python and ~5500 lines Rust but does not provide requirements.txt, Dockerfile, or detailed dependency/version specifications beyond mentioning RDKit."
     25       },
     26       "reproduction_instructions": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No step-by-step reproduction instructions are provided in the paper. The paper describes the method and experiments but does not include a 'Reproducing Results' section or specific commands to run."
     30       }
     31     },
     32     "statistical_methodology": {
     33       "confidence_intervals_or_error_bars": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Section 4.1 states: 'We compute confidence intervals at 95%, boldface the best method, and underline all methods over which the increase is not significant. The usual size of the confidence interval is 1% to 2%.'"
     37       },
     38       "significance_tests": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "The paper uses 95% confidence intervals to determine significance, underlining results where the improvement is not statistically significant. This serves as a significance testing mechanism."
     42       },
     43       "effect_sizes_reported": {
     44         "applies": true,
     45         "answer": true,
     46         "justification": "Absolute improvements are reported with baseline context throughout, e.g., 'an absolute increase of 5.2%, 22.5%, and 31.5% for 1-MRI, 2-MRI, and 3-MRI' and 'up to 7%' functional correctness improvement. Full baseline and method numbers in tables."
     47       },
     48       "sample_size_justified": {
     49         "applies": true,
     50         "answer": false,
     51         "justification": "No justification for using 164 HumanEval tasks, 272 JSON instances, or 167 SMILES instances. No power analysis discussed."
     52       },
     53       "variance_reported": {
     54         "applies": true,
     55         "answer": true,
     56         "justification": "Section 4.1 and Appendix D.2 state all methods were run 4 times with different seeds (0-4) and 95% confidence intervals are reported, indicating variance across runs."
     57       }
     58     },
     59     "evaluation_design": {
     60       "baselines_included": {
     61         "applies": true,
     62         "answer": true,
     63         "justification": "Vanilla (unconstrained) decoding is the primary baseline, and Con.- (constrained without completion sampling) serves as an ablation baseline. Results compared across 5 infilling models and 4 diffusion models."
     64       },
     65       "baselines_contemporary": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "The paper notes no prior work supports CFG constraining for DLMs or MRI. The comparison is against unconstrained decoding on state-of-the-art models (DREAM 7B, DiffuCoder 7B, LLaDA 8B, etc., all 2024-2025). Suresh et al. [47] (Dingo) is discussed but only handles regular languages, not CFGs."
     69       },
     70       "ablation_study": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "The paper ablates: Con.- vs Con. (with/without completion sampling), varying number of infilling regions (1-MRI, 2-MRI, 3-MRI), varying model sizes (1.3B to 33B), and Appendix D mentions ablation on diffusion steps."
     74       },
     75       "multiple_metrics": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Two main metrics: syntactic correctness (Syntax) and functional correctness (Functional/pass@1). Runtime overhead is also measured as a third metric."
     79       },
     80       "human_evaluation": {
     81         "applies": false,
     82         "answer": false,
     83         "justification": "Human evaluation is not relevant here — syntactic correctness is deterministic (grammar adherence) and functional correctness is measured via test cases."
     84       },
     85       "held_out_test_set": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "This is not a learning/training paper. The method is an inference-time algorithm applied to existing benchmarks; there is no train/dev/test split concern."
     89       },
     90       "per_category_breakdown": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Results broken down by model (5 infilling + 4 diffusion), by task (C++, JSON, SMILES), by number of infilling regions (1-MRI, 2-MRI, 3-MRI), and runtime per model/task."
     94       },
     95       "failure_cases_discussed": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "The paper discusses timeout failures, notes that 'only 0.7% of valid completions do not appear in the first 50 selected LLM proposals,' discusses SMILES functional correctness being very low (1.5% average), and mentions DreamCoder achieving only 19% syntactic correctness on C++ even with constraints."
     99       },
    100       "negative_results_reported": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "SMILES functional correctness shows negligible improvement (0.2% average increase). The paper reports cases where constrained decoding cannot help (model timeouts, poor functional quality despite syntactic validity)."
    104       }
    105     },
    106     "claims_and_evidence": {
    107       "abstract_claims_supported": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Abstract claims of 'near-perfect syntactic correctness' supported by Tables 1-2 (96-100% with completion), 'improving functional correctness' supported (up to 7%), and 'computational overhead remains practical' supported by runtime tables showing median 30-125% overhead."
    111       },
    112       "causal_claims_justified": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "The causal claim that constrained decoding improves correctness is justified by controlled single-variable manipulation: same models, same seeds, same benchmarks, with/without the constraining algorithm. The ablation Con.- vs Con. isolates the completion sampling component."
    116       },
    117       "generalization_bounded": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Claims are bounded to tested settings: specific models (named with sizes), specific tasks (C++, JSON, SMILES), specific benchmarks. The title specifies 'Context-Free Grammars' and the paper does not claim applicability beyond CFGs."
    121       },
    122       "alternative_explanations_discussed": {
    123         "applies": true,
    124         "answer": false,
    125         "justification": "No discussion of alternative explanations for observed results. For example, no discussion of whether functional correctness improvements could be due to other factors, or whether the grammar overapproximation affects results systematically."
    126       }
    127     },
    128     "setup_transparency": {
    129       "model_versions_specified": {
    130         "applies": true,
    131         "answer": false,
    132         "justification": "Models are named with parameter counts (e.g., 'STARCODER2 7B', 'DREAM 7B', 'LLADA 8B') but no specific version snapshots, checkpoint dates, or HuggingFace model IDs are provided."
    133       },
    134       "prompts_provided": {
    135         "applies": true,
    136         "answer": true,
    137         "justification": "Example prompts for all three tasks (C++, JSON, SMILES) are shown in Figures 7-10 in the appendix, and generation/verification prompts for synthetic data creation in Figures 11-14."
    138       },
    139       "hyperparameters_reported": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Appendix D.2 reports: temperature (1 for MRI, 0.2 for DLM), sampling algorithms (greedy for MRI, entropy/low-confidence for DLM), diffusion steps (32), max tokens (256), timeout (300s), rejection limit (100), seeds (0-4)."
    143       },
    144       "scaffolding_described": {
    145         "applies": false,
    146         "answer": false,
    147         "justification": "No agentic scaffolding is used. This is a constrained decoding algorithm applied at inference time."
    148       },
    149       "data_preprocessing_documented": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "Appendix D.3 documents dataset construction in detail: MRI span removal procedure (5-100 chars, resampling), JSON schema cleaning/filtering, SMILES generation with three filtering steps (validity, solvability, deduplication), with resulting instance counts."
    153       }
    154     },
    155     "limitations_and_scope": {
    156       "limitations_section_present": {
    157         "applies": true,
    158         "answer": false,
    159         "justification": "No dedicated limitations or threats-to-validity section. Some limitations are mentioned in passing (grammar overapproximation in Appendix E, C++ grammar not covering all features) but no consolidated discussion."
    160       },
    161       "threats_to_validity_specific": {
    162         "applies": true,
    163         "answer": false,
    164         "justification": "No specific threats to validity discussed. The grammar coverage limitation and overapproximation are mentioned but not framed as threats to validity."
    165       },
    166       "scope_boundaries_stated": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No explicit statements about what the results do NOT show. The paper does not discuss limitations of the benchmarks used or bound claims to specific settings explicitly."
    170       }
    171     },
    172     "data_integrity": {
    173       "raw_data_available": {
    174         "applies": true,
    175         "answer": false,
    176         "justification": "Raw experimental outputs (model generations, per-instance results) are not provided. Only aggregate statistics in tables."
    177       },
    178       "data_collection_described": {
    179         "applies": true,
    180         "answer": true,
    181         "justification": "Appendix D.3 describes dataset construction procedures for all three tasks in detail, including filtering criteria and resulting counts."
    182       },
    183       "recruitment_methods_described": {
    184         "applies": false,
    185         "answer": false,
    186         "justification": "No human participants. Data sources are standard benchmarks and synthetic generation."
    187       },
    188       "data_pipeline_documented": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "The pipeline from HumanEval-X to MRI tasks (span removal, filtering invalid solutions) and from Gemini generation to JSON/SMILES benchmarks (three filtering steps with counts: 272 JSON, 167 SMILES) is documented in Appendix D.3."
    192       }
    193     },
    194     "conflicts_of_interest": {
    195       "funding_disclosed": {
    196         "applies": true,
    197         "answer": false,
    198         "justification": "No funding or acknowledgments section found in the paper."
    199       },
    200       "affiliations_disclosed": {
    201         "applies": true,
    202         "answer": true,
    203         "justification": "All authors listed as ETH Zurich, Department of Computer Science. No commercial product is being evaluated."
    204       },
    205       "funder_independent_of_outcome": {
    206         "applies": true,
    207         "answer": false,
    208         "justification": "No funding information disclosed, so independence cannot be assessed."
    209       },
    210       "financial_interests_declared": {
    211         "applies": true,
    212         "answer": false,
    213         "justification": "No competing interests statement found in the paper."
    214       }
    215     },
    216     "contamination": {
    217       "training_cutoff_stated": {
    218         "applies": true,
    219         "answer": false,
    220         "justification": "No training data cutoff dates stated for any of the 9 models evaluated. The paper uses HumanEval (published 2021) with models trained after 2021."
    221       },
    222       "train_test_overlap_discussed": {
    223         "applies": true,
    224         "answer": false,
    225         "justification": "No discussion of whether HumanEval examples appeared in training data of the evaluated models, despite HumanEval being a widely-known public benchmark."
    226       },
    227       "benchmark_contamination_addressed": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "HumanEval was published in 2021; all evaluated models were trained after this date. No discussion of contamination risk. The synthetic JSON and SMILES benchmarks would not have this issue, but C++ HumanEval does."
    231       }
    232     },
    233     "human_studies": {
    234       "pre_registered": {
    235         "applies": false,
    236         "answer": false,
    237         "justification": "No human participants in this study."
    238       },
    239       "irb_or_ethics_approval": {
    240         "applies": false,
    241         "answer": false,
    242         "justification": "No human participants in this study."
    243       },
    244       "demographics_reported": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants in this study."
    248       },
    249       "inclusion_exclusion_criteria": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants in this study."
    253       },
    254       "randomization_described": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants in this study."
    258       },
    259       "blinding_described": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants in this study."
    263       },
    264       "attrition_reported": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants in this study."
    268       }
    269     },
    270     "cost_and_practicality": {
    271       "inference_cost_reported": {
    272         "applies": true,
    273         "answer": true,
    274         "justification": "Runtime overhead reported in detail: Tables 3-4 show per-token and per-completion overhead in ms and percentages for all models and settings. Median overhead ranges from 30% (DLM) to 125% (MRI)."
    275       },
    276       "compute_budget_stated": {
    277         "applies": true,
    278         "answer": false,
    279         "justification": "No mention of total GPU hours, hardware used for experiments, or total compute budget. Only relative overhead percentages are reported."
    280       }
    281     }
    282   },
    283   "claims": [
    284     {
    285       "claim": "Constrained decoding with completion sampling achieves near-perfect syntactic correctness (95.8-100%) across all settings.",
    286       "evidence": "Tables 1 and 2 show Con. achieving 83-100% syntax in MRI and 99.2-100% in DLM settings (Section 4.2-4.3).",
    287       "supported": "strong"
    288     },
    289     {
    290       "claim": "Constrained decoding improves functional correctness by up to 7% over unconstrained generation.",
    291       "evidence": "Table 1 shows functional correctness improvements averaging 2.8% for MRI; Table 2 shows DREAM 7B JSON improving by 6.9% (Section 4.2-4.3).",
    292       "supported": "strong"
    293     },
    294     {
    295       "claim": "The computational overhead remains practical, with median inference time less than doubling on average.",
    296       "evidence": "Tables 3-4: MRI median overhead 125%, DLM median overhead 30%. For 7B models, overhead is ~100% (Section 4.2-4.3, Appendix D.4).",
    297       "supported": "strong"
    298     },
    299     {
    300       "claim": "Only 0.7% of valid completions do not appear in the first 50 selected LLM proposals.",
    301       "evidence": "Stated in Section 3.2 without further detail on how this was measured.",
    302       "supported": "moderate"
    303     }
    304   ],
    305   "methodology_tags": ["benchmark-eval"],
    306   "key_findings": "The paper presents the first constrained decoding method for diffusion language models that handles context-free grammars, enabling guaranteed syntactic correctness for C++, JSON, and SMILES generation. Across 9 models and 3 tasks, constrained decoding with completion sampling achieves 95.8-100% syntactic correctness (vs. 11-94% unconstrained) with modest runtime overhead (30-125% median). Functional correctness also improves by 2-7% since syntactically invalid outputs are prevented. The method generalizes across prefix, fill-in-the-middle, multi-region infilling, and diffusion LLM generation paradigms.",
    307   "red_flags": [
    308     {
    309       "flag": "No contamination analysis for HumanEval",
    310       "detail": "HumanEval was published in 2021 and is widely available online. All evaluated models were trained after 2021 and likely saw these examples during training. The paper does not discuss this contamination risk. While this affects functional correctness comparisons more than syntactic correctness (the main contribution), it undermines the validity of pass@1 results."
    311     },
    312     {
    313       "flag": "Synthetic benchmarks generated by Gemini",
    314       "detail": "The JSON and SMILES benchmarks were generated by Gemini-2.5-Pro. While filtering steps are applied, the quality and diversity of these synthetic benchmarks are not independently validated. No analysis of benchmark difficulty distribution or comparison to human-created benchmarks."
    315     },
    316     {
    317       "flag": "No limitations section",
    318       "detail": "The paper lacks a dedicated limitations or threats-to-validity section. Grammar coverage limitations (e.g., no C++ templates) and the overapproximation of infilling region sizes are mentioned only in passing."
    319     }
    320   ],
    321   "cited_papers": [
    322     {
    323       "title": "Evaluating Large Language Models Trained on Code",
    324       "authors": ["Mark Chen", "Jerry Tworek"],
    325       "year": 2021,
    326       "arxiv_id": "2107.03374",
    327       "relevance": "HumanEval benchmark paper, foundational code generation evaluation used in this work."
    328     },
    329     {
    330       "title": "Guiding LLMs The Right Way: Fast, Non-invasive Constrained Generation",
    331       "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"],
    332       "year": 2024,
    333       "relevance": "Prior constrained decoding method for autoregressive LLMs that this work generalizes to diffusion models."
    334     },
    335     {
    336       "title": "SynCode: LLM Generation with Grammar Augmentation",
    337       "authors": ["Shubham Ugare", "Tarun Suresh"],
    338       "year": 2024,
    339       "arxiv_id": "2403.01632",
    340       "relevance": "Grammar-augmented constrained decoding for autoregressive LLMs, a key prior approach."
    341     },
    342     {
    343       "title": "DINGO: Constrained Inference for Diffusion LLMs",
    344       "authors": ["Tarun Suresh", "Debangshu Banerjee"],
    345       "year": 2025,
    346       "arxiv_id": "2505.23061",
    347       "relevance": "Most closely related work: constrains diffusion LLMs but only to regular languages, not CFGs."
    348     },
    349     {
    350       "title": "Large Language Diffusion Models",
    351       "authors": ["Shen Nie", "Fengqi Zhu"],
    352       "year": 2025,
    353       "arxiv_id": "2502.09992",
    354       "relevance": "LLaDA model paper, one of the diffusion LLMs evaluated in this work."
    355     },
    356     {
    357       "title": "DiffuCoder: Understanding and Improving Masked Diffusion Models for Code Generation",
    358       "authors": ["Shansan Gong", "Ruixiang Zhang"],
    359       "year": 2025,
    360       "arxiv_id": "2506.20639",
    361       "relevance": "DiffuCoder model paper, one of the diffusion LLMs evaluated; focuses on code generation with diffusion."
    362     },
    363     {
    364       "title": "LiveCodeBench: Holistic and Contamination Free Evaluation of Large Language Models for Code",
    365       "authors": ["Naman Jain", "King Han"],
    366       "year": 2025,
    367       "relevance": "Contamination-aware code benchmark, relevant to evaluation methodology for LLM code generation."
    368     },
    369     {
    370       "title": "BaxBench: Can LLMs generate correct and secure backends?",
    371       "authors": ["Mark Vero", "Niels Mündler"],
    372       "year": 2025,
    373       "relevance": "LLM code generation benchmark for backend systems, from same research group."
    374     },
    375     {
    376       "title": "Constrained Decoding for Fill-in-the-middle Code Language Models via Efficient Left and Right Quotienting of Context-sensitive Grammars",
    377       "authors": ["Daniel Melcer", "Nathan Fulton"],
    378       "year": 2024,
    379       "arxiv_id": "2402.17988",
    380       "relevance": "Extends constrained decoding to FIM with context-sensitive features; directly extended by this work."
    381     },
    382     {
    383       "title": "A Survey on Large Language Models for Code Generation",
    384       "authors": ["Juyong Jiang", "Fan Wang"],
    385       "year": 2024,
    386       "relevance": "Survey of LLM code generation approaches, relevant to the broader survey scope."
    387     }
    388   ]
    389 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs