scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25261B)
      1 {
      2   "paper": {
      3     "title": "Fast Controlled Generation from Language Models with Adaptive Weighted Rejection Sampling",
      4     "authors": ["Benjamin Lipkin", "Benjamin LeBrun", "Jacob Hoover Vigly", "João Loula", "David R. MacIver", "Li Du", "Jason Eisner", "Ryan Cotterell", "Vikash Mansinghka", "Timothy J. O'Donnell", "Alexander K. Lew", "Tim Vieira"],
      5     "year": 2025,
      6     "venue": "COLM 2025",
      7     "arxiv_id": "2504.05410",
      8     "doi": "10.48550/arXiv.2504.05410"
      9   },
     10   "scan_version": 2,
     11   "active_modules": ["experimental_rigor", "data_leakage"],
     12   "methodology_tags": ["benchmark-eval", "theoretical"],
     13   "key_findings": "AWRS achieves >50x speedup over token masking for constrained decoding with no accuracy loss, by dynamically allocating computation based on constraint difficulty. AWRS-SMC outperforms or matches state-of-the-art baselines across 5 domains (Text-to-SQL, JSON, goal inference, molecular synthesis, pattern matching) while using fewer particles. Runtime scales with KL divergence between constrained and unconstrained distributions, meaning the method is faster for better base models.",
     14   "checklist": {
     15     "artifacts": {
     16       "code_released": {
     17         "applies": true,
     18         "answer": true,
     19         "justification": "Two GitHub repositories provided: https://github.com/genlm/genlm-control (library) and https://github.com/genlm/awrs-colm-2025 (experiment replication)."
     20       },
     21       "data_released": {
     22         "applies": true,
     23         "answer": true,
     24         "justification": "Uses publicly available benchmarks (Spider, JSONSchemaBench, Planetarium, GDB-17). The pattern matching dataset generation pipeline is described in App. J, and the replication repo is provided."
     25       },
     26       "environment_specified": {
     27         "applies": true,
     28         "answer": false,
     29         "justification": "No requirements.txt, Dockerfile, or detailed environment specification found in the paper. Hardware is mentioned (L40S, A100 GPUs) but software dependencies are not specified."
     30       },
     31       "reproduction_instructions": {
     32         "applies": true,
     33         "answer": false,
     34         "justification": "While a replication repository is provided, no step-by-step reproduction instructions are included in the paper itself. The paper notes implementations are 'relatively unoptimized' pure Python but does not provide reproduction steps."
     35       }
     36     },
     37     "statistical_methodology": {
     38       "confidence_intervals_or_error_bars": {
     39         "applies": true,
     40         "answer": true,
     41         "justification": "95% bootstrapped confidence intervals are reported for all accuracy and runtime results in Table 1 and Table 3."
     42       },
     43       "significance_tests": {
     44         "applies": true,
     45         "answer": false,
     46         "justification": "No formal significance tests are used. Comparisons between methods rely on comparing bootstrapped confidence intervals, but no explicit hypothesis tests (p-values, etc.) are reported."
     47       },
     48       "effect_sizes_reported": {
     49         "applies": true,
     50         "answer": true,
     51         "justification": "Results are reported with absolute accuracy values and runtime in seconds with baselines for context (e.g., ARS-LCD 0.980 vs TM-LCD 0.978, >50x speedup). The reader can compute effect sizes from the provided numbers."
     52       },
     53       "sample_size_justified": {
     54         "applies": true,
     55         "answer": false,
     56         "justification": "No justification for sample sizes is provided. The number of examples per benchmark is not discussed in terms of statistical power."
     57       },
     58       "variance_reported": {
     59         "applies": true,
     60         "answer": true,
     61         "justification": "95% bootstrapped confidence intervals are reported for all main results, providing spread information. Simulation results in App. F also show variance across Monte Carlo iterations."
     62       }
     63     },
     64     "evaluation_design": {
     65       "baselines_included": {
     66         "applies": true,
     67         "answer": true,
     68         "justification": "Five baselines compared: Base LM, TM-LCD (token masking), ARS-LCD, Sample-Verify, and Twisted SMC. Each is clearly described in §4."
     69       },
     70       "baselines_contemporary": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Baselines include recent work: Twisted SMC (Loula et al., 2025), Sample-Verify approaches from 2024, and the standard TM-LCD approach. Related work in §6 discusses very recent concurrent work (Botta et al., 2025; Mündler et al., 2025)."
     74       },
     75       "ablation_study": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "The comparison between ARS-LCD (unweighted) and AWRS-SMC (weighted with importance correction) serves as an ablation of the weighting component. The paper also varies number of particles and model sizes (Fig. L.1, Tables 1 and 3)."
     79       },
     80       "multiple_metrics": {
     81         "applies": true,
     82         "answer": true,
     83         "justification": "Two primary metrics reported: task-specific accuracy and runtime (seconds per example). Each domain also has its own accuracy metric (execution accuracy, QED, pattern adherence, etc.)."
     84       },
     85       "human_evaluation": {
     86         "applies": false,
     87         "answer": false,
     88         "justification": "Human evaluation is not relevant; the tasks have objective ground-truth metrics (execution accuracy, schema validation, pattern matching, PDDL equivalence, QED)."
     89       },
     90       "held_out_test_set": {
     91         "applies": true,
     92         "answer": true,
     93         "justification": "Standard benchmark splits used: Spider development split, JSONSchemaBench validation splits, Planetarium Blocksworld tasks. These are established held-out evaluation sets."
     94       },
     95       "per_category_breakdown": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Results are broken down by domain (5 domains in Table 1), by model size (Table 3, 1B/8B/70B), and by JSON difficulty level (Github-trivial, -easy, -medium mentioned in §4)."
     99       },
    100       "failure_cases_discussed": {
    101         "applies": true,
    102         "answer": true,
    103         "justification": "The paper discusses where LCD fails (dead-end paths, §2 example with 'mortg'), and Figure 2 shows how AWRS handles hard cases. The paper notes Pattern Matching suffers less from greediness because its local constraint is more precise."
    104       },
    105       "negative_results_reported": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "AWRS-SMC does not always improve over ARS-LCD (Pattern Matching domain, where the local constraint is precise enough). The paper also notes TM-LCD was computationally infeasible for most benchmarks."
    109       }
    110     },
    111     "claims_and_evidence": {
    112       "abstract_claims_supported": {
    113         "applies": true,
    114         "answer": true,
    115         "justification": "Abstract claims of orders-of-magnitude fewer constraint evaluations, low-variance unbiased Z estimates, and superiority across 5 benchmarks are all supported by Table 1, Figure 2, and the theoretical analysis."
    116       },
    117       "causal_claims_justified": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Causal claims like 'AWRS improves runtime' are justified through controlled single-variable comparisons (same model, same benchmark, different decoding algorithm). The ablation between ARS-LCD and AWRS-SMC isolates the importance weighting contribution."
    121       },
    122       "generalization_bounded": {
    123         "applies": true,
    124         "answer": true,
    125         "justification": "Claims are bounded to the tested domains. The paper states results for specific benchmarks and models (Llama 3.1 8B, etc.). The abstract says 'through extensive empirical evaluation in text-to-SQL, molecular synthesis, goal inference, pattern matching, and JSON domains' rather than claiming general superiority."
    126       },
    127       "alternative_explanations_discussed": {
    128         "applies": true,
    129         "answer": true,
    130         "justification": "The paper discusses why AWRS-SMC's advantage varies across domains (Pattern Matching has a more precise local constraint, §5). It also discusses the relationship between model quality and runtime, providing both theoretical (§3, App. G) and empirical (Fig. 2) analyses of what drives performance."
    131       },
    132       "proxy_outcome_distinction": {
    133         "applies": true,
    134         "answer": true,
    135         "justification": "The paper measures task-specific accuracy (execution accuracy, QED, pattern adherence) and clearly states what each metric measures. It does not overframe these as broader claims beyond the specific tasks."
    136       }
    137     },
    138     "setup_transparency": {
    139       "model_versions_specified": {
    140         "applies": true,
    141         "answer": true,
    142         "justification": "Specific model versions stated: Llama 3.1 8B-Instruct, Llama 3.1 8B, Llama 3.2 1B, Llama 3.3 70B. These are specific enough model identifiers."
    143       },
    144       "prompts_provided": {
    145         "applies": true,
    146         "answer": false,
    147         "justification": "For molecular synthesis, the paper mentions 'few-shot prompts created by repeatedly selecting 20 random samples from the GDB-17 database' but the actual prompt text is not shown. No prompts are provided in the paper or appendix for any domain."
    148       },
    149       "hyperparameters_reported": {
    150         "applies": true,
    151         "answer": true,
    152         "justification": "App. K reports temperature (1.0), max tokens per domain (32-350), ESS thresholds for resampling, particle counts (M=5 for AWRS-SMC, M=10 for baselines), and resampling strategies."
    153       },
    154       "scaffolding_described": {
    155         "applies": false,
    156         "answer": false,
    157         "justification": "No agentic scaffolding is used. The method is a decoding algorithm applied directly to language models."
    158       },
    159       "data_preprocessing_documented": {
    160         "applies": true,
    161         "answer": true,
    162         "justification": "Data sources and processing are described: Spider dev split, JSONSchemaBench validation splits with difficulty tiers, Planetarium Blocksworld with up to 10 objects, GDB-17 database for molecular synthesis, and a detailed pattern generation pipeline in App. J."
    163       }
    164     },
    165     "limitations_and_scope": {
    166       "limitations_section_present": {
    167         "applies": true,
    168         "answer": false,
    169         "justification": "No dedicated limitations section. The paper discusses some limitations implicitly (e.g., noting implementations are unoptimized, TM-LCD is infeasible for most benchmarks) but lacks a structured limitations discussion."
    170       },
    171       "threats_to_validity_specific": {
    172         "applies": true,
    173         "answer": false,
    174         "justification": "No threats to validity are discussed. The paper does not address potential issues like the choice of benchmarks, the impact of constraint checker implementation quality, or generalization to other model families."
    175       },
    176       "scope_boundaries_stated": {
    177         "applies": true,
    178         "answer": true,
    179         "justification": "The paper explicitly notes it focuses on 'runtime control' (footnote 1), distinguishing from training-based methods. It also notes implementations are 'pure Python and relatively unoptimized' (footnote 5), bounding runtime claims."
    180       }
    181     },
    182     "data_integrity": {
    183       "raw_data_available": {
    184         "applies": true,
    185         "answer": true,
    186         "justification": "A replication repository is provided (https://github.com/genlm/awrs-colm-2025) containing 'source code and data to replicate this paper's experiments.'"
    187       },
    188       "data_collection_described": {
    189         "applies": true,
    190         "answer": true,
    191         "justification": "Each benchmark's data source is clearly described: Spider dev split (Yu et al., 2018), JSONSchemaBench validation splits (Geng et al., 2025), Planetarium Blocksworld (Zuo et al., 2024), GDB-17 (Ruddigkeit et al., 2012), and custom pattern matching pipeline (App. J)."
    192       },
    193       "recruitment_methods_described": {
    194         "applies": false,
    195         "answer": false,
    196         "justification": "No human participants. All data comes from standard benchmarks."
    197       },
    198       "data_pipeline_documented": {
    199         "applies": true,
    200         "answer": true,
    201         "justification": "The pattern matching generation pipeline is detailed in App. J with filtering stages and counts (1503 candidates → 402 after dedup, library compatibility, FSM exclusion, and prefix checks). Other benchmarks use standard splits."
    202       }
    203     },
    204     "conflicts_of_interest": {
    205       "funding_disclosed": {
    206         "applies": true,
    207         "answer": true,
    208         "justification": "Acknowledgments section discloses NSF Graduate Research Fellowship (Grant No. 2141064), NSF SBE Postdoctoral Research Fellowship (Grant No. SMA-2404644), and Mila compute resources."
    209       },
    210       "affiliations_disclosed": {
    211         "applies": true,
    212         "answer": true,
    213         "justification": "All author affiliations are clearly listed: MIT, ETH Zürich, McGill, Canada CIFAR AI Chair, Mila, Johns Hopkins, Yale, CHI FRO."
    214       },
    215       "funder_independent_of_outcome": {
    216         "applies": true,
    217         "answer": true,
    218         "justification": "Funding is from NSF and academic compute (Mila). These are independent of the outcome — no commercial entity with a stake in the results is funding the work."
    219       },
    220       "financial_interests_declared": {
    221         "applies": true,
    222         "answer": false,
    223         "justification": "No competing interests statement is present in the paper."
    224       }
    225     },
    226     "contamination": {
    227       "training_cutoff_stated": {
    228         "applies": true,
    229         "answer": false,
    230         "justification": "The paper uses Llama models on benchmarks like Spider but does not state the training data cutoff dates for the models."
    231       },
    232       "train_test_overlap_discussed": {
    233         "applies": true,
    234         "answer": false,
    235         "justification": "No discussion of whether Spider or other benchmark examples appeared in Llama's training data."
    236       },
    237       "benchmark_contamination_addressed": {
    238         "applies": true,
    239         "answer": false,
    240         "justification": "Spider was published in 2018, well before the Llama models' training data cutoff. The paper does not discuss contamination risk despite using benchmarks that predate the models."
    241       }
    242     },
    243     "human_studies": {
    244       "pre_registered": {
    245         "applies": false,
    246         "answer": false,
    247         "justification": "No human participants."
    248       },
    249       "irb_or_ethics_approval": {
    250         "applies": false,
    251         "answer": false,
    252         "justification": "No human participants."
    253       },
    254       "demographics_reported": {
    255         "applies": false,
    256         "answer": false,
    257         "justification": "No human participants."
    258       },
    259       "inclusion_exclusion_criteria": {
    260         "applies": false,
    261         "answer": false,
    262         "justification": "No human participants."
    263       },
    264       "randomization_described": {
    265         "applies": false,
    266         "answer": false,
    267         "justification": "No human participants."
    268       },
    269       "blinding_described": {
    270         "applies": false,
    271         "answer": false,
    272         "justification": "No human participants."
    273       },
    274       "attrition_reported": {
    275         "applies": false,
    276         "answer": false,
    277         "justification": "No human participants."
    278       }
    279     },
    280     "cost_and_practicality": {
    281       "inference_cost_reported": {
    282         "applies": true,
    283         "answer": true,
    284         "justification": "Runtime in seconds per example is reported for all methods and benchmarks in Table 1 and Table 3. Constraint evaluation costs per domain are in Table 2 (App. I.1)."
    285       },
    286       "compute_budget_stated": {
    287         "applies": true,
    288         "answer": false,
    289         "justification": "Hardware is mentioned (L40S, A100 GPUs) but total compute budget (GPU hours, total experiment time) is not stated."
    290       }
    291     },
    292     "experimental_rigor": {
    293       "seed_sensitivity_reported": {
    294         "applies": true,
    295         "answer": false,
    296         "justification": "No seed sensitivity analysis. Results are reported with bootstrapped CIs but no mention of varying random seeds across runs."
    297       },
    298       "number_of_runs_stated": {
    299         "applies": true,
    300         "answer": false,
    301         "justification": "The number of experimental runs is not explicitly stated. Bootstrapped CIs are computed but it is unclear whether results come from single or multiple runs."
    302       },
    303       "hyperparameter_search_budget": {
    304         "applies": true,
    305         "answer": false,
    306         "justification": "No hyperparameter search budget reported. Choices of particle counts (M=5 for AWRS-SMC, M=10 for baselines) and ESS thresholds appear tuned but no search process is described."
    307       },
    308       "best_config_selection_justified": {
    309         "applies": true,
    310         "answer": false,
    311         "justification": "The choice of M=5 for AWRS-SMC vs M=10 for baselines is not justified through a validation procedure. Different ESS thresholds and resampling strategies are used per domain without explaining selection."
    312       },
    313       "multiple_comparison_correction": {
    314         "applies": false,
    315         "answer": false,
    316         "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable."
    317       },
    318       "self_comparison_bias_addressed": {
    319         "applies": true,
    320         "answer": false,
    321         "justification": "The authors implement all baselines themselves. No discussion of self-comparison bias or whether their implementations of Sample-Verify and Twisted SMC are faithful."
    322       },
    323       "compute_budget_vs_performance": {
    324         "applies": true,
    325         "answer": true,
    326         "justification": "Figure L.1 shows accuracy vs runtime tradeoff across different particle counts and model sizes. Table 1 reports both accuracy and runtime for all methods, enabling compute-matched comparisons."
    327       },
    328       "benchmark_construct_validity": {
    329         "applies": true,
    330         "answer": false,
    331         "justification": "No discussion of whether the chosen benchmarks adequately represent real-world constrained generation needs. The paper does not question the construct validity of any benchmark."
    332       },
    333       "scaffold_confound_addressed": {
    334         "applies": false,
    335         "answer": false,
    336         "justification": "No scaffolding involved. The methods are decoding algorithms applied directly to LMs."
    337       }
    338     },
    339     "data_leakage": {
    340       "temporal_leakage_addressed": {
    341         "applies": true,
    342         "answer": false,
    343         "justification": "No discussion of temporal leakage. Spider (2018) and other benchmarks predate Llama training, creating contamination risk that is not addressed."
    344       },
    345       "feature_leakage_addressed": {
    346         "applies": true,
    347         "answer": false,
    348         "justification": "No discussion of feature leakage. The constraint checker provides information during generation that may not be available in unconstrained settings, but this is inherent to the method rather than a leakage concern."
    349       },
    350       "non_independence_addressed": {
    351         "applies": true,
    352         "answer": false,
    353         "justification": "No discussion of whether training and test data share structural similarities."
    354       },
    355       "leakage_detection_method": {
    356         "applies": true,
    357         "answer": false,
    358         "justification": "No leakage detection or prevention method is used or discussed."
    359       }
    360     }
    361   },
    362   "claims": [
    363     {
    364       "claim": "ARS-LCD is >50x faster than token masking (TM-LCD) with no loss of accuracy on constrained decoding.",
    365       "evidence": "Table 1e: Pattern Matching — ARS-LCD accuracy 0.980 vs TM-LCD 0.978, runtime 0.16s vs 6.91s (§5).",
    366       "supported": "strong"
    367     },
    368     {
    369       "claim": "AWRS-SMC outperforms or matches all baselines across 5 domains while using fewer particles (M=5 vs M=10).",
    370       "evidence": "Table 1: AWRS-SMC matches or beats Sample-Verify and Twisted SMC in accuracy across all 5 domains (§5).",
    371       "supported": "strong"
    372     },
    373     {
    374       "claim": "AWRS runtime scales with the KL divergence between constrained and unconstrained distributions, making it faster for better models.",
    375       "evidence": "Figure 2 shows empirical correlation between DKL and number of constraint evaluations. Theoretical analysis in Proposition 4 and App. G.2.",
    376       "supported": "strong"
    377     },
    378     {
    379       "claim": "AWRS-SMC with a smaller LM (1B) outperforms Twisted SMC with larger LMs (8B, 70B) in accuracy and runtime.",
    380       "evidence": "Figure L.1 and Table 3: AWRS-SMC with Llama 3.2 1B (0.974 accuracy) beats Twisted SMC with Llama 3.1 8B (0.796) and Llama 3.3 70B (0.846) on Pattern Matching.",
    381       "supported": "strong"
    382     },
    383     {
    384       "claim": "The Z estimator produced by AWRS is unbiased.",
    385       "evidence": "Formal proof in Proposition 3 / App. D using the RAVI framework. Empirical validation in Figure F.1 showing MAE trending to 0 with increasing Monte Carlo samples.",
    386       "supported": "strong"
    387     }
    388   ],
    389   "red_flags": [
    390     {
    391       "flag": "Asymmetric particle counts",
    392       "detail": "AWRS-SMC uses M=5 particles while Sample-Verify and Twisted SMC use M=10. While the paper acknowledges this ('with half the number of particles'), the comparison is not fully controlled — giving AWRS-SMC the same budget might show different results."
    393     },
    394     {
    395       "flag": "TM-LCD baseline run on only one benchmark",
    396       "detail": "The key speedup claim (>50x) is demonstrated on only the Pattern Matching domain. The paper states TM-LCD was 'computationally infeasible' for other benchmarks but does not provide partial results or projected costs."
    397     },
    398     {
    399       "flag": "No contamination analysis",
    400       "detail": "Spider (2018) and other benchmarks predate Llama model training, creating significant contamination risk. Base LM accuracy on Spider (52.3%) is high enough that memorization could be a factor, and this is never discussed."
    401     }
    402   ],
    403   "cited_papers": [
    404     {
    405       "title": "Synchromesh: Reliable code generation from pre-trained language models",
    406       "authors": ["Gabriel Poesia", "Alex Polozov", "Vu Le", "Ashish Tiwari", "Gustavo Soares", "Christopher Meek", "Sumit Gulwani"],
    407       "year": 2022,
    408       "relevance": "Key prior work on constrained decoding for code generation from LMs."
    409     },
    410     {
    411       "title": "SynCode: Improving LLM code generation with grammar augmentation",
    412       "authors": ["Shubham Ugare", "Tarun Suresh", "Hangoo Kang", "Sasa Misailovic", "Gagandeep Singh"],
    413       "year": 2024,
    414       "arxiv_id": "2403.01632",
    415       "relevance": "Grammar-based constrained decoding for LLM code generation."
    416     },
    417     {
    418       "title": "Efficient guided generation for large language models",
    419       "authors": ["Brandon T Willard", "Rémi Louf"],
    420       "year": 2023,
    421       "arxiv_id": "2307.09702",
    422       "relevance": "Paper behind the Outlines library for structured LLM generation with grammar constraints."
    423     },
    424     {
    425       "title": "XGrammar: Flexible and efficient structured generation engine for large language models",
    426       "authors": ["Yixin Dong", "Charlie F Ruan", "Yaxing Cai", "Ruihang Lai", "Ziyi Xu", "Yilong Zhao", "Tianqi Chen"],
    427       "year": 2024,
    428       "arxiv_id": "2411.15100",
    429       "relevance": "State-of-the-art structured generation engine with optimized grammar-based constrained decoding."
    430     },
    431     {
    432       "title": "Syntactic and semantic control of large language models via sequential Monte Carlo",
    433       "authors": ["João Loula", "Benjamin LeBrun", "Li Du", "Ben Lipkin"],
    434       "year": 2025,
    435       "relevance": "Prior work by same group on SMC-based constrained LLM generation, used as baseline."
    436     },
    437     {
    438       "title": "CRANE: Reasoning with constrained LLM generation",
    439       "authors": ["Debangshu Banerjee", "Tarun Suresh", "Shubham Ugare", "Sasa Misailovic", "Gagandeep Singh"],
    440       "year": 2025,
    441       "arxiv_id": "2502.09061",
    442       "relevance": "Recent work on constrained LLM generation with reasoning capabilities."
    443     },
    444     {
    445       "title": "Grammar-aligned decoding",
    446       "authors": ["Kanghee Park", "Jiayu Wang", "Taylor Berg-Kirkpatrick", "Nadia Polikarpova", "Loris D'Antoni"],
    447       "year": 2024,
    448       "relevance": "Addresses distortion in grammar-constrained decoding, related to LCD greediness problem."
    449     },
    450     {
    451       "title": "PICARD: Parsing incrementally for constrained auto-regressive decoding from language models",
    452       "authors": ["Torsten Scholak", "Nathan Schucher", "Dzmitry Bahdanau"],
    453       "year": 2021,
    454       "relevance": "Influential work on incremental constrained decoding for text-to-SQL generation."
    455     },
    456     {
    457       "title": "IterGen: Iterative structured LLM generation",
    458       "authors": ["Shubham Ugare", "Rohan Gumaste", "Tarun Suresh", "Gagandeep Singh", "Sasa Misailovic"],
    459       "year": 2025,
    460       "relevance": "Alternative approach to structured generation allowing backtracking on constraint violations."
    461     },
    462     {
    463       "title": "Generating structured outputs from language models: Benchmark and studies",
    464       "authors": ["Saibo Geng", "Hudson Cooper", "Michał Moskal"],
    465       "year": 2025,
    466       "arxiv_id": "2501.10868",
    467       "relevance": "JSONSchemaBench benchmark used in this paper's evaluation of structured output generation."
    468     }
    469   ]
    470 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs