scan-v4.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v4.json (33334B)
      1 {
      2   "scan_version": 4,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Fast Controlled Generation from Language Models with Adaptive Weighted Rejection Sampling",
      6     "authors": [
      7       "Benjamin Lipkin",
      8       "Benjamin LeBrun",
      9       "Jacob Hoover Vigly",
     10       "João Loula",
     11       "David R. MacIver",
     12       "Li Du",
     13       "Jason Eisner",
     14       "Ryan Cotterell",
     15       "Vikash Mansinghka",
     16       "Timothy J. O'Donnell",
     17       "Alexander K. Lew",
     18       "Tim Vieira"
     19     ],
     20     "year": 2025,
     21     "venue": "COLM 2025",
     22     "arxiv_id": "2504.05410",
     23     "doi": "10.48550/arXiv.2504.05410"
     24   },
     25   "checklist": {
     26     "claims_and_evidence": {
     27       "abstract_claims_supported": {
     28         "applies": true,
     29         "answer": true,
     30         "justification": "Abstract claims of orders-of-magnitude fewer constraint evaluations, low-variance unbiased Z estimates, and superiority across 5 benchmarks are all supported by Table 1, Figure 2, and the theoretical analysis.",
     31         "source": "opus"
     32       },
     33       "causal_claims_justified": {
     34         "applies": true,
     35         "answer": true,
     36         "justification": "Causal claims like 'AWRS improves runtime' are justified through controlled single-variable comparisons (same model, same benchmark, different decoding algorithm). The ablation between ARS-LCD and AWRS-SMC isolates the importance weighting contribution.",
     37         "source": "opus"
     38       },
     39       "generalization_bounded": {
     40         "applies": true,
     41         "answer": true,
     42         "justification": "Claims are bounded to the tested domains. The paper states results for specific benchmarks and models (Llama 3.1 8B, etc.). The abstract says 'through extensive empirical evaluation in text-to-SQL, molecular synthesis, goal inference, pattern matching, and JSON domains' rather than claiming general superiority.",
     43         "source": "opus"
     44       },
     45       "alternative_explanations_discussed": {
     46         "applies": true,
     47         "answer": true,
     48         "justification": "The paper discusses why AWRS-SMC's advantage varies across domains (Pattern Matching has a more precise local constraint, §5). It also discusses the relationship between model quality and runtime, providing both theoretical (§3, App. G) and empirical (Fig. 2) analyses of what drives performance.",
     49         "source": "opus"
     50       },
     51       "proxy_outcome_distinction": {
     52         "applies": true,
     53         "answer": true,
     54         "justification": "The paper measures task-specific accuracy (execution accuracy, QED, pattern adherence) and clearly states what each metric measures. It does not overframe these as broader claims beyond the specific tasks.",
     55         "source": "opus"
     56       }
     57     },
     58     "limitations_and_scope": {
     59       "limitations_section_present": {
     60         "applies": true,
     61         "answer": false,
     62         "justification": "No dedicated limitations section. The paper discusses some limitations implicitly (e.g., noting implementations are unoptimized, TM-LCD is infeasible for most benchmarks) but lacks a structured limitations discussion.",
     63         "source": "opus"
     64       },
     65       "threats_to_validity_specific": {
     66         "applies": true,
     67         "answer": false,
     68         "justification": "No threats to validity are discussed. The paper does not address potential issues like the choice of benchmarks, the impact of constraint checker implementation quality, or generalization to other model families.",
     69         "source": "opus"
     70       },
     71       "scope_boundaries_stated": {
     72         "applies": true,
     73         "answer": true,
     74         "justification": "The paper explicitly notes it focuses on 'runtime control' (footnote 1), distinguishing from training-based methods. It also notes implementations are 'pure Python and relatively unoptimized' (footnote 5), bounding runtime claims.",
     75         "source": "opus"
     76       }
     77     },
     78     "conflicts_of_interest": {
     79       "funding_disclosed": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "Acknowledgments section discloses NSF Graduate Research Fellowship (Grant No. 2141064), NSF SBE Postdoctoral Research Fellowship (Grant No. SMA-2404644), and Mila compute resources.",
     83         "source": "opus"
     84       },
     85       "affiliations_disclosed": {
     86         "applies": true,
     87         "answer": true,
     88         "justification": "All author affiliations are clearly listed: MIT, ETH Zürich, McGill, Canada CIFAR AI Chair, Mila, Johns Hopkins, Yale, CHI FRO.",
     89         "source": "opus"
     90       },
     91       "funder_independent_of_outcome": {
     92         "applies": true,
     93         "answer": true,
     94         "justification": "Funding is from NSF and academic compute (Mila). These are independent of the outcome — no commercial entity with a stake in the results is funding the work.",
     95         "source": "opus"
     96       },
     97       "financial_interests_declared": {
     98         "applies": true,
     99         "answer": false,
    100         "justification": "No competing interests statement is present in the paper.",
    101         "source": "opus"
    102       }
    103     },
    104     "scope_and_framing": {
    105       "key_terms_defined": {
    106         "applies": true,
    107         "answer": true,
    108         "justification": "Key terms are precisely defined: locally constrained decoding (LCD, §2), the normalizing constant Z (Eq. 1), adaptive rejection sampling (§2), properly weighted proposals (Definition 3, App. B), and sequential Monte Carlo (App. A).",
    109         "source": "haiku"
    110       },
    111       "intended_contribution_clear": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The paper explicitly enumerates four contributions in the introduction: (1) the AWRS fast Las Vegas sampler, (2) stochastic unbiased estimates of Z for SMC, (3) runtime analysis, and (4) empirical evaluation across five domains.",
    115         "source": "haiku"
    116       },
    117       "engagement_with_prior_work": {
    118         "applies": true,
    119         "answer": true,
    120         "justification": "Section 6 and throughout the paper explicitly compare AWRS against Outlines/token masking, grammar-based approaches (Koo et al., XGrammar), and SMC methods (Lew et al. 2023, Zhao et al. 2024, Loula et al. 2025), explaining mechanistic differences rather than just listing citations.",
    121         "source": "haiku"
    122       }
    123     }
    124   },
    125   "type_checklist": {
    126     "empirical": {
    127       "artifacts": {
    128         "code_released": {
    129           "applies": true,
    130           "answer": true,
    131           "justification": "Two GitHub repositories provided: https://github.com/genlm/genlm-control (library) and https://github.com/genlm/awrs-colm-2025 (experiment replication).",
    132           "source": "opus"
    133         },
    134         "data_released": {
    135           "applies": true,
    136           "answer": true,
    137           "justification": "Uses publicly available benchmarks (Spider, JSONSchemaBench, Planetarium, GDB-17). The pattern matching dataset generation pipeline is described in App. J, and the replication repo is provided.",
    138           "source": "opus"
    139         },
    140         "environment_specified": {
    141           "applies": true,
    142           "answer": false,
    143           "justification": "No requirements.txt, Dockerfile, or detailed environment specification found in the paper. Hardware is mentioned (L40S, A100 GPUs) but software dependencies are not specified.",
    144           "source": "opus"
    145         },
    146         "reproduction_instructions": {
    147           "applies": true,
    148           "answer": false,
    149           "justification": "While a replication repository is provided, no step-by-step reproduction instructions are included in the paper itself. The paper notes implementations are 'relatively unoptimized' pure Python but does not provide reproduction steps.",
    150           "source": "opus"
    151         }
    152       },
    153       "statistical_methodology": {
    154         "confidence_intervals_or_error_bars": {
    155           "applies": true,
    156           "answer": true,
    157           "justification": "95% bootstrapped confidence intervals are reported for all accuracy and runtime results in Table 1 and Table 3.",
    158           "source": "opus"
    159         },
    160         "significance_tests": {
    161           "applies": true,
    162           "answer": false,
    163           "justification": "No formal significance tests are used. Comparisons between methods rely on comparing bootstrapped confidence intervals, but no explicit hypothesis tests (p-values, etc.) are reported.",
    164           "source": "opus"
    165         },
    166         "effect_sizes_reported": {
    167           "applies": true,
    168           "answer": true,
    169           "justification": "Results are reported with absolute accuracy values and runtime in seconds with baselines for context (e.g., ARS-LCD 0.980 vs TM-LCD 0.978, >50x speedup). The reader can compute effect sizes from the provided numbers.",
    170           "source": "opus"
    171         },
    172         "sample_size_justified": {
    173           "applies": true,
    174           "answer": false,
    175           "justification": "No justification for sample sizes is provided. The number of examples per benchmark is not discussed in terms of statistical power.",
    176           "source": "opus"
    177         },
    178         "variance_reported": {
    179           "applies": true,
    180           "answer": true,
    181           "justification": "95% bootstrapped confidence intervals are reported for all main results, providing spread information. Simulation results in App. F also show variance across Monte Carlo iterations.",
    182           "source": "opus"
    183         }
    184       },
    185       "evaluation_design": {
    186         "baselines_included": {
    187           "applies": true,
    188           "answer": true,
    189           "justification": "Five baselines compared: Base LM, TM-LCD (token masking), ARS-LCD, Sample-Verify, and Twisted SMC. Each is clearly described in §4.",
    190           "source": "opus"
    191         },
    192         "baselines_contemporary": {
    193           "applies": true,
    194           "answer": true,
    195           "justification": "Baselines include recent work: Twisted SMC (Loula et al., 2025), Sample-Verify approaches from 2024, and the standard TM-LCD approach. Related work in §6 discusses very recent concurrent work (Botta et al., 2025; Mündler et al., 2025).",
    196           "source": "opus"
    197         },
    198         "ablation_study": {
    199           "applies": true,
    200           "answer": true,
    201           "justification": "The comparison between ARS-LCD (unweighted) and AWRS-SMC (weighted with importance correction) serves as an ablation of the weighting component. The paper also varies number of particles and model sizes (Fig. L.1, Tables 1 and 3).",
    202           "source": "opus"
    203         },
    204         "multiple_metrics": {
    205           "applies": true,
    206           "answer": true,
    207           "justification": "Two primary metrics reported: task-specific accuracy and runtime (seconds per example). Each domain also has its own accuracy metric (execution accuracy, QED, pattern adherence, etc.).",
    208           "source": "opus"
    209         },
    210         "human_evaluation": {
    211           "applies": false,
    212           "answer": false,
    213           "justification": "Human evaluation is not relevant; the tasks have objective ground-truth metrics (execution accuracy, schema validation, pattern matching, PDDL equivalence, QED).",
    214           "source": "opus"
    215         },
    216         "held_out_test_set": {
    217           "applies": true,
    218           "answer": true,
    219           "justification": "Standard benchmark splits used: Spider development split, JSONSchemaBench validation splits, Planetarium Blocksworld tasks. These are established held-out evaluation sets.",
    220           "source": "opus"
    221         },
    222         "per_category_breakdown": {
    223           "applies": true,
    224           "answer": true,
    225           "justification": "Results are broken down by domain (5 domains in Table 1), by model size (Table 3, 1B/8B/70B), and by JSON difficulty level (Github-trivial, -easy, -medium mentioned in §4).",
    226           "source": "opus"
    227         },
    228         "failure_cases_discussed": {
    229           "applies": true,
    230           "answer": true,
    231           "justification": "The paper discusses where LCD fails (dead-end paths, §2 example with 'mortg'), and Figure 2 shows how AWRS handles hard cases. The paper notes Pattern Matching suffers less from greediness because its local constraint is more precise.",
    232           "source": "opus"
    233         },
    234         "negative_results_reported": {
    235           "applies": true,
    236           "answer": true,
    237           "justification": "AWRS-SMC does not always improve over ARS-LCD (Pattern Matching domain, where the local constraint is precise enough). The paper also notes TM-LCD was computationally infeasible for most benchmarks.",
    238           "source": "opus"
    239         }
    240       },
    241       "setup_transparency": {
    242         "model_versions_specified": {
    243           "applies": true,
    244           "answer": true,
    245           "justification": "Specific model versions stated: Llama 3.1 8B-Instruct, Llama 3.1 8B, Llama 3.2 1B, Llama 3.3 70B. These are specific enough model identifiers.",
    246           "source": "opus"
    247         },
    248         "prompts_provided": {
    249           "applies": true,
    250           "answer": false,
    251           "justification": "For molecular synthesis, the paper describes 'few-shot prompts created by repeatedly selecting 20 random samples from the GDB-17 database', but the actual prompt text is not shown. No prompts are provided in the paper or appendix for any domain.",
    252           "source": "opus"
    253         },
    254         "hyperparameters_reported": {
    255           "applies": true,
    256           "answer": true,
    257           "justification": "App. K reports temperature (1.0), max tokens per domain (32-350), ESS thresholds for resampling, particle counts (M=5 for AWRS-SMC, M=10 for baselines), and resampling strategies.",
    258           "source": "opus"
    259         },
    260         "scaffolding_described": {
    261           "applies": false,
    262           "answer": false,
    263           "justification": "No agentic scaffolding is used. The method is a decoding algorithm applied directly to language models.",
    264           "source": "opus"
    265         },
    266         "data_preprocessing_documented": {
    267           "applies": true,
    268           "answer": true,
    269           "justification": "Data sources and processing are described: Spider dev split, JSONSchemaBench validation splits with difficulty tiers, Planetarium Blocksworld with up to 10 objects, GDB-17 database for molecular synthesis, and a detailed pattern generation pipeline in App. J.",
    270           "source": "opus"
    271         }
    272       },
    273       "data_integrity": {
    274         "raw_data_available": {
    275           "applies": true,
    276           "answer": true,
    277           "justification": "A replication repository is provided (https://github.com/genlm/awrs-colm-2025) containing 'source code and data to replicate this paper's experiments.'",
    278           "source": "opus"
    279         },
    280         "data_collection_described": {
    281           "applies": true,
    282           "answer": true,
    283           "justification": "Each benchmark's data source is clearly described: Spider dev split (Yu et al., 2018), JSONSchemaBench validation splits (Geng et al., 2025), Planetarium Blocksworld (Zuo et al., 2024), GDB-17 (Ruddigkeit et al., 2012), and custom pattern matching pipeline (App. J).",
    284           "source": "opus"
    285         },
    286         "recruitment_methods_described": {
    287           "applies": false,
    288           "answer": false,
    289           "justification": "No human participants. All data comes from standard benchmarks.",
    290           "source": "opus"
    291         },
    292         "data_pipeline_documented": {
    293           "applies": true,
    294           "answer": true,
    295           "justification": "The pattern matching generation pipeline is detailed in App. J with filtering stages and counts (1503 candidates → 402 after dedup, library compatibility, FSM exclusion, and prefix checks). Other benchmarks use standard splits.",
    296           "source": "opus"
    297         }
    298       },
    299       "contamination": {
    300         "training_cutoff_stated": {
    301           "applies": true,
    302           "answer": false,
    303           "justification": "The paper uses Llama models on benchmarks like Spider but does not state the training data cutoff dates for the models.",
    304           "source": "opus"
    305         },
    306         "train_test_overlap_discussed": {
    307           "applies": true,
    308           "answer": false,
    309           "justification": "No discussion of whether Spider or other benchmark examples appeared in Llama's training data.",
    310           "source": "opus"
    311         },
    312         "benchmark_contamination_addressed": {
    313           "applies": true,
    314           "answer": false,
    315           "justification": "Spider was published in 2018, well before Llama's training data cutoff, so its examples may have been seen during training. The paper does not discuss contamination risk despite using benchmarks that predate the models.",
    316           "source": "opus"
    317         }
    318       },
    319       "human_studies": {
    320         "pre_registered": {
    321           "applies": false,
    322           "answer": false,
    323           "justification": "No human participants.",
    324           "source": "opus"
    325         },
    326         "irb_or_ethics_approval": {
    327           "applies": false,
    328           "answer": false,
    329           "justification": "No human participants.",
    330           "source": "opus"
    331         },
    332         "demographics_reported": {
    333           "applies": false,
    334           "answer": false,
    335           "justification": "No human participants.",
    336           "source": "opus"
    337         },
    338         "inclusion_exclusion_criteria": {
    339           "applies": false,
    340           "answer": false,
    341           "justification": "No human participants.",
    342           "source": "opus"
    343         },
    344         "randomization_described": {
    345           "applies": false,
    346           "answer": false,
    347           "justification": "No human participants.",
    348           "source": "opus"
    349         },
    350         "blinding_described": {
    351           "applies": false,
    352           "answer": false,
    353           "justification": "No human participants.",
    354           "source": "opus"
    355         },
    356         "attrition_reported": {
    357           "applies": false,
    358           "answer": false,
    359           "justification": "No human participants.",
    360           "source": "opus"
    361         }
    362       },
    363       "cost_and_practicality": {
    364         "inference_cost_reported": {
    365           "applies": true,
    366           "answer": true,
    367           "justification": "Runtime in seconds per example is reported for all methods and benchmarks in Table 1 and Table 3. Constraint evaluation costs per domain are in Table 2 (App. I.1).",
    368           "source": "opus"
    369         },
    370         "compute_budget_stated": {
    371           "applies": true,
    372           "answer": false,
    373           "justification": "Hardware is mentioned (L40S, A100 GPUs) but total compute budget (GPU hours, total experiment time) is not stated.",
    374           "source": "opus"
    375         }
    376       },
    377       "experimental_rigor": {
    378         "seed_sensitivity_reported": {
    379           "applies": true,
    380           "answer": false,
    381           "justification": "No seed sensitivity analysis. Results are reported with bootstrapped CIs but no mention of varying random seeds across runs.",
    382           "source": "opus"
    383         },
    384         "number_of_runs_stated": {
    385           "applies": true,
    386           "answer": false,
    387           "justification": "The number of experimental runs is not explicitly stated. Bootstrapped CIs are computed but it is unclear whether results come from single or multiple runs.",
    388           "source": "opus"
    389         },
    390         "hyperparameter_search_budget": {
    391           "applies": true,
    392           "answer": false,
    393           "justification": "No hyperparameter search budget reported. Choices of particle counts (M=5 for AWRS-SMC, M=10 for baselines) and ESS thresholds appear tuned but no search process is described.",
    394           "source": "opus"
    395         },
    396         "best_config_selection_justified": {
    397           "applies": true,
    398           "answer": false,
    399           "justification": "The choice of M=5 for AWRS-SMC vs M=10 for baselines is not justified through a validation procedure. Different ESS thresholds and resampling strategies are used per domain without explaining selection.",
    400           "source": "opus"
    401         },
    402         "multiple_comparison_correction": {
    403           "applies": false,
    404           "answer": false,
    405           "justification": "No formal statistical tests are performed, so multiple comparison correction is not applicable.",
    406           "source": "opus"
    407         },
    408         "self_comparison_bias_addressed": {
    409           "applies": true,
    410           "answer": false,
    411           "justification": "The authors implement all baselines themselves. No discussion of self-comparison bias or whether their implementations of Sample-Verify and Twisted SMC are faithful.",
    412           "source": "opus"
    413         },
    414         "compute_budget_vs_performance": {
    415           "applies": true,
    416           "answer": true,
    417           "justification": "Figure L.1 shows accuracy vs runtime tradeoff across different particle counts and model sizes. Table 1 reports both accuracy and runtime for all methods, enabling compute-matched comparisons.",
    418           "source": "opus"
    419         },
    420         "benchmark_construct_validity": {
    421           "applies": true,
    422           "answer": false,
    423           "justification": "No discussion of whether the chosen benchmarks adequately represent real-world constrained generation needs. The paper does not question the construct validity of any benchmark.",
    424           "source": "opus"
    425         },
    426         "scaffold_confound_addressed": {
    427           "applies": false,
    428           "answer": false,
    429           "justification": "No scaffolding involved. The methods are decoding algorithms applied directly to LMs.",
    430           "source": "opus"
    431         }
    432       },
    433       "data_leakage": {
    434         "temporal_leakage_addressed": {
    435           "applies": true,
    436           "answer": false,
    437           "justification": "No discussion of temporal leakage. Spider (2018) and other benchmarks predate Llama training, creating contamination risk that is not addressed.",
    438           "source": "opus"
    439         },
    440         "feature_leakage_addressed": {
    441           "applies": true,
    442           "answer": false,
    443           "justification": "No discussion of feature leakage. The constraint checker provides information during generation that may not be available in unconstrained settings, but this is inherent to the method rather than a leakage concern.",
    444           "source": "opus"
    445         },
    446         "non_independence_addressed": {
    447           "applies": true,
    448           "answer": false,
    449           "justification": "No discussion of whether training and test data share structural similarities.",
    450           "source": "opus"
    451         },
    452         "leakage_detection_method": {
    453           "applies": true,
    454           "answer": false,
    455           "justification": "No leakage detection or prevention method is used or discussed.",
    456           "source": "opus"
    457         }
    458       }
    459     }
    460   },
    461   "claims": [
    462     {
    463       "claim": "AWRS requires orders of magnitude fewer constraint evaluations than enumerative token masking (TM-LCD), achieving >50× speedup at sequence level.",
    464       "evidence": "On pattern matching (the only domain where TM-LCD was computationally feasible), ARS-LCD achieves 0.16 sec/ex vs. TM-LCD's 6.91 sec/ex with equivalent accuracy (0.980 vs. 0.978), and Fig. 2 shows AWRS typically checks only 2–3 tokens per step.",
    465       "supported": "strong"
    466     },
    467     {
    468       "claim": "AWRS-SMC achieves higher accuracy than existing SMC methods (Sample-Verify, Twisted SMC) while using half the number of particles.",
    469       "evidence": "Table 1 shows AWRS-SMC (M=5) equals or exceeds Sample-Verify and Twisted SMC (M=10) accuracy in 4 of 5 domains, with significant improvements in Goal Inference (0.528 vs. 0.479), JSON (0.898 vs. 0.871), and Molecular Synthesis (0.615 vs. 0.594).",
    470       "supported": "strong"
    471     },
    472     {
    473       "claim": "AWRS provides provably unbiased estimates of the local normalizing constant Z, enabling sound integration into sequential Monte Carlo.",
    474       "evidence": "Propositions 1 and 3 formally prove E[Z-hat] = Z for WRS and AWRS respectively, derived via the RAVI framework (App. B–D); Fig. F.1 empirically validates convergence of MAE to 0 as Monte Carlo samples increase.",
    475       "supported": "strong"
    476     },
    477     {
    478       "claim": "AWRS runtime scales with KL divergence between unconstrained and constrained distributions, making it faster for better (more capable) LMs.",
    479       "evidence": "Proposition 4 establishes the runtime bound O(Σ π_x); Fig. 2 empirically confirms this scaling on pattern matching with three Llama sizes; Fig. L.1 shows AWRS-SMC with Llama 1B outperforms Twisted SMC with Llama 70B at similar runtimes.",
    480       "supported": "strong"
    481     },
    482     {
    483       "claim": "AWRS-SMC with Llama 3.2 1B outperforms Twisted SMC with Llama 3.1 8B and Llama 3.3 70B in accuracy on pattern matching.",
    484       "evidence": "Table 3 shows AWRS-SMC (1B): 0.974 accuracy vs. Twisted SMC (8B): 0.796 and Twisted SMC (70B): 0.846, with favorable runtime comparison in Fig. L.1.",
    485       "supported": "moderate"
    486     }
    487   ],
    488   "methodology_tags": [
    489     "benchmark-eval",
    490     "theoretical"
    491   ],
    492   "key_findings": "AWRS is a Las Vegas algorithm for constrained token sampling that achieves exact samples from the locally constrained distribution while requiring orders of magnitude fewer constraint evaluations than exhaustive token masking. By deriving provably unbiased estimates of the local normalizing constant Z as a byproduct of rejection sampling, AWRS enables sound integration into sequential Monte Carlo for correcting the global distribution bias of locally constrained decoding. Evaluated across five diverse domains (text-to-SQL, JSON, goal inference, molecular synthesis, pattern matching), AWRS-SMC consistently matches or exceeds state-of-the-art accuracy while being dramatically faster than token masking, and outperforms existing SMC methods with half the particles. Crucially, the algorithm's runtime scales with the KL divergence between constrained and unconstrained distributions, meaning better LMs yield faster constrained generation—a property that aligns favorably with LM scaling trends.",
    493   "red_flags": [
    494     {
    495       "flag": "Unequal particle counts in comparison",
    496       "detail": "AWRS-SMC is run with M=5 particles while Sample-Verify and Twisted SMC use M=10 in all main comparisons. While this demonstrates sample efficiency, it complicates direct runtime comparisons since particle count is the primary lever controlling accuracy-runtime tradeoffs."
    497     },
    498     {
    499       "flag": "TM-LCD baseline only on one domain",
    500       "detail": "The >50× speedup claim is computed only on the pattern matching domain, the sole domain where TM-LCD was computationally feasible. The speedup magnitude for other domains (SQL, JSON, molecular synthesis) is unmeasured."
    501     },
    502     {
    503       "flag": "Unoptimized Python implementations",
    504       "detail": "The paper explicitly notes (footnote 5) that all implementations are in 'pure Python and relatively unoptimized,' so the runtime comparisons do not reflect production-quality implementations: they favor AWRS's algorithmic advantage and may understate the gains that engineering optimization could bring to the baselines."
    505     },
    506     {
    507       "flag": "No limitations section",
    508       "detail": "The paper contains no dedicated limitations or threats-to-validity section. Scope constraints (single architecture family, unoptimized code, five curated domains) are scattered across footnotes rather than systematically discussed."
    509     },
    510     {
    511       "flag": "No contamination discussion",
    512       "detail": "The paper evaluates Llama models on Spider SQL and other public benchmarks without discussing whether these datasets appeared in Llama's training data, which could inflate reported accuracy if models have memorized benchmark examples."
    513     }
    514   ],
    515   "cited_papers": [
    516     {
    517       "title": "Sequential Monte Carlo Steering of Large Language Models Using Probabilistic Programs",
    518       "relevance": "Direct predecessor whose SMC framework for controlled LM generation AWRS extends by providing efficient, properly weighted proposals."
    519     },
    520     {
    521       "title": "Syntactic and Semantic Control of Large Language Models via Sequential Monte Carlo",
    522       "relevance": "Contemporary SMC method (Twisted SMC) serving as primary accuracy baseline; AWRS-SMC outperforms it while using fewer particles."
    523     },
    524     {
    525       "title": "Probabilistic Inference in Language Models via Twisted Sequential Monte Carlo",
    526       "relevance": "SMC baseline requiring expensive fine-tuning for twists; contrasted with AWRS's training-free approach."
    527     },
    528     {
    529       "title": "Efficient Guided Generation for Large Language Models",
    530       "relevance": "The Outlines library, which implements the token masking (TM-LCD) baseline; AWRS extends to arbitrary black-box constraints beyond grammar-based approaches."
    531     },
    532     {
    533       "title": "Grammar-Aligned Decoding",
    534       "relevance": "Shows LCD can distort global distributions; motivates the SMC correction that AWRS enables more efficiently."
    535     },
    536     {
    537       "title": "Controllable Generation via Locally Constrained Resampling",
    538       "relevance": "Concurrent work on constrained resampling for LLMs, cited as a related approach to correcting LCD greediness."
    539     },
    540     {
    541       "title": "Generating Structured Outputs from Language Models: Benchmark and Studies",
    542       "relevance": "Provides the JSONSchemaBench dataset used as one of the five evaluation domains."
    543     }
    544   ],
    545   "engagement_factors": {
    546     "practical_relevance": {
    547       "score": 3,
    548       "justification": "Directly addresses constrained generation (SQL, JSON, code) with released code and plug-and-play design compatible with any constraint function."
    549     },
    550     "surprise_contrarian": {
    551       "score": 2,
    552       "justification": "Counterintuitively shows that rejection sampling (which 'wastes' samples) can be dramatically faster than exhaustive token masking, and that smaller LMs with AWRS beat larger LMs with competing methods."
    553     },
    554     "fear_safety": {
    555       "score": 0,
    556       "justification": "No AI safety or risk concerns; the work improves controllability of LMs, which is generally safety-positive."
    557     },
    558     "drama_conflict": {
    559       "score": 1,
    560       "justification": "Challenges the dominant LCD/token-masking paradigm but does so through algorithmic improvement rather than controversy."
    561     },
    562     "demo_ability": {
    563       "score": 2,
    564       "justification": "GitHub repo (genlm/genlm-control) is publicly available and actively maintained, enabling practitioners to try the method."
    565     },
    566     "brand_recognition": {
    567       "score": 2,
    568       "justification": "MIT, ETH Zürich, and McGill/Mila are highly recognized; no industry lab involvement."
    569     }
    570   },
    571   "hn_data": {
    572     "threads": [
    573       {
    574         "hn_id": "35472750",
    575         "title": "A radiation hard RISC-V microprocessor for high-energy physics applications",
    576         "points": 111,
    577         "comments": 46,
    578         "url": "https://news.ycombinator.com/item?id=35472750",
    579         "created_at": "2023-04-06T18:54:30Z"
    580       },
    581       {
    582         "hn_id": "44397503",
    583         "title": "Exploiting Local KV Cache Asymmetry for Long-Context LLMs",
    584         "points": 6,
    585         "comments": 0,
    586         "url": "https://news.ycombinator.com/item?id=44397503",
    587         "created_at": "2025-06-27T15:22:27Z"
    588       },
    589       {
    590         "hn_id": "39976086",
    591         "title": "Physics of Language Models: Part 3.3, Knowledge Capacity Scaling Laws",
    592         "points": 5,
    593         "comments": 0,
    594         "url": "https://news.ycombinator.com/item?id=39976086",
    595         "created_at": "2024-04-09T03:56:53Z"
    596       },
    597       {
    598         "hn_id": "47104697",
    599         "title": "Reasoning Models Fabricate 75% of Their Explanations (ArXiv:2505.05410)",
    600         "points": 4,
    601         "comments": 0,
    602         "url": "https://news.ycombinator.com/item?id=47104697",
    603         "created_at": "2026-02-21T21:01:00Z"
    604       },
    605       {
    606         "hn_id": "44211549",
    607         "title": "Oracular Programming: A Modular Foundation for Building LLM-Enabled Software",
    608         "points": 4,
    609         "comments": 1,
    610         "url": "https://news.ycombinator.com/item?id=44211549",
    611         "created_at": "2025-06-07T18:30:04Z"
    612       },
    613       {
    614         "hn_id": "43975695",
    615         "title": "AWRS SMC: Fast new algorithm for guiding LLMs as Bayesian inference",
    616         "points": 2,
    617         "comments": 0,
    618         "url": "https://news.ycombinator.com/item?id=43975695",
    619         "created_at": "2025-05-13T17:50:54Z"
    620       },
    621       {
    622         "hn_id": "43949744",
    623         "title": "Reasoning Models Don't Always Say What They Think",
    624         "points": 2,
    625         "comments": 0,
    626         "url": "https://news.ycombinator.com/item?id=43949744",
    627         "created_at": "2025-05-10T23:07:01Z"
    628       },
    629       {
    630         "hn_id": "45274922",
    631         "title": "Candidates evoke identity and issues on TikTok",
    632         "points": 2,
    633         "comments": 0,
    634         "url": "https://news.ycombinator.com/item?id=45274922",
    635         "created_at": "2025-09-17T12:15:44Z"
    636       },
    637       {
    638         "hn_id": "44028643",
    639         "title": "Reasoning Models Don't Always Say What They Think",
    640         "points": 1,
    641         "comments": 0,
    642         "url": "https://news.ycombinator.com/item?id=44028643",
    643         "created_at": "2025-05-19T11:29:32Z"
    644       },
    645       {
    646         "hn_id": "43726013",
    647         "title": "Parameter-Efficient Fine-Tuning of LLMs for Personality Detection",
    648         "points": 1,
    649         "comments": 0,
    650         "url": "https://news.ycombinator.com/item?id=43726013",
    651         "created_at": "2025-04-18T08:06:49Z"
    652       }
    653     ],
    654     "top_points": 111,
    655     "total_points": 138,
    656     "total_comments": 47
    657   }
    658 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs