scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (25674B)
      1 {
      2   "paper": {
      3     "title": "Logically Constrained Decoding",
      4     "authors": ["Franklin Ma", "Alan J. Hu"],
      5     "year": 2025,
      6     "venue": "MathNLP 2025 (Workshop at ACL)",
      7     "doi": "10.18653/v1/2025.mathnlp-main.11"
      8   },
      9   "scan_version": 2,
     10   "active_modules": ["experimental_rigor", "data_leakage"],
     11   "methodology_tags": ["benchmark-eval"],
     12   "key_findings": "Logically constrained decoding extends constrained decoding from syntactic to logical constraints, demonstrated on chess and propositional resolution proofs. In chess, unconstrained LLMs made illegal moves in 897/900 games while constrained LLMs made zero illegal moves. For resolution proofs, constrained decoding achieved 100% correct proof generation on pigeonhole problems of size 1-2, where unconstrained models scored 0% on size 2. The technique works across 9 open-source LLMs spanning 5 model families (Qwen2.5, Llama 3.1, Gemma 3, Phi-4, Ministral) with generally minimal latency overhead, although throughput degrades on long size-3 proofs.",
     13   "checklist": {
     14     "artifacts": {
     15       "code_released": {
     16         "applies": true,
     17         "answer": true,
     18         "justification": "GitHub repository URL provided: https://github.com/terwo/logically-constrained-decoding (§4, footnote 8)."
     19       },
     20       "data_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "Pigeonhole problem encodings are fully specified in Appendix B, and chess uses the standard Stockfish engine. The SAT encodings are loaded via PySAT. The data is reproducible from the specification."
     24       },
     25       "environment_specified": {
     26         "applies": true,
     27         "answer": false,
     28         "justification": "Hardware is described (Dell EMC C4140, NVIDIA Tesla V100 16/32GB) and HuggingFace Transformers is mentioned, but no requirements.txt, Dockerfile, or specific library versions are provided in the paper."
     29       },
     30       "reproduction_instructions": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "No step-by-step reproduction instructions are provided in the paper. The GitHub link is given but no README or reproduction guide is described."
     34       }
     35     },
     36     "statistical_methodology": {
     37       "confidence_intervals_or_error_bars": {
     38         "applies": true,
     39         "answer": true,
     40         "justification": "Figure 2 shows whiskers representing 1 standard deviation for chess game results across 20 games per condition."
     41       },
     42       "significance_tests": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "No statistical significance tests are reported. Claims of improvement are based on raw comparisons (e.g., 897/900 vs 0/900 illegal moves, 0% vs 100% proof accuracy) without formal tests."
     46       },
     47       "effect_sizes_reported": {
     48         "applies": true,
     49         "answer": true,
     50         "justification": "Effect sizes are implicitly clear from absolute numbers: 897/900 games ended by illegal move without constraints vs 0/900 with constraints. For proofs, 0% vs 100% success rates. These provide full context for the magnitude of improvement."
     51       },
     52       "sample_size_justified": {
     53         "applies": true,
     54         "answer": false,
     55         "justification": "No justification is given for why 20 games per condition (chess) or 50 trials per condition (proofs; 20 for size 3) were chosen. No power analysis is discussed."
     56       },
     57       "variance_reported": {
     58         "applies": true,
     59         "answer": true,
     60         "justification": "Standard deviation whiskers are shown in Figure 2 for chess results across 20 games. Figure 5 reports average tokens per second."
     61       }
     62     },
     63     "evaluation_design": {
     64       "baselines_included": {
     65         "applies": true,
     66         "answer": true,
     67         "justification": "Each model is compared in unconstrained vs constrained conditions, serving as its own baseline. This is the natural comparison for the proposed technique."
     68       },
     69       "baselines_contemporary": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "Models tested include recent releases: Qwen2.5 (2025), Gemma3 (2025), Phi-4-mini (2025), Llama 3.1 (2024), Ministral-8B (2024). Stockfish 17.1 is the latest version."
     73       },
     74       "ablation_study": {
     75         "applies": false,
     76         "answer": false,
     77         "justification": "The system has one core component (the symbolic constraint engine). The comparison is constrained vs unconstrained, which is the natural evaluation, not an ablation."
     78       },
     79       "multiple_metrics": {
     80         "applies": true,
     81         "answer": true,
     82         "justification": "For chess: number of legal moves before illegal move and game outcomes. For proofs: proof correctness rate. For performance: tokens per second. Multiple metrics across the two domains."
     83       },
     84       "human_evaluation": {
     85         "applies": false,
     86         "answer": false,
     87         "justification": "Human evaluation is irrelevant here — correctness of chess moves and resolution proofs is formally verifiable by machine."
     88       },
     89       "held_out_test_set": {
     90         "applies": false,
     91         "answer": false,
     92         "justification": "Not applicable — the evaluation uses online game play and proof generation, not a static dataset with train/test splits."
     93       },
     94       "per_category_breakdown": {
     95         "applies": true,
     96         "answer": true,
     97         "justification": "Results are broken down per model (9 models), per difficulty level (5 Stockfish settings), and per pigeonhole size (1, 2, 3) in Figures 2-4 and Table 1."
     98       },
     99       "failure_cases_discussed": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "The paper discusses failures: unconstrained models making illegal chess moves (Figure 1 example), ChatGPT-5 making an unsound deduction in resolution proofs, scalability issues with size-3 pigeonhole proofs, and the constraint engine's inability to complete experiments for large proofs."
    103       },
    104       "negative_results_reported": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "The paper reports that the constraint engine for resolution proofs does not scale to larger proofs — throughput drops from 24.7 to 5.5 tokens/sec as proof length grows (§4.3). They also could not complete size-3 experiments without an optimization that relaxes minimal invasiveness."
    108       }
    109     },
    110     "claims_and_evidence": {
    111       "abstract_claims_supported": {
    112         "applies": true,
    113         "answer": true,
    114         "justification": "The abstract claims proof-of-concept implementations for chess and resolution proofs with constrained decoding. These are supported by the experimental results in §4.1-4.3."
    115       },
    116       "causal_claims_justified": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The causal claim is that logically constrained decoding prevents illegal outputs. This is justified by the controlled comparison (same models, same prompts, with/without constraint) and the formal guarantee that the constraint engine blocks illegal tokens."
    120       },
    121       "generalization_bounded": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper explicitly frames results as 'proof-of-concept' and limits claims to chess and propositional resolution. The Limitations section acknowledges they only tested open-source models and smaller sizes, and that results may not directly apply to proprietary frontier models."
    125       },
    126       "alternative_explanations_discussed": {
    127         "applies": true,
    128         "answer": true,
    129         "justification": "The Limitations section discusses that prompt variations could affect results, that constraining output may limit reasoning ability (citing Banerjee et al. 2025), and that the chess experiments did not allow unconstrained thinking which may have affected play quality."
    130       },
    131       "proxy_outcome_distinction": {
    132         "applies": true,
    133         "answer": true,
    134         "justification": "The paper's claims match the granularity of its measurements: it measures illegal moves and proof correctness and claims exactly that — no broader proxy gap exists. It explicitly does not claim to produce a superior chess engine."
    135       }
    136     },
    137     "setup_transparency": {
    138       "model_versions_specified": {
    139         "applies": true,
    140         "answer": true,
    141         "justification": "Specific model names with sizes are given: Qwen2.5-7B-Instruct, Qwen2.5-14B-Instruct, Qwen2.5-32B-Instruct, Llama-3.1-8B-Instruct, Gemma-3-4b-it, Gemma-3-12b-it, Gemma-3-27b-it, Phi-4-mini-instruct, Ministral-8B-Instruct-2410. These are specific enough to identify exact model checkpoints."
    142       },
    143       "prompts_provided": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Full prompts for both chess (Figure 6) and resolution proofs (Figures 7-8) are provided in Appendix A."
    147       },
    148       "hyperparameters_reported": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Appendix D states temperature=1.0 and defaults from HuggingFace Transformers. Max token limits per pigeonhole size are given in Table 2. Precision settings (FP32 for Gemma3, FP16 for others) and quantization (8-bit for Gemma3 and Qwen2.5-32B) are specified."
    152       },
    153       "scaffolding_described": {
    154         "applies": false,
    155         "answer": false,
    156         "justification": "No agentic scaffolding is used. The system is a constrained decoding engine operating at the token level during generation."
    157       },
    158       "data_preprocessing_documented": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The pigeonhole SAT encoding is fully specified in Appendix B with the DIMACS format. Chess setup details are given (Stockfish 17.1, depth-15, 5 difficulty levels, 20 games per condition). Tokenization handling is documented in Appendix C."
    162       }
    163     },
    164     "limitations_and_scope": {
    165       "limitations_section_present": {
    166         "applies": true,
    167         "answer": true,
    168         "justification": "A dedicated 'Limitations' section is present after §5, with substantive discussion spanning multiple paragraphs."
    169       },
    170       "threats_to_validity_specific": {
    171         "applies": true,
    172         "answer": true,
    173         "justification": "Specific threats discussed: prompt notation variations (e.g., using + vs ∨ for OR), limitation to open-source models, limitation to smaller models due to compute constraints, chess experiments not allowing unconstrained thinking tokens, resolution implementation not scaling to larger proofs."
    174       },
    175       "scope_boundaries_stated": {
    176         "applies": true,
    177         "answer": true,
    178         "justification": "The paper explicitly states scope boundaries: 'We can modify and perform experiments only on open-source language models, so it is unclear to what degree our results can be applied to proprietary, frontier models.' Also: 'we were limited by our available computing resources to using smaller models.' The paper frames results as proof-of-concept, not general claims."
    179       }
    180     },
    181     "data_integrity": {
    182       "raw_data_available": {
    183         "applies": true,
    184         "answer": false,
    185         "justification": "No raw experimental data (game logs, proof transcripts, per-trial results) is made available. Only aggregate results in figures and tables."
    186       },
    187       "data_collection_described": {
    188         "applies": true,
    189         "answer": true,
    190         "justification": "The data collection procedure is well-described: chess games against Stockfish 17.1 at 5 difficulty levels, 20 games per condition (10 as White, 10 as Black), maximum 10 tokens per move. Resolution proofs: 50 trials per condition for sizes 1-2, 20 for size 3."
    191       },
    192       "recruitment_methods_described": {
    193         "applies": false,
    194         "answer": false,
    195         "justification": "No human participants. Data source is standard benchmark problems (pigeonhole) and chess engine play."
    196       },
    197       "data_pipeline_documented": {
    198         "applies": true,
    199         "answer": true,
    200         "justification": "The pipeline is straightforward and documented: generate move/proof step with LLM → check legality → update world model state → continue. The constraint engine's operation is described in §3 with pseudocode in §2.3."
    201       }
    202     },
    203     "conflicts_of_interest": {
    204       "funding_disclosed": {
    205         "applies": true,
    206         "answer": true,
    207         "justification": "Acknowledgements section states: 'This research was funded through an Undergraduate Summer Research Award and a Discovery Grant, both from the Natural Sciences and Engineering Research Council of Canada (NSERC).' Also mentions UBC Advanced Research Computing."
    208       },
    209       "affiliations_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Authors are from Department of Computer Science, University of British Columbia. No commercial product is being evaluated, so there is no affiliation conflict."
    213       },
    214       "funder_independent_of_outcome": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "NSERC is a government funding agency with no financial interest in the outcome of this research."
    218       },
    219       "financial_interests_declared": {
    220         "applies": true,
    221         "answer": false,
    222         "justification": "No competing interests or financial interests statement is present in the paper."
    223       }
    224     },
    225     "contamination": {
    226       "training_cutoff_stated": {
    227         "applies": true,
    228         "answer": false,
    229         "justification": "No training data cutoff dates are stated for any of the 9 models used."
    230       },
    231       "train_test_overlap_discussed": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "No discussion of whether chess game patterns or pigeonhole proofs appeared in training data. Chess games and SAT problems are widely available online."
    235       },
    236       "benchmark_contamination_addressed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "The pigeonhole problem is a classic well-known problem. No discussion of whether models may have seen pigeonhole resolution proofs during training. However, the constrained condition guarantees correctness regardless of contamination, partially mitigating this concern."
    240       }
    241     },
    242     "human_studies": {
    243       "pre_registered": {
    244         "applies": false,
    245         "answer": false,
    246         "justification": "No human participants in this study."
    247       },
    248       "irb_or_ethics_approval": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "demographics_reported": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "inclusion_exclusion_criteria": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "randomization_described": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "blinding_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "attrition_reported": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       }
    278     },
    279     "cost_and_practicality": {
    280       "inference_cost_reported": {
    281         "applies": true,
    282         "answer": true,
    283         "justification": "Tokens per second throughput is reported in Figure 5 for both constrained and unconstrained conditions, showing latency overhead. Specific numbers given: 24.7 tokens/sec for shorter proofs down to 5.5 for longer ones."
    284       },
    285       "compute_budget_stated": {
    286         "applies": true,
    287         "answer": false,
    288         "justification": "Hardware is described (Dell EMC C4140, NVIDIA Tesla V100) but total compute budget (GPU hours, total experiment time) is not quantified."
    289       }
    290     },
    291     "experimental_rigor": {
    292       "seed_sensitivity_reported": {
    293         "applies": true,
    294         "answer": true,
    295         "justification": "Results are reported across multiple trials with variance: 20 games per chess condition with standard deviation whiskers (Figure 2), and 50 trials per proof condition (Figures 3-4) showing success rates."
    296       },
    297       "number_of_runs_stated": {
    298         "applies": true,
    299         "answer": true,
    300         "justification": "Explicitly stated: 20 games per condition for chess (10 as White, 10 as Black), 50 iterations for pigeonhole sizes 1-2, 20 for size 3."
    301       },
    302       "hyperparameter_search_budget": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "No hyperparameter search is described. Temperature is set to 1.0 with defaults, but no justification for why these settings were chosen or whether alternatives were explored."
    306       },
    307       "best_config_selection_justified": {
    308         "applies": false,
    309         "answer": false,
    310         "justification": "No configuration selection is needed — the paper uses default settings for all models and reports results for all of them."
    311       },
    312       "multiple_comparison_correction": {
    313         "applies": false,
    314         "answer": false,
    315         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    316       },
    317       "self_comparison_bias_addressed": {
    318         "applies": true,
    319         "answer": false,
    320         "justification": "The authors implement both the constraint engine and the evaluation. No discussion of potential bias from evaluating their own system, though the formal nature of the domains (legal chess moves, valid resolvents) largely eliminates subjective evaluation bias."
    321       },
    322       "compute_budget_vs_performance": {
    323         "applies": true,
    324         "answer": true,
    325         "justification": "Figure 5 directly shows the performance cost of constrained vs unconstrained decoding in tokens/sec. The paper also discusses how throughput degrades with proof length (24.7 to 5.5 tokens/sec)."
    326       },
    327       "benchmark_construct_validity": {
    328         "applies": true,
    329         "answer": true,
    330         "justification": "The paper explicitly discusses what the benchmarks test and what they don't: chess is used as a 'Drosophila' model organism, not to build a chess engine. The pigeonhole problem is chosen as a well-known hard proof problem. The paper frames these as proof-of-concept, not claims about general reasoning."
    331       },
    332       "scaffold_confound_addressed": {
    333         "applies": false,
    334         "answer": false,
    335         "justification": "No scaffolding is involved — the technique operates at the token decoding level."
    336       }
    337     },
    338     "data_leakage": {
    339       "temporal_leakage_addressed": {
    340         "applies": true,
    341         "answer": false,
    342         "justification": "No discussion of whether pigeonhole proofs or chess strategies appeared in training data before model training cutoffs."
    343       },
    344       "feature_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "No discussion of whether the prompt format or problem encoding leaks information to the models."
    348       },
    349       "non_independence_addressed": {
    350         "applies": false,
    351         "answer": false,
    352         "justification": "Not applicable — the evaluation uses generated games and proof attempts, not a static dataset where train/test independence is a concern."
    353       },
    354       "leakage_detection_method": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No leakage detection or prevention method is applied. The paper does not check whether models have memorized pigeonhole proofs."
    358       }
    359     }
    360   },
    361   "claims": [
    362     {
    363       "claim": "Logically constrained decoding eliminates illegal moves in LLM chess play across all 9 models tested.",
    364       "evidence": "Table 1: 897/900 unconstrained games ended by illegal move vs 0/900 constrained games. Tested across 9 LLMs × 5 difficulty levels × 20 games = 900 per condition.",
    365       "supported": "strong"
    366     },
    367     {
    368       "claim": "With logically constrained decoding, all 9 LLMs achieve 100% correct resolution proofs for pigeonhole problems of size 1 and 2.",
    369       "evidence": "Figures 3 and 4: constrained models achieve 100% accuracy across 50 trials for sizes 1 and 2, while unconstrained models score 0% on size 2.",
    370       "supported": "strong"
    371     },
    372     {
    373       "claim": "Small constrained LLMs can solve problems that frontier models (ChatGPT-5, Claude Sonnet 4) solve incorrectly.",
    374       "evidence": "§4.2: Qwen2.5-7B with constrained decoding completes size-3 pigeonhole proof correctly, while ChatGPT-5 makes an unsound deduction and Claude Sonnet 4 fails.",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "The latency overhead of logically constrained decoding is generally minimal.",
    379       "evidence": "Figure 5: tokens/sec comparison for Qwen2.5-7B shows modest overhead for sizes 1-2, though significant slowdown for size-3 proofs (24.7→5.5 tokens/sec for long proofs).",
    380       "supported": "moderate"
    381     }
    382   ],
    383   "red_flags": [
    384     {
    385       "flag": "No contamination analysis for well-known problems",
    386       "detail": "Pigeonhole problems and their resolution proofs are classic textbook material widely available online. LLMs may have memorized these proofs during training. The unconstrained baseline performance may understate what models could do with better prompting, and the constrained performance may benefit from memorized proof strategies. This doesn't undermine the correctness guarantee, but does affect the generalizability claim."
    387     },
    388     {
    389       "flag": "Frontier model comparisons are informal",
    390       "detail": "The comparisons to ChatGPT-5 and Claude Sonnet 4 are described as informal ('We do not have the resources to do extensive experiments on these models'). The chess game in Figure 1 is a single casual game. These anecdotal comparisons are presented prominently but lack the rigor of the main experiments."
    391     },
    392     {
    393       "flag": "Incomplete experiments for size-3 pigeonhole",
    394       "detail": "The constrained experiments for the size-3 pigeonhole problem could not be completed without an optimization that relaxes the 'minimal invasiveness' property. The paper acknowledges this but still presents the result as evidence of the technique's success."
    395     }
    396   ],
    397   "cited_papers": [
    398     {
    399       "title": "CRANE: Reasoning with constrained LLM generation",
    400       "authors": ["Debangshu Banerjee", "Tarun Suresh", "Shubham Ugare", "Sasa Misailovic", "Gagandeep Singh"],
    401       "year": 2025,
    402       "relevance": "Directly relevant: shows that strictly constraining LLM outputs can reduce reasoning ability, motivating the design of logically constrained decoding."
    403     },
    404     {
    405       "title": "Prompting is programming: A query language for large language models",
    406       "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"],
    407       "year": 2023,
    408       "relevance": "Foundational work on constrained decoding for regular expressions and context-free grammars."
    409     },
    410     {
    411       "title": "Guiding LLMs the right way: Fast, non-invasive constrained generation",
    412       "authors": ["Luca Beurer-Kellner", "Marc Fischer", "Martin Vechev"],
    413       "year": 2024,
    414       "relevance": "Key prior work on efficient, minimally-invasive constrained decoding techniques."
    415     },
    416     {
    417       "title": "Type-constrained code generation with language models",
    418       "authors": ["Niels Mündler", "Jingxuan He", "Hao Wang", "Koushik Sen", "Dawn Song", "Martin Vechev"],
    419       "year": 2025,
    420       "relevance": "Extends constrained decoding beyond syntax to type safety for code generation — philosophically aligned work."
    421     },
    422     {
    423       "title": "Can transformers reason logically? A study in SAT solving",
    424       "authors": ["Leyan Pan", "Vijay Ganesh", "Jacob Abernethy", "Chris Esposo", "Wenke Lee"],
    425       "year": 2025,
    426       "relevance": "Shows that empirically trained transformers for SAT generalize and scale poorly, motivating neuro-symbolic approaches."
    427     },
    428     {
    429       "title": "Syncode: LLM generation with grammar augmentation",
    430       "authors": ["Shubham Ugare", "Tarun Suresh", "Hangoo Kang", "Sasa Misailovic", "Gagandeep Singh"],
    431       "year": 2024,
    432       "arxiv_id": "2403.01632",
    433       "relevance": "Constrained decoding system using grammar augmentation for LLM output — key baseline approach."
    434     },
    435     {
    436       "title": "Efficient guided generation for large language models",
    437       "authors": ["Brandon T. Willard", "Rémi Louf"],
    438       "year": 2023,
    439       "arxiv_id": "2307.09702",
    440       "relevance": "Foundational work on efficient constrained generation supporting both regex and CFG constraints."
    441     },
    442     {
    443       "title": "Chain-of-thought prompting elicits reasoning in large language models",
    444       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    445       "year": 2023,
    446       "arxiv_id": "2201.11903",
    447       "relevance": "Chain-of-thought prompting as an alternative approach to improving LLM reasoning on complex tasks."
    448     },
    449     {
    450       "title": "Reinforcement learning for reasoning in large language models with one training example",
    451       "authors": ["Yiping Wang"],
    452       "year": 2025,
    453       "arxiv_id": "2504.20571",
    454       "relevance": "RL with verified rewards as an alternative technique for boosting LLM reasoning performance."
    455     },
    456     {
    457       "title": "Lost in space: Finding the right tokens for structured output",
    458       "authors": ["Sil Hamilton", "David Mimno"],
    459       "year": 2025,
    460       "arxiv_id": "2502.14969",
    461       "relevance": "Addresses token misalignment in constrained decoding — a key implementation challenge."
    462     }
    463   ]
    464 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs