scan.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan.json (28570B)
      1 {
      2   "paper": {
      3     "title": "Prompting Is Programming: A Query Language for Large Language Models",
      4     "authors": [
      5       "Luca Beurer-Kellner",
      6       "Marc Fischer",
      7       "Martin Vechev"
      8     ],
      9     "year": 2023,
     10     "venue": "Proc. ACM Program. Lang.",
     11     "arxiv_id": "2212.06094",
     12     "doi": "10.1145/3591300"
     13   },
     14   "scan_version": 2,
     15   "active_modules": ["experimental_rigor", "data_leakage"],
     16   "methodology_tags": ["benchmark-eval", "theoretical"],
     17   "key_findings": "LMQL is a high-level query language for LLMs that combines text prompting with scripting and declarative output constraints. A formal model of eager partial evaluation semantics enables automatic token mask generation during decoding, reducing inference cost by 26-85% (measured in billable tokens) while maintaining or slightly improving task accuracy across chain-of-thought, interactive, and arithmetic prompting scenarios. The language requires 63-77% fewer lines of code than equivalent Python baseline implementations.",
     18   "checklist": {
     19     "artifacts": {
     20       "code_released": {
     21         "applies": true,
     22         "answer": true,
     23         "justification": "GitHub repository (https://github.com/eth-sri/lmql), Zenodo artifact (DOI: 10.5281/zenodo.7711823), and a live playground at https://lmql.ai are all provided in the 'Further Resources' section."
     24       },
     25       "data_released": {
     26         "applies": true,
     27         "answer": true,
     28         "justification": "The paper uses publicly available standard benchmarks: BIG benchmark (Odd One Out, Date Understanding), HotpotQA, and GSM8K, all cited with references. The Zenodo artifact includes the evaluated materials."
     29       },
     30       "environment_specified": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper states 'Nvidia A100 GPU with 40GB/80GB VRAM' and 'HuggingFace transformers library with pytorch on the backend' (§6 Experimental Setup) but provides no library version numbers, requirements.txt, or Dockerfile."
     34       },
     35       "reproduction_instructions": {
     36         "applies": true,
     37         "answer": true,
     38         "justification": "A formal Zenodo research artifact [3] ('PLDI'23 Research Artifacts v0.7') is released alongside the GitHub repository and project webpage with live demonstration, providing the standard PL conference artifact package for reproduction."
     39       }
     40     },
     41     "statistical_methodology": {
     42       "confidence_intervals_or_error_bars": {
     43         "applies": true,
     44         "answer": false,
     45         "justification": "Tables 3 and 5 report only point estimates (e.g., '33.33% accuracy', '73.04 model queries') with no confidence intervals or error bars."
     46       },
     47       "significance_tests": {
     48         "applies": true,
     49         "answer": false,
     50         "justification": "The paper claims LMQL reduces cost and maintains accuracy compared to standard decoding but reports no statistical significance tests. Comparisons are made by directly comparing numbers (e.g., '43.16% reduction')."
     51       },
     52       "effect_sizes_reported": {
     53         "applies": true,
     54         "answer": true,
     55         "justification": "Tables 3 and 5 report relative improvements with baseline context: e.g., model queries reduced from 73.04 to 41.51 (Δ -43.16%), billable tokens from 1178.71 to 861.32 (Δ -26.93%). This provides enough context to assess magnitude."
     56       },
     57       "sample_size_justified": {
     58         "applies": true,
     59         "answer": false,
     60         "justification": "The paper does not state how many examples from each benchmark were used for evaluation, nor does it provide any justification for sample sizes or power analysis."
     61       },
     62       "variance_reported": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "All results appear to be single-run measurements. No standard deviation, variance across seeds, or spread measures are reported in any table."
     66       }
     67     },
     68     "evaluation_design": {
     69       "baselines_included": {
     70         "applies": true,
     71         "answer": true,
     72         "justification": "All three case studies compare LMQL against 'Standard Decoding' using the HuggingFace generate() API baseline, which is described as 'a simple generate() API as e.g. provided by the HuggingFace Transformers package' (§6)."
     73       },
     74       "baselines_contemporary": {
     75         "applies": true,
     76         "answer": true,
     77         "justification": "The generate() API from HuggingFace Transformers is the standard high-level interface for LM interaction. The paper notes this 'reflects the current state of comparatively high-level LM APIs' (§6). OpenAI API is also discussed."
     78       },
     79       "ablation_study": {
     80         "applies": true,
     81         "answer": false,
     82         "justification": "LMQL has multiple components (scripted prompting, constraint decoding, eager evaluation, FollowMaps) but no ablation study isolates the contribution of individual components. The comparison is only LMQL-with-constraints vs. unconstrained baseline."
     83       },
     84       "multiple_metrics": {
     85         "applies": true,
     86         "answer": true,
     87         "justification": "The paper evaluates using five metrics: accuracy, decoder calls, model queries, billable tokens, and lines of code (LOC). All are reported across case studies."
     88       },
     89       "human_evaluation": {
     90         "applies": true,
     91         "answer": false,
     92         "justification": "No human evaluation is performed. The paper explicitly acknowledges this: 'our case studies cannot replace a full user study of LMQL, assessing its impact and usability together with real-world prompt engineers' (§6.4)."
     93       },
     94       "held_out_test_set": {
     95         "applies": true,
     96         "answer": false,
     97         "justification": "The paper does not explicitly state which dataset splits (train/dev/test) are used for evaluation. Few-shot examples appear hand-crafted or selected, but no explicit separation of development and evaluation data is documented."
     98       },
     99       "per_category_breakdown": {
    100         "applies": true,
    101         "answer": true,
    102         "justification": "Results are broken down by task (Odd One Out, Date Understanding in Table 3; ReAct and Arithmetic in Table 5) and by model (GPT-J-6B, OPT-30B in Table 3). Fig. 12 shows chunk-size breakdowns."
    103       },
    104       "failure_cases_discussed": {
    105         "applies": true,
    106         "answer": true,
    107         "justification": "Fig. 4(b) shows problematic LM completions (running-on text, irrelevant tangents). §6.3 acknowledges 'GPT-J 6B is not able to solve the problem correctly' in the arithmetic case study. These illustrate failure modes addressed by LMQL."
    108       },
    109       "negative_results_reported": {
    110         "applies": true,
    111         "answer": false,
    112         "justification": "Every experiment shows LMQL matching or improving on accuracy while reducing cost. No configurations that degraded performance, failed approaches, or unsuccessful design decisions are reported."
    113       }
    114     },
    115     "claims_and_evidence": {
    116       "abstract_claims_supported": {
    117         "applies": true,
    118         "answer": true,
    119         "justification": "The abstract claims '26-85% cost savings' and 'retain or increase accuracy.' Tables 3 and 5 show billable token reductions of 26.93% to 84.93% and accuracy that is equal or slightly improved, supporting these claims."
    120       },
    121       "causal_claims_justified": {
    122         "applies": true,
    123         "answer": true,
    124         "justification": "The paper's causal claims (e.g., LMQL's constraint decoding enables cost reduction) are supported by controlled comparisons: same models, same prompts, same benchmarks, differing only in decoding approach. The formal model (§5) provides a mechanistic explanation."
    125       },
    126       "generalization_bounded": {
    127         "applies": true,
    128         "answer": false,
    129         "justification": "The paper title claims 'A Query Language for Large Language Models' generically, but evaluation is limited to GPT-2-XL (1.5B), GPT-J (6B), and OPT-30B with GPT-3.5 as a brief 'control experiment.' Only three task types are evaluated. No discussion of where LMQL might not apply."
    130       },
    131       "alternative_explanations_discussed": {
    132         "applies": true,
    133         "answer": false,
    134         "justification": "No alternative explanations are discussed for the observed cost reductions or accuracy differences. For example, the slight accuracy improvement on Odd One Out is attributed to constraints guiding REASONING, but confounds (e.g., different stopping conditions affecting output length) are not explored."
    135       },
    136       "proxy_outcome_distinction": {
    137         "applies": true,
    138         "answer": true,
    139         "justification": "The paper's claims closely match its measurements. Cost savings are measured directly via billable tokens with explicit pricing. 'Conciseness' is measured via LOC with the caveat stated: 'As a measure of conciseness we count the number of functional lines of code' (§6). The absence of a user study for usability claims is explicitly acknowledged."
    140       }
    141     },
    142     "setup_transparency": {
    143       "model_versions_specified": {
    144         "applies": true,
    145         "answer": true,
    146         "justification": "Specific model identifiers are provided: 'GPT-J 6B' [27], 'OPT-30B' [34], 'gpt2-xl' (1.5B), and 'text-davinci-003' for the GPT-3.5 control. For open-source models, these names uniquely identify the model weights."
    147       },
    148       "prompts_provided": {
    149         "applies": true,
    150         "answer": true,
    151         "justification": "Full LMQL queries including complete prompt text are provided in Figs. 1, 10, 11, and 13. These include the few-shot examples, constraint specifications, and control flow — sufficient to reconstruct every prompt sent to the model."
    152       },
    153       "hyperparameters_reported": {
    154         "applies": true,
    155         "answer": false,
    156         "justification": "Decoding methods are specified in the LMQL queries (argmax, beam(n=3), sample(no_repeat_ngram_size=3)), but temperature is not stated for sampling. Max tokens is only specified in one query (max_length=2048 in Fig. 13). No systematic hyperparameter table is provided."
    157       },
    158       "scaffolding_described": {
    159         "applies": true,
    160         "answer": true,
    161         "justification": "The ReAct implementation (Fig. 11) shows the full scaffolding: interpretation loop, Wikipedia tool calls, action parsing, and stopping conditions. The arithmetic evaluation (Fig. 13) shows the calculator integration. The LMQL code IS the scaffold description."
    162       },
    163       "data_preprocessing_documented": {
    164         "applies": true,
    165         "answer": false,
    166         "justification": "No data preprocessing steps are documented. The paper uses standard benchmarks but does not describe how examples were selected, how many were used, or any filtering or transformation applied to benchmark data before evaluation."
    167       }
    168     },
    169     "limitations_and_scope": {
    170       "limitations_section_present": {
    171         "applies": true,
    172         "answer": false,
    173         "justification": "There is no dedicated limitations or threats-to-validity section. A brief acknowledgment about the lack of a user study appears in §6.4 Discussion ('we note that our case studies cannot replace a full user study'), but this is two sentences, not a substantive section."
    174       },
    175       "threats_to_validity_specific": {
    176         "applies": true,
    177         "answer": false,
    178         "justification": "The only specific threat mentioned is the absence of a user study for usability claims (§6.4). Other threats — limited model sizes tested, few benchmarks, single-run results, no adversarial constraint examples — are not discussed."
    179       },
    180       "scope_boundaries_stated": {
    181         "applies": true,
    182         "answer": false,
    183         "justification": "The paper does not explicitly state scope boundaries. It does not discuss what types of prompting tasks LMQL cannot handle, what models it has not been tested on, or where its efficiency gains might not hold."
    184       }
    185     },
    186     "data_integrity": {
    187       "raw_data_available": {
    188         "applies": true,
    189         "answer": false,
    190         "justification": "The Zenodo artifact is released but the paper does not explicitly state that raw experimental data (individual query results, per-example breakdowns) is included. Only aggregated statistics are reported in the paper."
    191       },
    192       "data_collection_described": {
    193         "applies": true,
    194         "answer": true,
    195         "justification": "Data sources are clearly identified: 'tasks relating to general and date understanding [25], question answering [32] and arithmetic math [8]' (§6). The benchmarks (BIG benchmark, HotpotQA, GSM8K) are standard public datasets with well-documented collection procedures."
    196       },
    197       "recruitment_methods_described": {
    198         "applies": false,
    199         "answer": false,
    200         "justification": "No human participants. Data sources are standard public benchmarks."
    201       },
    202       "data_pipeline_documented": {
    203         "applies": true,
    204         "answer": false,
    205         "justification": "The pipeline from benchmark data to final results is not documented. The number of examples evaluated, any filtering steps, and the exact procedure from raw benchmark to reported averages are not described."
    206       }
    207     },
    208     "conflicts_of_interest": {
    209       "funding_disclosed": {
    210         "applies": true,
    211         "answer": true,
    212         "justification": "Funding is disclosed in the Acknowledgements: 'This work has received funding from the Swiss State Secretariat for Education, Research and Innovation (SERI) (SERI-funded ERC Consolidator Grant).'"
    213       },
    214       "affiliations_disclosed": {
    215         "applies": true,
    216         "answer": true,
    217         "justification": "Author affiliations are clearly stated: all three authors are from ETH Zurich, Switzerland. No product being evaluated is affiliated with the authors."
    218       },
    219       "funder_independent_of_outcome": {
    220         "applies": true,
    221         "answer": true,
    222         "justification": "The funder (SERI/ERC) is a government research funding body with no financial interest in LMQL's success or in any of the language models evaluated."
    223       },
    224       "financial_interests_declared": {
    225         "applies": true,
    226         "answer": false,
    227         "justification": "No competing interests or financial interests statement is included in the paper."
    228       }
    229     },
    230     "contamination": {
    231       "training_cutoff_stated": {
    232         "applies": true,
    233         "answer": false,
    234         "justification": "The paper evaluates GPT-J-6B, OPT-30B, GPT-2-XL, and text-davinci-003 on benchmarks but does not state training data cutoff dates for any of these models."
    235       },
    236       "train_test_overlap_discussed": {
    237         "applies": true,
    238         "answer": false,
    239         "justification": "No discussion of whether benchmark examples (BIG, HotpotQA, GSM8K) appeared in the training data of the evaluated models."
    240       },
    241       "benchmark_contamination_addressed": {
    242         "applies": true,
    243         "answer": false,
    244         "justification": "HotpotQA (2018), GSM8K (2021), and BIG benchmark tasks (2022) predate or overlap with the training data collection of the evaluated models (e.g., GPT-J and OPT-30B), so contamination is possible. Contamination risk is not discussed."
    245       }
    246     },
    247     "human_studies": {
    248       "pre_registered": {
    249         "applies": false,
    250         "answer": false,
    251         "justification": "No human participants in this study."
    252       },
    253       "irb_or_ethics_approval": {
    254         "applies": false,
    255         "answer": false,
    256         "justification": "No human participants in this study."
    257       },
    258       "demographics_reported": {
    259         "applies": false,
    260         "answer": false,
    261         "justification": "No human participants in this study."
    262       },
    263       "inclusion_exclusion_criteria": {
    264         "applies": false,
    265         "answer": false,
    266         "justification": "No human participants in this study."
    267       },
    268       "randomization_described": {
    269         "applies": false,
    270         "answer": false,
    271         "justification": "No human participants in this study."
    272       },
    273       "blinding_described": {
    274         "applies": false,
    275         "answer": false,
    276         "justification": "No human participants in this study."
    277       },
    278       "attrition_reported": {
    279         "applies": false,
    280         "answer": false,
    281         "justification": "No human participants in this study."
    282       }
    283     },
    284     "cost_and_practicality": {
    285       "inference_cost_reported": {
    286         "applies": true,
    287         "answer": true,
    288         "justification": "Inference cost is a primary metric. Tables 3 and 5 report model queries, decoder calls, billable tokens, and estimated monetary cost per query (e.g., '0.63¢/query', '5.2¢/query', '6.2¢/query') based on GPT-3 davinci pricing."
    289       },
    290       "compute_budget_stated": {
    291         "applies": true,
    292         "answer": false,
    293         "justification": "Hardware is mentioned ('Nvidia A100 GPU with 40GB/80GB VRAM') but total GPU hours, wall-clock time for experiments, or total computational budget are not stated."
    294       }
    295     },
    296     "experimental_rigor": {
    297       "seed_sensitivity_reported": {
    298         "applies": true,
    299         "answer": false,
    300         "justification": "No mention of random seeds or seed sensitivity analysis. Results appear to be single-run measurements."
    301       },
    302       "number_of_runs_stated": {
    303         "applies": true,
    304         "answer": false,
    305         "justification": "The number of experimental runs producing the reported results is not stated. Tables report 'average performance statistics (over queries)' but this refers to averaging across dataset examples, not multiple runs."
    306       },
    307       "hyperparameter_search_budget": {
    308         "applies": true,
    309         "answer": false,
    310         "justification": "No hyperparameter search is reported. The choice of constraint formulations, stopping conditions, and chunk sizes for baselines appears ad-hoc without documenting alternatives tried."
    311       },
    312       "best_config_selection_justified": {
    313         "applies": true,
    314         "answer": false,
    315         "justification": "The baseline chunk size of 30 is chosen because it 'minimizes the number of billable tokens, while not issuing exceedingly many model queries' (§6.2), but no systematic search justifies this. LMQL constraint configurations are not justified."
    316       },
    317       "multiple_comparison_correction": {
    318         "applies": false,
    319         "answer": false,
    320         "justification": "No statistical tests are performed, so multiple comparison correction is not applicable."
    321       },
    322       "self_comparison_bias_addressed": {
    323         "applies": true,
    324         "answer": false,
    325         "justification": "The authors implement both LMQL and the Python baseline. The baseline implementation choices (chunk-wise decoding, manual parsing) directly affect the comparison. No acknowledgment of author-evaluation bias."
    326       },
    327       "compute_budget_vs_performance": {
    328         "applies": true,
    329         "answer": false,
    330         "justification": "While the paper reports cost metrics alongside accuracy, it does not compare performance at matched compute budgets. LMQL uses fewer model queries but the overhead of constraint evaluation, mask computation, and runtime execution is not quantified."
    331       },
    332       "benchmark_construct_validity": {
    333         "applies": true,
    334         "answer": false,
    335         "justification": "The paper uses BIG benchmark tasks, HotpotQA, and GSM8K without discussing whether these benchmarks adequately measure the claimed capabilities of LMQL (expressiveness, efficiency for general LM programming)."
    336       },
    337       "scaffold_confound_addressed": {
    338         "applies": false,
    339         "answer": false,
    340         "justification": "LMQL itself is the tool being evaluated. The comparison (LMQL vs. standard decoding) IS testing the scaffold/tool difference, so there is no confound to address — the scaffold IS the independent variable."
    341       }
    342     },
    343     "data_leakage": {
    344       "temporal_leakage_addressed": {
    345         "applies": true,
    346         "answer": false,
    347         "justification": "The benchmarks (HotpotQA 2018, GSM8K 2021, BIG 2022) predate or overlap with the training data of the evaluated models. No discussion of temporal leakage."
    348       },
    349       "feature_leakage_addressed": {
    350         "applies": true,
    351         "answer": false,
    352         "justification": "No discussion of whether the evaluation setup leaks information. For few-shot prompting, the choice and ordering of examples could interact with model training data, but this is not addressed."
    353       },
    354       "non_independence_addressed": {
    355         "applies": true,
    356         "answer": false,
    357         "justification": "No discussion of whether benchmark examples share structural similarities with training data or with each other."
    358       },
    359       "leakage_detection_method": {
    360         "applies": true,
    361         "answer": false,
    362         "justification": "No leakage detection or prevention method is applied. No canary strings, membership inference, or temporal splitting."
    363       }
    364     }
    365   },
    366   "claims": [
    367     {
    368       "claim": "LMQL reduces the number of billable tokens by 26-85% compared to standard decoding while retaining or improving accuracy.",
    369       "evidence": "Tables 3 and 5 show: Chain-of-thought on Odd One Out/Date Understanding: 27-31% billable token reduction with same or +1.19% accuracy (§6.1). ReAct: 76% reduction (§6.2). Arithmetic: 85% reduction (§6.3). Accuracy is maintained or slightly improved in all cases.",
    370       "supported": "moderate"
    371     },
    372     {
    373       "claim": "LMQL can express state-of-the-art prompting methods in significantly fewer lines of code than Python implementations.",
    374       "evidence": "Table 4 shows LOC comparison: Odd One Out 9 vs 34 (74% reduction), Date Understanding 13 vs 38 (66%), Arithmetic 22 vs 59 (63%), ReAct 18 vs 78 (77%).",
    375       "supported": "moderate"
    376     },
    377     {
    378       "claim": "Eager partial evaluation semantics with FollowMaps enable sound automatic token mask generation during decoding.",
    379       "evidence": "Theorem 5.1 provides a formal proof of Brzozowski soundness: FollowMaps guarantee that no valid tokens are masked out (i.e., T_Q ⊆ M). Full proof in §5.2.",
    380       "supported": "strong"
    381     },
    382     {
    383       "claim": "LMQL's constrained decoding can improve accuracy by guiding the model's reasoning steps.",
    384       "evidence": "Table 3 shows +1.19% accuracy improvement on Odd One Out with GPT-J. Manual inspection reveals constraints on REASONING variable guide the model's output (§6.1). No improvement observed with OPT-30B or on Date Understanding.",
    385       "supported": "weak"
    386     },
    387     {
    388       "claim": "LMQL reduces decoder calls by 80-86% for interactive prompting and arithmetic evaluation.",
    389       "evidence": "Table 5: ReAct decoder calls reduced from 5 to 1 (-80%), arithmetic from 7 to 1 (-86%). This is because LMQL validates on-the-fly in a single decoding run rather than requiring chunk-wise generation.",
    390       "supported": "moderate"
    391     }
    392   ],
    393   "red_flags": [
    394     {
    395       "flag": "No error bars or variance",
    396       "detail": "All results are reported as single-point estimates without confidence intervals, error bars, standard deviations, or multiple-run statistics. It is impossible to assess result stability or whether observed differences are statistically meaningful."
    397     },
    398     {
    399       "flag": "Self-comparison bias",
    400       "detail": "The authors implement both LMQL and the Python baseline. The baseline is designed as a 'standard generate() API' implementation with chunk-wise decoding. Implementation choices for the baseline (e.g., chunk size, parsing logic) directly affect the magnitude of reported improvements, and no independent evaluation was conducted."
    401     },
    402     {
    403       "flag": "Limited model and task scope",
    404       "detail": "Evaluation is limited to small/medium models (GPT-2-XL 1.5B, GPT-J 6B, OPT-30B) and three task types with one or a few examples per task (arithmetic case study shows only one GSM8K problem). GPT-3.5 is tested only as a brief 'control experiment.' Claims of general applicability are not well-supported."
    405     },
    406     {
    407       "flag": "No user study for usability claims",
    408       "detail": "The paper claims LMQL provides 'intuitive' and 'concise' interaction but evaluates usability only through LOC counts. The authors acknowledge this: 'our case studies cannot replace a full user study of LMQL' (§6.4)."
    409     },
    410     {
    411       "flag": "Sample sizes not reported",
    412       "detail": "The number of benchmark examples evaluated is not stated for any case study. Tables report 'average performance statistics (over queries)' without specifying how many queries were averaged."
    413     }
    414   ],
    415   "cited_papers": [
    416     {
    417       "title": "Language Models are Few-Shot Learners",
    418       "authors": ["Tom B. Brown", "Benjamin Mann", "Nick Ryder"],
    419       "year": 2020,
    420       "relevance": "Foundational paper on GPT-3 and few-shot prompting, which LMQL builds on and extends."
    421     },
    422     {
    423       "title": "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models",
    424       "authors": ["Jason Wei", "Xuezhi Wang", "Dale Schuurmans"],
    425       "year": 2023,
    426       "arxiv_id": "2201.11903",
    427       "relevance": "Key prompting technique that LMQL implements as a case study, demonstrating constrained chain-of-thought decoding."
    428     },
    429     {
    430       "title": "ReAct: Synergizing Reasoning and Acting in Language Models",
    431       "authors": ["Shunyu Yao", "Jeffrey Zhao", "Dian Yu"],
    432       "year": 2023,
    433       "arxiv_id": "2210.03629",
    434       "relevance": "Interactive prompting scheme with tool use that LMQL implements, showing 76-80% cost reduction."
    435     },
    436     {
    437       "title": "Training Verifiers to Solve Math Word Problems",
    438       "authors": ["Karl Cobbe", "Vineet Kosaraju", "Mohammad Bavarian"],
    439       "year": 2021,
    440       "arxiv_id": "2110.14168",
    441       "relevance": "GSM8K benchmark used in LMQL's arithmetic reasoning case study."
    442     },
    443     {
    444       "title": "Toolformer: Language Models Can Teach Themselves to Use Tools",
    445       "authors": ["Timo Schick", "Jane Dwivedi-Yu", "Roberto Dessì"],
    446       "year": 2023,
    447       "arxiv_id": "2302.04761",
    448       "relevance": "Tool-augmented LLM approach relevant to the survey's coverage of agentic AI capabilities."
    449     },
    450     {
    451       "title": "PAL: Program-aided Language Models",
    452       "authors": ["Luyu Gao", "Aman Madaan", "Shuyan Zhou"],
    453       "year": 2023,
    454       "arxiv_id": "2211.10435",
    455       "relevance": "Program-aided chain-of-thought variant where LM code output is fed to an interpreter, related to LM programming paradigm."
    456     },
    457     {
    458       "title": "Synchromesh: Reliable code generation from pre-trained language models",
    459       "authors": ["Gabriel Poesia", "Oleksandr Polozov", "Vu Le"],
    460       "year": 2022,
    461       "arxiv_id": "2201.11227",
    462       "relevance": "Constrained code generation approach using parser integration, relevant to constrained LM decoding."
    463     },
    464     {
    465       "title": "Constrained Language Models Yield Few-Shot Semantic Parsers",
    466       "authors": ["Richard Shin", "Christopher Lin", "Sam Thomson"],
    467       "year": 2021,
    468       "doi": "10.18653/v1/2021.emnlp-main.608",
    469       "relevance": "Handcrafted token prediction constraints for semantic parsing, a precursor to LMQL's generalized constraint system."
    470     },
    471     {
    472       "title": "Language Model Cascades",
    473       "authors": ["David Dohan", "Winnie Xu", "Aitor Lewkowycz"],
    474       "year": 2022,
    475       "arxiv_id": "2207.10342",
    476       "relevance": "Frames compositional LM use in a probabilistic programming context, related to LMQL's language model programming paradigm."
    477     },
    478     {
    479       "title": "Program of Thoughts Prompting: Disentangling Computation from Reasoning for Numerical Reasoning Tasks",
    480       "authors": ["Wenhu Chen", "Xueguang Ma", "Xinyi Wang"],
    481       "year": 2022,
    482       "arxiv_id": "2211.12588",
    483       "relevance": "Code-based reasoning approach where LM output is executed, relevant to LM programming for arithmetic tasks."
    484     },
    485     {
    486       "title": "Self-Consistency Improves Chain of Thought Reasoning in Language Models",
    487       "authors": ["Xuezhi Wang", "Jason Wei", "Dale Schuurmans"],
    488       "year": 2023,
    489       "arxiv_id": "2203.11171",
    490       "relevance": "Aggregation-based prompting scheme mentioned as an LMP instance, relevant to survey coverage of prompting methods."
    491     },
    492     {
    493       "title": "Prompt Programming for Large Language Models: Beyond the Few-Shot Paradigm",
    494       "authors": ["Laria Reynolds", "Kyle McDonell"],
    495       "year": 2021,
    496       "doi": "10.1145/3411763.3451760",
    497       "relevance": "Introduces meta prompting concept that LMQL builds upon, framing prompting as programming."
    498     }
    499   ]
    500 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs