scan-v5.json (29510B)
1 { 2 "scan_version": 5, 3 "paper_type": "empirical", 4 "paper": { 5 "title": "Fast Controlled Generation from Language Models with Adaptive Weighted Rejection Sampling", 6 "authors": [ 7 "Benjamin Lipkin", 8 "Benjamin LeBrun", 9 "Jacob Hoover Vigly", 10 "João Loula", 11 "David R. MacIver", 12 "Li Du", 13 "Jason Eisner", 14 "Ryan Cotterell", 15 "Vikash Mansinghka", 16 "Timothy J. O'Donnell", 17 "Alexander K. Lew", 18 "Tim Vieira" 19 ], 20 "year": 2025, 21 "venue": "COLM 2025", 22 "arxiv_id": "2504.05410", 23 "doi": "10.48550/arXiv.2504.05410" 24 }, 25 "checklist": { 26 "claims_and_evidence": { 27 "abstract_claims_supported": { 28 "applies": true, 29 "answer": true, 30 "justification": "All abstract claims are supported: the >50x speedup over token masking is demonstrated in Table 1e, unbiased Z estimates are formally proven (Propositions 1 and 3), and AWRS-SMC superiority is shown across all 5 benchmarks in Table 1.", 31 "source": "haiku" 32 }, 33 "causal_claims_justified": { 34 "applies": true, 35 "answer": true, 36 "justification": "Causal claims about AWRS causing faster runtime and higher accuracy are supported by controlled ablative comparisons (ARS-LCD isolates adaptive sampling; AWRS-SMC adds importance weighting) and by formal theoretical analysis in Appendices G.1 and G.2.", 37 "source": "haiku" 38 }, 39 "generalization_bounded": { 40 "applies": true, 41 "answer": true, 42 "justification": "Conclusions are bounded to the five evaluated domains using Llama models; algorithmic claims are backed by proofs, and the scaling claim (better models → faster AWRS) is empirically supported in Fig. L.1 across three model sizes.", 43 "source": "haiku" 44 }, 45 "alternative_explanations_discussed": { 46 "applies": true, 47 "answer": false, 48 "justification": "The paper provides no discussion of alternative explanations for the observed performance improvements—e.g., whether differences might stem from implementation quality, specific task characteristics, or random initialization rather than the algorithmic novelty.", 49 "source": "haiku" 50 }, 51 "proxy_outcome_distinction": { 52 "applies": true, 53 "answer": true, 54 "justification": "Task-specific metrics directly measure the stated goals (SQL execution accuracy, JSON schema validity, PDDL ground-truth equivalence, QED for molecules), and the paper explicitly notes these reflect downstream task performance rather than internal sampler behavior.", 55 "source": "haiku" 56 } 57 }, 58 "limitations_and_scope": { 59 "limitations_section_present": { 60 "applies": true, 61 "answer": false, 62 "justification": "There is no dedicated limitations or threats-to-validity section. The closest acknowledgment is a footnote that implementations are 'written in pure Python and are relatively unoptimized.'", 63 "source": "haiku" 64 }, 65 "threats_to_validity_specific": { 66 "applies": true, 67 "answer": false, 68 "justification": "No specific threats to validity are discussed anywhere in the paper—no mention of domain generalizability limits, sensitivity to hyperparameter choices, or cases where AWRS may underperform.", 69 "source": "haiku" 70 }, 71 "scope_boundaries_stated": { 72 "applies": true, 73 "answer": false, 74 "justification": "The conclusion states broadly that AWRS 'is faster and more accurate than existing methods' without explicitly scoping this to the evaluated constraint types, model families, or domain categories tested.", 75 "source": "haiku" 76 } 77 }, 78 "conflicts_of_interest": { 79 "funding_disclosed": { 80 "applies": true, 81 "answer": true, 82 "justification": "NSF Graduate Research Fellowship (Grant 2141064), NSF SBE Postdoctoral Fellowship (Grant SMA-2404644), and compute resources from Mila are disclosed in the Acknowledgments.", 83 "source": "haiku" 84 }, 85 "affiliations_disclosed": { 86 "applies": true, 87 "answer": true, 88 "justification": "All author affiliations are disclosed on the title page: MIT, ETH Zürich, McGill, Mila, Johns Hopkins, Yale, and CHI FRO.", 89 "source": "haiku" 90 }, 91 "funder_independent_of_outcome": { 92 "applies": true, 93 "answer": true, 94 "justification": "NSF funding is independent of research outcomes. Mila provides compute but has no stake in which algorithm performs better.", 95 "source": "haiku" 96 }, 97 "financial_interests_declared": { 98 "applies": true, 99 "answer": false, 100 "justification": "There is no competing interests statement or declaration of financial interests (patents, equity, consulting) anywhere in the paper.", 101 "source": "haiku" 102 } 103 }, 104 "scope_and_framing": { 105 "key_terms_defined": { 106 "applies": true, 107 "answer": true, 108 "justification": "Key terms are formally defined: locally constrained decoding (Section 2, Eq. 1), constraint function 1C, normalizing constant Z, and AWRS (Definition 2) all have precise mathematical definitions; SMC is formalized in Appendix A.", 109 "source": "haiku" 110 }, 111 "intended_contribution_clear": { 112 "applies": true, 113 "answer": true, 114 "justification": "The paper explicitly enumerates four contributions in the introduction: a fast Las Vegas sampler, stochastic Z estimates for SMC, runtime analysis, and empirical evaluation across five domains.", 115 "source": "haiku" 116 }, 117 "engagement_with_prior_work": { 118 "applies": true, 119 "answer": true, 120 "justification": "Section 6 (Related Work) and the introduction explicitly contrast AWRS with grammar-specialized methods (Outlines, XGrammar), backtracking approaches, and SMC methods (Lew et al. 2023, Zhao et al. 2024, Loula et al. 2025), explaining specific limitations each paper addresses.", 121 "source": "haiku" 122 } 123 } 124 }, 125 "type_checklist": { 126 "empirical": { 127 "artifacts": { 128 "code_released": { 129 "applies": true, 130 "answer": true, 131 "justification": "Experiment replication code and data are released at https://github.com/genlm/awrs-colm-2025, and a maintained production library is at https://github.com/genlm/genlm-control.", 132 "source": "haiku" 133 }, 134 "data_released": { 135 "applies": true, 136 "answer": true, 137 "justification": "Standard benchmarks (Spider, JSONSchemaBench, Planetarium, GDB-17) are publicly available; the custom 402-case pattern matching dataset is included in the experiment repository per the paper's reproducibility statement.", 138 "source": "haiku" 139 }, 140 "environment_specified": { 141 "applies": true, 142 "answer": false, 143 "justification": "The paper specifies GPU hardware (L40S, A100) but provides no requirements.txt, Dockerfile, or equivalent dependency specification in the paper itself.", 144 "source": "haiku" 145 }, 146 "reproduction_instructions": { 147 "applies": true, 148 "answer": false, 149 "justification": "The paper provides GitHub links and Appendix K lists hyperparameters, but no step-by-step reproduction instructions are included in the paper that could be followed without consulting the external repositories.", 150 "source": "haiku" 151 } 152 }, 153 "statistical_methodology": { 154 "confidence_intervals_or_error_bars": { 155 "applies": true, 156 "answer": true, 157 "justification": "Table 1 reports 95% bootstrapped confidence intervals for all accuracy and runtime results across all five domains, and this is replicated in Table 3 for model-size comparisons.", 158 "source": "haiku" 159 }, 160 "significance_tests": { 161 "applies": true, 162 "answer": false, 163 "justification": "Bootstrapped confidence intervals are reported but no formal statistical significance tests (t-tests, Wilcoxon tests, or similar) are conducted for comparative claims between methods.", 164 "source": "haiku" 165 }, 166 "effect_sizes_reported": { 167 "applies": true, 168 "answer": true, 169 "justification": "Accuracy differences and runtime speedups are reported with confidence intervals; the >50x speedup for ARS-LCD vs TM-LCD (0.16 vs 6.91 sec/ex on pattern matching) provides clear effect magnitude.", 170 "source": "haiku" 171 }, 172 "sample_size_justified": { 173 "applies": true, 174 "answer": false, 175 "justification": "Sample sizes are not justified. The paper uses existing benchmark splits and 402 pattern matching cases without explaining why these are sufficient for stable estimates or appropriate for the claimed conclusions.", 176 "source": "haiku" 177 }, 178 "variance_reported": { 179 "applies": true, 180 "answer": true, 181 "justification": "Bootstrapped 95% confidence intervals are reported for both accuracy and runtime across all methods and domains in Tables 1 and 3.", 182 "source": "haiku" 183 } 184 }, 185 "evaluation_design": { 186 "baselines_included": { 187 "applies": true, 188 "answer": true, 189 "justification": "Five baselines are compared: Base LM, TM-LCD (token masking), ARS-LCD, Sample-Verify, and Twisted SMC, providing a thorough comparison landscape.", 190 "source": "haiku" 191 }, 192 "baselines_contemporary": { 193 "applies": true, 194 "answer": true, 195 "justification": "Twisted SMC (Loula et al. 2025) is a concurrent state-of-the-art method; Sample-Verify represents current practice for post-hoc filtering; all baselines reflect the current state of constrained generation.", 196 "source": "haiku" 197 }, 198 "ablation_study": { 199 "applies": true, 200 "answer": true, 201 "justification": "The paper effectively ablates contributions: ARS-LCD vs TM-LCD isolates the rejection sampling speedup; ARS-LCD vs AWRS-SMC ablates the importance weighting component that corrects for greediness.", 202 "source": "haiku" 203 }, 204 "multiple_metrics": { 205 "applies": true, 206 "answer": true, 207 "justification": "Both accuracy (task-specific) and runtime (seconds per example) are reported for all methods in all five domains, with constraint evaluation cost (ms/eval) reported separately in Table 2.", 208 "source": "haiku" 209 }, 210 "human_evaluation": { 211 "applies": false, 212 "answer": false, 213 "justification": "Human evaluation is not applicable; all tasks have automated ground truth evaluation (SQL execution matching, JSON schema validation, PDDL equivalence checking, QED scoring).", 214 "source": "haiku" 215 }, 216 "held_out_test_set": { 217 "applies": true, 218 "answer": true, 219 "justification": "The paper uses held-out evaluation splits: Spider development split, JSONSchemaBench validation splits (trivial/easy/medium), Planetarium Blocksworld tasks, and the generated pattern matching test cases.", 220 "source": "haiku" 221 }, 222 "per_category_breakdown": { 223 "applies": true, 224 "answer": true, 225 "justification": "Results are broken down across 5 separate domain tables (1a–1e), and Appendix L provides per-model-size breakdown for pattern matching across Llama 1B/8B/70B.", 226 "source": "haiku" 227 }, 228 "failure_cases_discussed": { 229 "applies": true, 230 "answer": true, 231 "justification": "The paper illustrates LCD failure cases with the 'mortg' prefix dead-end example (Section 2) and App. A's probability tree example showing how local conditioning can dramatically distort the global distribution.", 232 "source": "haiku" 233 }, 234 "negative_results_reported": { 235 "applies": true, 236 "answer": true, 237 "justification": "AWRS-SMC's higher runtime than Twisted SMC on Text-to-SQL (3.02 vs 2.60 sec/ex) and Molecular Synthesis (3.41 vs 1.53 sec/ex) is reported without minimization.", 238 "source": "haiku" 239 } 240 }, 241 "setup_transparency": { 242 "model_versions_specified": { 243 "applies": true, 244 "answer": true, 245 "justification": "Specific model versions are stated throughout: Llama 3.1 8B-Instruct, Llama 3.1 8B, Llama 3.2 1B, and Llama 3.3 70B are all identified with version numbers.", 246 "source": "haiku" 247 }, 248 "prompts_provided": { 249 "applies": true, 250 "answer": false, 251 "justification": "The paper does not provide the actual LLM prompts used for any of the five benchmark tasks; only constraint function descriptions and benchmark references are given.", 252 "source": "haiku" 253 }, 254 "hyperparameters_reported": { 255 "applies": true, 256 "answer": true, 257 "justification": "Appendix K reports temperature (1.0), max tokens per domain (32–350), SMC ESS thresholds, resampling schemes (multinomial vs. stratified), and hardware specifications.", 258 "source": "haiku" 259 }, 260 "scaffolding_described": { 261 "applies": true, 262 "answer": true, 263 "justification": "The SMC scaffolding is described with pseudocode in Algorithms 1 and 2, including particle resampling conditions, ESS thresholds, and the properly-weighted proposal integration.", 264 "source": "haiku" 265 }, 266 "data_preprocessing_documented": { 267 "applies": true, 268 "answer": true, 269 "justification": "Appendix J describes the full 5-step generation and filtering pipeline for the pattern matching dataset; standard benchmarks are used with referenced splits requiring minimal preprocessing.", 270 "source": "haiku" 271 } 272 }, 273 "data_integrity": { 274 "raw_data_available": { 275 "applies": true, 276 "answer": true, 277 "justification": "The paper states 'The source code and data to replicate this paper's experiments can be found in the following repository: https://github.com/genlm/awrs-colm-2025.'", 278 "source": "haiku" 279 }, 280 "data_collection_described": { 281 "applies": true, 282 "answer": true, 283 "justification": "The custom pattern matching dataset collection is described step-by-step in Appendix J; standard benchmarks reference their original collection papers.", 284 "source": "haiku" 285 }, 286 "recruitment_methods_described": { 287 "applies": false, 288 "answer": false, 289 "justification": "No human participants were involved; all data comes from existing benchmarks or automated generation pipelines.", 290 "source": "haiku" 291 }, 292 "data_pipeline_documented": { 293 "applies": true, 294 "answer": true, 295 "justification": "The pattern matching generation pipeline is documented in Appendix J with a 5-step procedure (LLM generation → deduplication → regex library filter → FSM filter → prefix check).", 296 "source": "haiku" 297 } 298 }, 299 "contamination": { 300 "training_cutoff_stated": { 301 "applies": false, 302 "answer": false, 303 "justification": "NA — the paper evaluates a decoding algorithm's efficiency and accuracy, not model capabilities on knowledge benchmarks; training data contamination is not a meaningful concern for this evaluation.", 304 "source": "haiku" 305 }, 306 "train_test_overlap_discussed": { 307 "applies": false, 308 "answer": false, 309 "justification": "NA — the evaluation measures constraint satisfaction accuracy of the decoding algorithm; LM memorization of test inputs would not meaningfully confound the algorithmic comparisons.", 310 "source": "haiku" 311 }, 312 "benchmark_contamination_addressed": { 313 "applies": false, 314 "answer": false, 315 "justification": "NA — all methods use the same LM, so any contamination effect is constant across comparisons and does not affect relative conclusions.", 316 "source": "haiku" 317 } 318 }, 319 "human_studies": { 320 "pre_registered": { 321 "applies": false, 322 "answer": false, 323 "justification": "No human participants in the study.", 324 "source": "haiku" 325 }, 326 "irb_or_ethics_approval": { 327 "applies": false, 328 "answer": false, 329 "justification": "No human participants in the study.", 330 "source": "haiku" 331 }, 332 "demographics_reported": { 333 "applies": false, 334 "answer": false, 335 "justification": "No human participants in the study.", 336 "source": "haiku" 337 }, 338 "inclusion_exclusion_criteria": { 339 "applies": false, 340 "answer": false, 341 "justification": "No human participants in the study.", 342 "source": "haiku" 343 }, 344 "randomization_described": { 345 "applies": false, 346 "answer": false, 347 "justification": "No human participants in the study.", 348 "source": "haiku" 349 }, 350 "blinding_described": { 351 "applies": false, 352 "answer": false, 353 "justification": "No human participants in the study.", 354 "source": "haiku" 355 }, 356 "attrition_reported": { 357 "applies": false, 358 "answer": false, 359 "justification": "No human participants in the study.", 360 "source": "haiku" 361 } 362 }, 363 "cost_and_practicality": { 364 "inference_cost_reported": { 365 "applies": true, 366 "answer": true, 367 "justification": "Runtime (seconds per example) is reported for all methods across all five domains in Table 1, and constraint evaluation costs (ms/eval) are reported in Table 2.", 368 "source": "haiku" 369 }, 370 "compute_budget_stated": { 371 "applies": true, 372 "answer": false, 373 "justification": "Hardware is described (single L40S for most tasks, single A100 for goal inference, 4×L40S for 70B models) but total GPU hours or compute budget are not reported.", 374 "source": "haiku" 375 } 376 } 377 } 378 }, 379 "claims": [ 380 { 381 "claim": "AWRS requires orders of magnitude fewer constraint evaluations than token masking", 382 "evidence": "Table 1e shows ARS-LCD matches TM-LCD accuracy (0.980 vs 0.978) while being >50x faster (0.16 vs 6.91 sec/ex) on pattern matching; Fig. 2 shows AWRS typically checks only 2–3 tokens per sampling step", 383 "supported": "strong" 384 }, 385 { 386 "claim": "AWRS-SMC with M=5 particles matches or exceeds Sample-Verify and Twisted SMC with M=10 particles", 387 "evidence": "Table 1 shows AWRS-SMC achieves higher accuracy than Twisted SMC in 4 of 5 domains (JSON: 0.898 vs 0.871; Goal Inference: 0.528 vs 0.479; Molecular: 0.615 vs 0.591; Pattern Matching: 0.990 vs 0.813)", 388 "supported": "strong" 389 }, 390 { 391 "claim": "AWRS runtime scales with KL divergence between unconstrained and constrained distributions, not vocabulary size", 392 "evidence": "Fig. 2 empirically shows constraint evaluation count scales with DKL(p||p0); Proposition 4 and Appendix G.2 provide a formal proof that expected runtime is O(sum of pi_x for non-conforming tokens)", 393 "supported": "strong" 394 }, 395 { 396 "claim": "AWRS produces mathematically exact (unbiased) samples from the local constrained token distribution", 397 "evidence": "Proposition 1 (WRS) and Proposition 3 (AWRS) formally prove x ~ p and E[Z-hat] = Z using the RAVI framework with rigorous measure-theoretic proofs in Appendices C and D", 398 "supported": "strong" 399 }, 400 { 401 "claim": "AWRS-SMC with a 1B model achieves better accuracy-runtime tradeoff than Twisted SMC with a 70B model", 402 "evidence": "Fig. L.1 shows AWRS-SMC Llama 1B at 0.974 accuracy / 0.29 sec/ex vs. Twisted SMC Llama 70B at 0.846 / 0.44 sec/ex on pattern matching", 403 "supported": "strong" 404 }, 405 { 406 "claim": "AWRS enables arbitrary black-box constraints beyond grammar-based approaches", 407 "evidence": "The paper demonstrates PDDL planner constraints, SMILES validators, and context-sensitive pattern matching with backreferences—none expressible as context-free grammars supported by existing token masking libraries", 408 "supported": "strong" 409 } 410 ], 411 "methodology_tags": [ 412 "benchmark-eval", 413 "theoretical" 414 ], 415 "key_findings": "AWRS is a rejection-sampling-based algorithm for constrained LM generation that achieves >50x runtime speedup over token masking by evaluating constraints on only a small fraction of the vocabulary, while producing mathematically exact samples. Combined with Sequential Monte Carlo (AWRS-SMC), it produces unbiased importance weights that correct for the myopic bias of locally constrained decoding, improving accuracy over state-of-the-art methods with half the number of particles. Runtime scales with the KL divergence between unconstrained and constrained distributions, making the method self-improving as base LM quality increases—a 1B model with AWRS-SMC outperforms a 70B model with Twisted SMC on pattern matching.", 416 "red_flags": [ 417 { 418 "flag": "Single-domain TM-LCD comparison", 419 "detail": "The headline >50x speedup claim against token masking is demonstrated on only one domain (pattern matching) because TM-LCD is computationally infeasible elsewhere; four of five domains lack this critical baseline." 420 }, 421 { 422 "flag": "No limitations section", 423 "detail": "The paper contains no dedicated limitations or threats-to-validity section, omitting discussion of failure modes, domain generalizability limits, or conditions under which AWRS may underperform." 424 }, 425 { 426 "flag": "Unoptimized Python implementation caveat", 427 "detail": "Authors note implementations are 'written in pure Python and are relatively unoptimized,' meaning runtime comparisons may not reflect production-quality performance and the claimed speedups could change with optimized implementations of either side." 428 }, 429 { 430 "flag": "No competing interests declaration", 431 "detail": "The paper contains no competing interests statement despite multi-institution authorship that may include consulting, patent, or startup interests in constrained decoding technology." 432 }, 433 { 434 "flag": "Prompts not disclosed", 435 "detail": "Actual LLM prompts for the five benchmark tasks are absent, making it impossible to rule out that performance differences are partially attributable to prompt engineering rather than the decoding algorithm alone." 436 } 437 ], 438 "cited_papers": [ 439 { 440 "title": "Sequential Monte Carlo steering of large language models using probabilistic programs", 441 "relevance": "Primary prior work on SMC for constrained LLM generation; AWRS-SMC directly extends and addresses this method's limitations around constraint decomposability" 442 }, 443 { 444 "title": "Syntactic and semantic control of large language models via sequential Monte Carlo (Loula et al. 2025)", 445 "relevance": "Closest competing method (Twisted SMC); primary accuracy and runtime baseline requiring fast/slow constraint decomposition that AWRS eliminates" 446 }, 447 { 448 "title": "Probabilistic inference in language models via twisted sequential Monte Carlo (Zhao et al. 2024)", 449 "relevance": "SMC baseline requiring expensive fine-tuning for twist learning; AWRS-SMC addresses this by avoiding training-time overhead" 450 }, 451 { 452 "title": "Grammar-aligned decoding (Park et al. 2024)", 453 "relevance": "Key prior work demonstrating that LCD distorts the global distribution; provides theoretical motivation for the importance weighting approach" 454 }, 455 { 456 "title": "Efficient guided generation for large language models (Outlines, Willard & Louf 2023)", 457 "relevance": "State-of-the-art token masking library; represents the optimized grammar-constrained baseline that AWRS generalizes beyond" 458 }, 459 { 460 "title": "Recursive Monte Carlo and variational inference with auxiliary variables (RAVI, Lew et al. 2022)", 461 "relevance": "Theoretical framework used to derive proper weighting proofs for both WRS and AWRS; foundational to the paper's mathematical contributions" 462 }, 463 { 464 "title": "Spider: A large-scale human-labeled dataset for complex and cross-domain semantic parsing and text-to-SQL", 465 "relevance": "Text-to-SQL evaluation benchmark used in experiments" 466 }, 467 { 468 "title": "Generating structured outputs from language models: Benchmark and studies (JSONSchemaBench, Geng et al. 2025)", 469 "relevance": "JSON evaluation benchmark used in experiments" 470 } 471 ], 472 "engagement_factors": { 473 "practical_relevance": { 474 "score": 3, 475 "justification": "Directly addresses a common production bottleneck in structured LLM output generation (SQL, JSON, molecular design); code is released and immediately applicable to arbitrary black-box constraints." 476 }, 477 "surprise_contrarian": { 478 "score": 2, 479 "justification": "Challenges the dominant token masking paradigm with rejection sampling and proves a 1B model with AWRS can outperform a 70B model with the leading alternative SMC method." 480 }, 481 "fear_safety": { 482 "score": 0, 483 "justification": "No AI safety or risk implications discussed; this is an efficiency and accuracy improvement for constrained decoding algorithms." 484 }, 485 "drama_conflict": { 486 "score": 1, 487 "justification": "Implicit challenge to the Outlines/XGrammar ecosystem by arguing grammar-specialized approaches are unnecessarily restrictive, but no direct controversy or heated community debate." 488 }, 489 "demo_ability": { 490 "score": 3, 491 "justification": "Production-quality code is available at https://github.com/genlm/genlm-control with a maintained library; practitioners can immediately apply AWRS to their own constraint types." 492 }, 493 "brand_recognition": { 494 "score": 1, 495 "justification": "MIT, ETH Zürich, and McGill are respected institutions but the paper lacks involvement from prominent industry labs (OpenAI, Google, Anthropic) that would drive broad attention." 496 } 497 }, 498 "hn_data": { 499 "threads": [ 500 { 501 "hn_id": "35472750", 502 "title": "A radiation hard RISC-V microprocessor for high-energy physics applications", 503 "points": 111, 504 "comments": 46, 505 "url": "https://news.ycombinator.com/item?id=35472750", 506 "created_at": "2023-04-06T18:54:30Z" 507 }, 508 { 509 "hn_id": "44397503", 510 "title": "Exploiting Local KV Cache Asymmetry for Long-Context LLMs", 511 "points": 6, 512 "comments": 0, 513 "url": "https://news.ycombinator.com/item?id=44397503", 514 "created_at": "2025-06-27T15:22:27Z" 515 }, 516 { 517 "hn_id": "39976086", 518 "title": "Physics of Language Models: Part 3.3, Knowledge Capacity Scaling Laws", 519 "points": 5, 520 "comments": 0, 521 "url": "https://news.ycombinator.com/item?id=39976086", 522 "created_at": "2024-04-09T03:56:53Z" 523 }, 524 { 525 "hn_id": "47104697", 526 "title": "Reasoning Models Fabricate 75% of Their Explanations (ArXiv:2505.05410)", 527 "points": 4, 528 "comments": 0, 529 "url": "https://news.ycombinator.com/item?id=47104697", 530 "created_at": "2026-02-21T21:01:00Z" 531 }, 532 { 533 "hn_id": "44211549", 534 "title": "Oracular Programming: A Modular Foundation for Building LLM-Enabled Software", 535 "points": 4, 536 "comments": 1, 537 "url": "https://news.ycombinator.com/item?id=44211549", 538 "created_at": "2025-06-07T18:30:04Z" 539 }, 540 { 541 "hn_id": "43975695", 542 "title": "AWRS SMC: Fast new algorithm for guiding LLMs as Bayesian inference", 543 "points": 2, 544 "comments": 0, 545 "url": "https://news.ycombinator.com/item?id=43975695", 546 "created_at": "2025-05-13T17:50:54Z" 547 }, 548 { 549 "hn_id": "43949744", 550 "title": "Reasoning Models Don't Always Say What They Think", 551 "points": 2, 552 "comments": 0, 553 "url": "https://news.ycombinator.com/item?id=43949744", 554 "created_at": "2025-05-10T23:07:01Z" 555 }, 556 { 557 "hn_id": "45274922", 558 "title": "Candidates evoke identity and issues on TikTok", 559 "points": 2, 560 "comments": 0, 561 "url": "https://news.ycombinator.com/item?id=45274922", 562 "created_at": "2025-09-17T12:15:44Z" 563 }, 564 { 565 "hn_id": "44028643", 566 "title": "Reasoning Models Don't Always Say What They Think", 567 "points": 1, 568 "comments": 0, 569 "url": "https://news.ycombinator.com/item?id=44028643", 570 "created_at": "2025-05-19T11:29:32Z" 571 }, 572 { 573 "hn_id": "43726013", 574 "title": "Parameter-Efficient Fine-Tuning of LLMs for Personality Detection", 575 "points": 1, 576 "comments": 0, 577 "url": "https://news.ycombinator.com/item?id=43726013", 578 "created_at": "2025-04-18T08:06:49Z" 579 } 580 ], 581 "top_points": 111, 582 "total_points": 138, 583 "total_comments": 47 584 } 585 }