ai-research-survey

Systematic scan of agentic development research. What's signal, what's noise.
git clone https://git.shiptheloop.com/ai-research-survey.git
Log | Files | Refs

scan-v5.json (21190B)


      1 {
      2   "scan_version": 5,
      3   "paper_type": "theoretical",
      4   "paper": {
      5     "title": "Efficient Guided Generation for Large Language Models",
      6     "authors": [
      7       "Brandon T. Willard",
      8       "Rémi Louf"
      9     ],
     10     "year": 2023,
     11     "venue": "arXiv.org",
     12     "arxiv_id": "2307.09702",
     13     "doi": "10.48550/arXiv.2307.09702"
     14   },
     15   "checklist": {
     16     "claims_and_evidence": {
     17       "abstract_claims_supported": {
     18         "applies": true,
     19         "answer": true,
     20         "justification": "All major abstract claims supported: FSM reformulation developed rigorously in Section 3 with Definition 1 and Example 1; efficiency gains demonstrated in Section 3.2 showing 10-100x speedup vs Guidance; model-agnostic applicability shown through algorithm design applicable to any LLM outputting probability distributions; structure guarantees enabled by masking mechanism.",
     21         "source": "haiku"
     22       },
     23       "causal_claims_justified": {
     24         "applies": false,
     25         "answer": false,
     26         "justification": "This is a theoretical/algorithmic paper with no empirical causal claims to justify. Complexity relationships (O(N) vs O(1)) are established through algorithm design and formalism, not experimental causality.",
     27         "source": "haiku"
     28       },
     29       "generalization_bounded": {
     30         "applies": true,
     31         "answer": true,
     32         "justification": "Scope explicitly bounded in introduction: 'we are concerned with...sequences that conform to regular expressions or context-free grammars.' Section 4 extends to LALR(1) parsers. Boundaries are clear and scope is not oversold.",
     33         "source": "haiku"
     34       },
     35       "alternative_explanations_discussed": {
     36         "applies": false,
     37         "answer": false,
     38         "justification": "The paper is theoretical work presenting a single formal framework. Alternative approaches are mentioned (transducers via Kuchnik et al. [2023]) but are not systematically explored or discussed as competing explanations.",
     39         "source": "haiku"
     40       },
     41       "proxy_outcome_distinction": {
     42         "applies": true,
     43         "answer": true,
     44         "justification": "Paper clearly distinguishes measured outcome (token generation runtime) from claimed benefit (efficiency). Section 3.2 validates claims with direct runtime measurements comparing indexing vs naive masking approach.",
     45         "source": "haiku"
     46       }
     47     },
     48     "limitations_and_scope": {
     49       "limitations_section_present": {
     50         "applies": true,
     51         "answer": false,
     52         "justification": "Section 5 provides Discussion addressing memory trade-offs and future directions, but no dedicated Limitations or Threats-to-Validity section. Discussion reads as speculation on extensions rather than systematic limitation analysis.",
     53         "source": "haiku"
     54       },
     55       "threats_to_validity_specific": {
     56         "applies": true,
     57         "answer": false,
     58         "justification": "No specific threats discussed. Memory trade-off mentioned ('naturally makes trade-off between processing and memory') but with no analysis of failure modes, pathological inputs, or conditions where approach breaks down beyond passing mention of 'non-pathological combinations.'",
     59         "source": "haiku"
     60       },
     61       "scope_boundaries_stated": {
     62         "applies": true,
     63         "answer": false,
     64         "justification": "Scope is implicitly bounded to regex/CFG/LALR(1) problems, but explicit statement of what approach does NOT show is missing. No discussion of problem classes outside scope or assumptions that enable claims.",
     65         "source": "haiku"
     66       }
     67     },
     68     "conflicts_of_interest": {
     69       "funding_disclosed": {
     70         "applies": true,
     71         "answer": false,
     72         "justification": "No funding sources disclosed anywhere in paper. Acknowledgments section (p. 18) thanks Dan Gerlanc and Dan Simpson for feedback but mentions no funding agencies, grants, or financial support.",
     73         "source": "haiku"
     74       },
     75       "affiliations_disclosed": {
     76         "applies": true,
     77         "answer": true,
     78         "justification": "Author affiliations clearly stated: both from 'Normal Computing.' Paper does not evaluate Normal Computing's products—it is pure methodology. Affiliation disclosure is transparent.",
     79         "source": "haiku"
     80       },
     81       "funder_independent_of_outcome": {
     82         "applies": false,
     83         "answer": false,
     84         "justification": "No funder identified; not applicable.",
     85         "source": "haiku"
     86       },
     87       "financial_interests_declared": {
     88         "applies": true,
     89         "answer": false,
     90         "justification": "No competing interests statement. Authors developed Outlines library (mentioned as 'open source Python library Outlines [Louf and Willard]'), creating potential interest in adoption, but no explicit declaration of conflicts or financial interests.",
     91         "source": "haiku"
     92       }
     93     },
     94     "scope_and_framing": {
     95       "key_terms_defined": {
     96         "applies": true,
     97         "answer": true,
     98         "justification": "Key terms formally defined or explained: finite automaton (Definition 1, p. 5), pushdown automaton (Definition 2, p. 14), guided generation via masking (Section 2.2). Regular expressions and CFGs assumed standard knowledge in field.",
     99         "source": "haiku"
    100       },
    101       "intended_contribution_clear": {
    102         "applies": true,
    103         "answer": true,
    104         "justification": "Contribution explicitly stated: (1) reformulate neural text generation as FSM transitions, (2) develop index-based vocabulary lookup reducing O(N) to O(1) average, (3) extend to CFGs via pushdown automata, (4) provide Outlines implementation. Each is clearly scoped.",
    105         "source": "haiku"
    106       },
    107       "engagement_with_prior_work": {
    108         "applies": true,
    109         "answer": true,
    110         "justification": "Related work cited throughout: Beurer-Kellner et al. on query languages, Scholak et al. on PICARD, Kuchnik et al. on transducers. Paper differentiates ('does not require complete transducer abstraction') and directly compares with Guidance library (Section 3.2). Engagement is scattered but present.",
    111         "source": "haiku"
    112       }
    113     }
    114   },
    115   "type_checklist": {
    116     "theoretical": {
    117       "formal_quality": {
    118         "assumptions_stated_explicitly": {
    119           "applies": true,
    120           "answer": true,
    121           "justification": "Key assumptions explicitly stated: vocabulary from fixed alphabet, LLM outputs categorical distribution over vocabulary, tokens can be grouped by FSM transitions, preprocessing of vocabulary is feasible. Definitions 1 and 2 formalize FSM and PDA assumptions.",
    122           "source": "haiku"
    123         },
    124         "proofs_complete_or_sketched": {
    125           "applies": true,
    126           "answer": false,
    127           "justification": "No formal theorems or proofs provided. Paper describes algorithms (Algorithms 1-4) and provides examples, but claims like 'O(1) on average' and 'complexity reduced from O(N)' are stated without proof or formal justification.",
    128           "source": "haiku"
    129         },
    130         "bounds_tight_or_discussed": {
    131           "applies": true,
    132           "answer": false,
    133           "justification": "Complexity bounds stated ('O(1) average,' 'O(N) naive,' ~50MB memory) but tightness never discussed. No analysis of worst-case behavior, when bounds apply, or whether constants are small. Practical memory example given but no general characterization.",
    134           "source": "haiku"
    135         },
    136         "counterexamples_explored": {
    137           "applies": true,
    138           "answer": false,
    139           "justification": "Paper provides positive examples (float regex, yes/no, IP addresses, Python code) but no systematic exploration of edge cases or failure modes. Phrase 'non-pathological combinations' acknowledges pathological cases exist but does not identify or analyze them.",
    140           "source": "haiku"
    141         },
    142         "notation_consistent": {
    143           "applies": true,
    144           "answer": true,
    145           "justification": "Notation is consistent throughout: V=vocabulary, N=|V|, S_t=token sequence, α=logits, m=mask function, M=FSM, Q=states, Σ=alphabet, δ=transition, σ=state-to-vocab map. No overloading or conflicting uses observed.",
    146           "source": "haiku"
    147         },
    148         "constructive_vs_existence_noted": {
    149           "applies": true,
    150           "answer": true,
    151           "justification": "Paper is explicitly constructive: Algorithms 1-4 describe how to build indices and sample. Implementation in Outlines library demonstrates constructivity. Not an existence proof; methods are implementable and implemented.",
    152           "source": "haiku"
    153         }
    154       },
    155       "connections": {
    156         "connection_to_practice_discussed": {
    157           "applies": true,
    158           "answer": true,
    159           "justification": "Strong practical grounding: Section 3.1 runnable code examples on GPT2 (yes/no, IP, variable names), Section 3.2 benchmarks vs Guidance library showing 10x+ speedup, Section 4 extends to JSON/Python/SQL formats, Discussion mentions training/fine-tuning applications.",
    160           "source": "haiku"
    161         },
    162         "relationship_to_prior_work_clear": {
    163           "applies": true,
    164           "answer": true,
    165           "justification": "Relationships stated: Kuchnik et al. on transducers ('does not require complete transducer abstraction'), Beurer-Kellner et al. on query languages, Guidance library on prompting. Direct comparison with Guidance (Section 3.2). Engagement present but scattered across sections rather than in dedicated related work.",
    166           "source": "haiku"
    167         },
    168         "computational_complexity_discussed": {
    169           "applies": true,
    170           "answer": true,
    171           "justification": "Complexity thoroughly discussed: O(N) cost for naive masking over entire vocabulary (Section 2.2), O(1) average for index lookup (Algorithm 4, Section 3), memory trade-off stated with concrete example (~50 MB for Python grammar), preprocessing cost described as 'effectively irrelevant.'",
    172           "source": "haiku"
    173         },
    174         "limitations_of_formal_model_stated": {
    175           "applies": true,
    176           "answer": false,
    177           "justification": "FSM model captures constraint satisfaction but model limitations never stated. What aspects of LLM behavior does masking ignore? How does zeroing invalid token probabilities interact with learned distributions? Model's gap from reality not discussed.",
    178           "source": "haiku"
    179         }
    180       }
    181     }
    182   },
    183   "claims": [
    184     {
    185       "claim": "Neural text generation can be reformulated as finite-state machine state transitions",
    186       "evidence": "Section 3 develops formal FSM framework with Definition 1 and Example 1 (floating-point regex); practical implementation in Outlines library; algorithms show FSM state tracking during generation.",
    187       "supported": "strong"
    188     },
    189     {
    190       "claim": "Index-based vocabulary lookup reduces masking cost from O(N) to O(1) average",
    191       "evidence": "Algorithm 4 constructs state-to-vocabulary map via preprocessing; Section 3.2 empirical benchmark vs Guidance library shows 10-100x speedup across token lengths (20-100 tokens).",
    192       "supported": "strong"
    193     },
    194     {
    195       "claim": "Approach generalizes to regular expressions, context-free grammars, and LALR(1) parsers",
    196       "evidence": "Section 3 develops regex with Algorithms 3-4 and Example 1; Section 4 extends to CFGs via pushdown automata (Definition 2) with parser state indexing; Discussion mentions JSON/Python/SQL.",
    197       "supported": "strong"
    198     },
    199     {
    200       "claim": "Approach is model-agnostic and imposes minimal overhead",
    201       "evidence": "Abstract and Section 1 claim agnosticity; Algorithms show masking as orthogonal layer on any function returning probability distribution; Section 3.1 demonstrates on GPT2.",
    202       "supported": "strong"
    203     },
    204     {
    205       "claim": "Memory costs are manageable for practical grammars",
    206       "evidence": "Discussion Section 5 reports '~50 MB' for Python grammar with 'naively constructed indices' using unreduced DFAs, suggesting room for optimization, but no systematic analysis of memory scaling.",
    207       "supported": "moderate"
    208     },
    209     {
    210       "claim": "Approach significantly outperforms existing guided generation libraries",
    211       "evidence": "Section 3.2 single benchmark: Guidance library shows linear scaling, Outlines flat scaling. However, only Guidance compared; no evaluation vs other structured generation approaches (e.g., constrained beam search, PICARD).",
    212       "supported": "moderate"
    213     }
    214   ],
    215   "methodology_tags": [
    216     "theoretical"
    217   ],
    218   "key_findings": "The paper reformulates constrained neural text generation as finite-state machine (FSM) transitions and proposes vocabulary indexing algorithms that reduce masking complexity from O(N) to O(1) average case for regular expressions, extended via pushdown automata to context-free grammars and LALR(1) parsers. Empirical evaluation in Section 3.2 demonstrates 10-100x speedup versus Guidance library across token generation lengths. Implementation in Outlines library enables practical structured generation for JSON, Python, and SQL with minimal inference overhead.",
    219   "red_flags": [
    220     {
    221       "flag": "No formal proofs for complexity claims",
    222       "detail": "O(1) average complexity stated for index lookup but never formally proven; no worst-case analysis, no analysis of when hash-map guarantee holds"
    223     },
    224     {
    225       "flag": "Limited and narrow benchmark comparison",
    226       "detail": "Section 3.2 compares only against Guidance library; no evaluation of alternative structured generation approaches (constrained beam search, other transducer implementations, PICARD, SMC steering)"
    227     },
    228     {
    229       "flag": "Pathological cases acknowledged but unexplored",
    230       "detail": "Paper mentions 'non-pathological combinations of regular expressions and vocabularies' implying pathological cases exist, but does not identify, characterize, or analyze performance degradation in pathological regimes"
    231     },
    232     {
    233       "flag": "No dedicated limitations section",
    234       "detail": "Discussion section addresses memory trade-offs and speculates on future work, but no systematic treatment of when method fails, assumption violations, or scope limitations"
    235     },
    236     {
    237       "flag": "Memory analysis incomplete and not characterized as a function of grammar size",
    238       "detail": "Reports ~50MB for Python grammar but provides no model of memory growth vs grammar size/complexity; no worst-case bounds; unclear how 'naively constructed indices' scale"
    239     },
    240     {
    241       "flag": "Transducer comparison incomplete",
    242       "detail": "Kuchnik et al. [2023] uses transducers for similar problem; paper claims simpler approach but does not provide detailed technical comparison or justify transducers' inferiority"
    243     },
    244     {
    245       "flag": "Model limitations not discussed",
    246       "detail": "FSM formalism captures syntax but paper does not address gap between masked probabilities and model's learned distribution; interaction between constraints and semantic generation quality not analyzed"
    247     }
    248   ],
    249   "cited_papers": [
    250     {
    251       "title": "Prompting is programming: A query language for large language models",
    252       "relevance": "Foundational work on query languages for LLM generation; establishes need for structured output interfaces"
    253     },
    254     {
    255       "title": "PICARD: Parsing incrementally for constrained auto-regressive decoding from language models",
    256       "relevance": "Prior work on incremental parsing for structured generation; directly addresses same problem domain"
    257     },
    258     {
    259       "title": "Synchromesh: Reliable code generation from pre-trained language models",
    260       "relevance": "Code generation with constraints; demonstrates practical motivation for structured output guarantees"
    261     },
    262     {
    263       "title": "Validating large language models with RELM",
    264       "relevance": "Transducer-based approach to constrained generation; most similar prior work; direct comparison point for FSM-based indexing"
    265     },
    266     {
    267       "title": "Sequential Monte Carlo Steering of Large Language Models using Probabilistic Programs",
    268       "relevance": "Alternative sampling strategy for constraint satisfaction; contrasting algorithmic approach to same problem"
    269     },
    270     {
    271       "title": "Flexible Grammar-Based Constrained Decoding for Language Models",
    272       "relevance": "Concurrent grammar-based generation method; alternative solution to CFG-constrained sampling"
    273     },
    274     {
    275       "title": "Grammar Prompting for Domain-Specific Language Generation with Large Language Models",
    276       "relevance": "Grammar-based prompting as orthogonal approach; demonstrates multiple strategies for structured generation"
    277     }
    278   ],
    279   "engagement_factors": {
    280     "practical_relevance": {
    281       "score": 3,
    282       "justification": "Open-source Outlines library immediately usable for production structured generation; directly applicable to JSON APIs, code generation, SQL, and other constrained outputs."
    283     },
    284     "surprise_contrarian": {
    285       "score": 2,
    286       "justification": "FSM-based indexing is mathematically elegant and achieves stated complexity, but not surprising for practitioners familiar with formal language theory and parsing; incremental refinement rather than paradigm shift."
    287     },
    288     "fear_safety": {
    289       "score": 0,
    290       "justification": "Infrastructure paper with no AI safety or risk implications; purely technical contribution to controllable LLM generation without adversarial or safety content."
    291     },
    292     "drama_conflict": {
    293       "score": 1,
    294       "justification": "Implicit competition with Guidance library but no controversial positioning; methodological comparison without divisive claims or adversarial framing."
    295     },
    296     "demo_ability": {
    297       "score": 3,
    298       "justification": "Section 3.1 provides runnable code examples with GPT2 (yes/no questions, IP addresses, variable names); Outlines library open source and immediately installable for hands-on experimentation."
    299     },
    300     "brand_recognition": {
    301       "score": 1,
    302       "justification": "Normal Computing is not an established major lab; Outlines library is practical but not yet mainstream; authors not prominent researchers in NLP/AI community."
    303     }
    304   },
    305   "hn_data": {
    306     "threads": [
    307       {
    308         "hn_id": "37125118",
    309         "title": "Show HN: LLMs can generate valid JSON 100% of the time",
    310         "points": 854,
    311         "comments": 303,
    312         "url": "https://news.ycombinator.com/item?id=37125118",
    313         "created_at": "2023-08-14T18:52:54Z"
    314       },
    315       {
    316         "hn_id": "40985017",
    317         "title": "SpreadsheetLLM: Encoding Spreadsheets for Large Language Models",
    318         "points": 190,
    319         "comments": 69,
    320         "url": "https://news.ycombinator.com/item?id=40985017",
    321         "created_at": "2024-07-17T12:16:18Z"
    322       },
    323       {
    324         "hn_id": "35237646",
    325         "title": "CoLT5: Faster Long-Range Transformers With Conditional Computation",
    326         "points": 123,
    327         "comments": 17,
    328         "url": "https://news.ycombinator.com/item?id=35237646",
    329         "created_at": "2023-03-20T19:54:19Z"
    330       },
    331       {
    332         "hn_id": "40976967",
    333         "title": "SpreadsheetLLM: Encoding Spreadsheets for Large Language Models",
    334         "points": 4,
    335         "comments": 1,
    336         "url": "https://news.ycombinator.com/item?id=40976967",
    337         "created_at": "2024-07-16T14:29:34Z"
    338       },
    339       {
    340         "hn_id": "35225719",
    341         "title": "CoLT5: Faster Long-Range Transformers with Conditional Computation",
    342         "points": 4,
    343         "comments": 0,
    344         "url": "https://news.ycombinator.com/item?id=35225719",
    345         "created_at": "2023-03-20T00:52:41Z"
    346       },
    347       {
    348         "hn_id": "40965811",
    349         "title": "SpreadsheetLLM: Encoding Spreadsheets for Large Language Models",
    350         "points": 3,
    351         "comments": 0,
    352         "url": "https://news.ycombinator.com/item?id=40965811",
    353         "created_at": "2024-07-15T07:04:14Z"
    354       },
    355       {
    356         "hn_id": "23908109",
    357         "title": "A curated collection of Covid-19 online datasets",
    358         "points": 3,
    359         "comments": 0,
    360         "url": "https://news.ycombinator.com/item?id=23908109",
    361         "created_at": "2020-07-21T16:17:58Z"
    362       },
    363       {
    364         "hn_id": "44597583",
    365         "title": "Lizard: An Efficient Linearization Framework for Large Language Models",
    366         "points": 2,
    367         "comments": 0,
    368         "url": "https://news.ycombinator.com/item?id=44597583",
    369         "created_at": "2025-07-17T20:06:18Z"
    370       },
    371       {
    372         "hn_id": "44096969",
    373         "title": "Better Zero-Shot Reasoning with Role-Play Prompting",
    374         "points": 2,
    375         "comments": 0,
    376         "url": "https://news.ycombinator.com/item?id=44096969",
    377         "created_at": "2025-05-26T12:48:04Z"
    378       },
    379       {
    380         "hn_id": "41058765",
    381         "title": "Spreadsheetllm: Encoding Spreadsheets for Large Language Models",
    382         "points": 1,
    383         "comments": 0,
    384         "url": "https://news.ycombinator.com/item?id=41058765",
    385         "created_at": "2024-07-24T16:31:12Z"
    386       }
    387     ],
    388     "top_points": 854,
    389     "total_points": 1186,
    390     "total_comments": 390
    391   }
    392 }

Impressum · Datenschutz