scan-v5.json - ai-research-survey - Systematic scan of agentic development research. What's signal, what's noise.

scan-v5.json (26887B)
      1 {
      2   "scan_version": 5,
      3   "paper_type": "empirical",
      4   "paper": {
      5     "title": "Guiding LLMs The Right Way: Fast, Non-Invasive Constrained Generation",
      6     "authors": [
      7       "Luca Beurer-Kellner",
      8       "Marc Fischer",
      9       "Martin T. Vechev"
     10     ],
     11     "year": 2024,
     12     "venue": "International Conference on Machine Learning",
     13     "arxiv_id": "2403.06988",
     14     "doi": "10.48550/arXiv.2403.06988"
     15   },
     16   "checklist": {
     17     "claims_and_evidence": {
     18       "abstract_claims_supported": {
     19         "applies": true,
     20         "answer": true,
     21         "justification": "All abstract claims are backed by Table 2 (accuracy parity/improvement over unconstrained) and Table 3 (throughput up to 2.71× over unconstrained). The claim about existing methods impairing accuracy is directly demonstrated with GUIDANCE dropping from 41.5% to 34.5% on GSM8K.",
     22         "source": "haiku"
     23       },
     24       "causal_claims_justified": {
     25         "applies": true,
     26         "answer": true,
     27         "justification": "The paper claims token misalignment causes accuracy drops; this is supported by controlled experiments where only the decoding method changes while model, prompt, and dataset remain constant, and by the ablation in Table 4 where varying k (the invasiveness parameter) directly changes accuracy.",
     28         "source": "haiku"
     29       },
     30       "generalization_bounded": {
     31         "applies": true,
     32         "answer": false,
     33         "justification": "The paper draws broad conclusions about constrained decoding in general but only tests two models (Mistral 7B, Llama-2 13B) and two downstream tasks (GSM8K, CoNLL2003); claims of general superiority over all settings are not bounded to these tested configurations.",
     34         "source": "haiku"
     35       },
     36       "alternative_explanations_discussed": {
     37         "applies": true,
     38         "answer": false,
     39         "justification": "The paper attributes GUIDANCE's accuracy drops entirely to token misalignment/invasiveness without considering alternative explanations such as differences in prompt formatting forced by templates or grammar structure interactions with few-shot examples.",
     40         "source": "haiku"
     41       },
     42       "proxy_outcome_distinction": {
     43         "applies": true,
     44         "answer": true,
     45         "justification": "Accuracy on GSM8K and CoNLL2003 is measured directly as exact-match task performance, and throughput is measured directly in tokens/second; the paper does not conflate proxy metrics with final outcomes.",
     46         "source": "haiku"
     47       }
     48     },
     49     "limitations_and_scope": {
     50       "limitations_section_present": {
     51         "applies": true,
     52         "answer": false,
     53         "justification": "The paper contains only a generic 'Impact Statement' stating 'none which we feel must be specifically highlighted here'; there is no dedicated limitations or threats-to-validity section.",
     54         "source": "haiku"
     55       },
     56       "threats_to_validity_specific": {
     57         "applies": true,
     58         "answer": false,
     59         "justification": "No threats to validity are discussed; the paper does not address the limited model selection, the restriction to offline grammar settings, or the small sample size for accuracy evaluation.",
     60         "source": "haiku"
     61       },
     62       "scope_boundaries_stated": {
     63         "applies": true,
     64         "answer": false,
     65         "justification": "The only scope qualifier is a brief note that DOMINO is evaluated 'in an offline setting, where all grammars are known ahead of time,' but this is not presented as an explicit scope boundary and no discussion of what results do NOT show is provided.",
     66         "source": "haiku"
     67       }
     68     },
     69     "conflicts_of_interest": {
     70       "funding_disclosed": {
     71         "applies": true,
     72         "answer": true,
     73         "justification": "Funding is disclosed in the Acknowledgements: Swiss State Secretariat for Education, Research and Innovation (SERI), grant SAFEAI (contract no. MB22.00088).",
     74         "source": "haiku"
     75       },
     76       "affiliations_disclosed": {
     77         "applies": true,
     78         "answer": true,
     79         "justification": "All three authors are identified as being from the Department of Computer Science, ETH Zurich, Switzerland, with correspondence emails provided.",
     80         "source": "haiku"
     81       },
     82       "funder_independent_of_outcome": {
     83         "applies": true,
     84         "answer": true,
     85         "justification": "The funder is the Swiss government (SERI) via an ERC Consolidator Grant, which has no commercial interest in whether DOMINO outperforms GUIDANCE or llama.cpp.",
     86         "source": "haiku"
     87       },
     88       "financial_interests_declared": {
     89         "applies": true,
     90         "answer": false,
     91         "justification": "No competing interests statement, patent disclosures, or equity/consulting declarations are present in the paper.",
     92         "source": "haiku"
     93       }
     94     },
     95     "scope_and_framing": {
     96       "key_terms_defined": {
     97         "applies": true,
     98         "answer": true,
     99         "justification": "'Minimally invasive' is given a formal definition (Definition 2.1); 'constrained decoding,' 'sub-word tokens,' 'subterminal,' and related concepts are defined precisely before use.",
    100         "source": "haiku"
    101       },
    102       "intended_contribution_clear": {
    103         "applies": true,
    104         "answer": true,
    105         "justification": "The paper explicitly lists three contributions: identifying the token misalignment challenge, proposing the DOMINO algorithm, and providing an extensive evaluation—stated in the 'Main contributions' bullet list in Section 1.",
    106         "source": "haiku"
    107       },
    108       "engagement_with_prior_work": {
    109         "applies": true,
    110         "answer": true,
    111         "justification": "The paper systematically compares against and situates itself relative to LMQL, GUIDANCE, OUTLINES, PICARD, SYNCHROMESH, LLAMA.CPP, and GCD in Table 1, explaining specifically how each differs on regex/CFG support, pre-computation, and minimal invasiveness.",
    112         "source": "haiku"
    113       }
    114     }
    115   },
    116   "type_checklist": {
    117     "empirical": {
    118       "artifacts": {
    119         "code_released": {
    120           "applies": true,
    121           "answer": false,
    122           "justification": "No GitHub repository or code release is mentioned anywhere in the paper; the implementation of DOMINO is described algorithmically but no artifact link is provided.",
    123           "source": "haiku"
    124         },
    125         "data_released": {
    126           "applies": true,
    127           "answer": true,
    128           "justification": "Both evaluation datasets (GSM8K and CoNLL2003) are standard publicly available benchmarks used unmodified.",
    129           "source": "haiku"
    130         },
    131         "environment_specified": {
    132           "applies": true,
    133           "answer": false,
    134           "justification": "Hardware is mentioned (NVIDIA A100 40GB or H100 80GB GPUs) and inference backends (transformers library, llama.cpp) are named, but no requirements.txt, Dockerfile, or pinned version specifications are provided.",
    135           "source": "haiku"
    136         },
    137         "reproduction_instructions": {
    138           "applies": true,
    139           "answer": false,
    140           "justification": "Appendix C provides grammars and prompts, which is helpful, but there are no step-by-step instructions for installing, configuring, and running DOMINO to reproduce the tables.",
    141           "source": "haiku"
    142         }
    143       },
    144       "statistical_methodology": {
    145         "confidence_intervals_or_error_bars": {
    146           "applies": true,
    147           "answer": false,
    148           "justification": "Tables 2 and 3 report single point estimates for accuracy and throughput with no confidence intervals or error bars.",
    149           "source": "haiku"
    150         },
    151         "significance_tests": {
    152           "applies": true,
    153           "answer": false,
    154           "justification": "No statistical significance tests are used despite comparative accuracy claims across multiple methods on 400 test samples.",
    155           "source": "haiku"
    156         },
    157         "effect_sizes_reported": {
    158           "applies": true,
    159           "answer": true,
    160           "justification": "Effect sizes are effectively reported as absolute accuracy differences (e.g., GUIDANCE drops from 41.5% to 34.5%, DOMINO achieves 41.8%) and throughput multipliers (e.g., 2.71×).",
    161           "source": "haiku"
    162         },
    163         "sample_size_justified": {
    164           "applies": true,
    165           "answer": false,
    166           "justification": "The 400 test samples used for accuracy evaluation are not justified with a power analysis or rationale; 100 repetitions for throughput measurements are stated but not justified.",
    167           "source": "haiku"
    168         },
    169         "variance_reported": {
    170           "applies": true,
    171           "answer": false,
    172           "justification": "No variance, standard deviation, or spread is reported for any accuracy or throughput result; all results in Tables 2 and 3 are single values.",
    173           "source": "haiku"
    174         }
    175       },
    176       "evaluation_design": {
    177         "baselines_included": {
    178           "applies": true,
    179           "answer": true,
    180           "justification": "Baselines include unconstrained generation, GUIDANCE (template and CFG variants), and llama.cpp, evaluated on the same tasks.",
    181           "source": "haiku"
    182         },
    183         "baselines_contemporary": {
    184           "applies": true,
    185           "answer": true,
    186           "justification": "All baselines (GUIDANCE, llama.cpp, OUTLINES) are contemporaneous tools that represent the current state of the art in constrained decoding as of the paper's writing.",
    187           "source": "haiku"
    188         },
    189         "ablation_study": {
    190           "applies": true,
    191           "answer": true,
    192           "justification": "Table 4 ablates the lookahead parameter k (0, 1, ∞), and Figure 5 ablates the number of speculative tokens s (0–10), showing how each affects accuracy and throughput.",
    193           "source": "haiku"
    194         },
    195         "multiple_metrics": {
    196           "applies": true,
    197           "answer": true,
    198           "justification": "The evaluation uses accuracy, throughput (tokens/second), perplexity, and well-formedness rate across experiments.",
    199           "source": "haiku"
    200         },
    201         "human_evaluation": {
    202           "applies": false,
    203           "answer": false,
    204           "justification": "Human evaluation is not relevant for this systems/algorithms paper focused on constrained decoding accuracy and throughput.",
    205           "source": "haiku"
    206         },
    207         "held_out_test_set": {
    208           "applies": true,
    209           "answer": true,
    210           "justification": "400 test samples from the test splits of GSM8K and CoNLL2003 are used, with few-shot demonstrations taken from the separate training splits.",
    211           "source": "haiku"
    212         },
    213         "per_category_breakdown": {
    214           "applies": true,
    215           "answer": true,
    216           "justification": "Table 3 breaks down throughput results by grammar type (JSON no schema, JSON GSM8K schema, C programming, XML with schema, fixed template) and by model.",
    217           "source": "haiku"
    218         },
    219         "failure_cases_discussed": {
    220           "applies": true,
    221           "answer": false,
    222           "justification": "C code generation is briefly noted as the hardest grammar where speculative decoding fails, but this is not presented as a systematic failure case analysis; specific failure modes or examples are not shown.",
    223           "source": "haiku"
    224         },
    225         "negative_results_reported": {
    226           "applies": true,
    227           "answer": true,
    228           "justification": "Table 4 honestly reports that DOMINO with k=0 and k=1 performs dramatically worse than unconstrained generation (e.g., Llama-2 drops to 0.0%), and Figure 5 shows speculative decoding is ineffective for free-form JSON.",
    229           "source": "haiku"
    230         }
    231       },
    232       "setup_transparency": {
    233         "model_versions_specified": {
    234           "applies": true,
    235           "answer": false,
    236           "justification": "Mistral 7B and Llama-2 13B are named with citations, but no specific checkpoint hashes, HuggingFace model IDs, or download dates are provided.",
    237           "source": "haiku"
    238         },
    239         "prompts_provided": {
    240           "applies": true,
    241           "answer": true,
    242           "justification": "Appendix C provides complete grammar definitions and all generation prompts; Appendix D provides example structured reasoning outputs for GSM8K and CoNLL2003.",
    243           "source": "haiku"
    244         },
    245         "hyperparameters_reported": {
    246           "applies": true,
    247           "answer": true,
    248           "justification": "Temperature (1.0), number of speculative tokens (s ∈ {6, 8, 10}), number of warmup repetitions (10), and output length (128 tokens) are all reported for throughput experiments.",
    249           "source": "haiku"
    250         },
    251         "scaffolding_described": {
    252           "applies": false,
    253           "answer": false,
    254           "justification": "This paper does not involve agentic scaffolding; it is a systems paper about constrained decoding algorithms applied directly to LLM inference.",
    255           "source": "haiku"
    256         },
    257         "data_preprocessing_documented": {
    258           "applies": true,
    259           "answer": true,
    260           "justification": "The paper describes using 5-shot prompts from the training split, manually constructing JSON-format responses for few-shot demonstrations, and sampling 100 outputs per configuration with 5 prompts per workload.",
    261           "source": "haiku"
    262         }
    263       },
    264       "data_integrity": {
    265         "raw_data_available": {
    266           "applies": true,
    267           "answer": false,
    268           "justification": "Raw model outputs (generated tokens, timing measurements) are not released; only aggregated accuracy and throughput numbers in tables are reported.",
    269           "source": "haiku"
    270         },
    271         "data_collection_described": {
    272           "applies": true,
    273           "answer": true,
    274           "justification": "The paper describes using 400 test samples from the standard test splits of GSM8K and CoNLL2003 with the procedure for constructing prompts clearly specified.",
    275           "source": "haiku"
    276         },
    277         "recruitment_methods_described": {
    278           "applies": false,
    279           "answer": false,
    280           "justification": "No human participants are involved; standard benchmark datasets are used.",
    281           "source": "haiku"
    282         },
    283         "data_pipeline_documented": {
    284           "applies": true,
    285           "answer": false,
    286           "justification": "While the experimental setup is described, the full pipeline from benchmark loading to accuracy scoring (e.g., how JSON answers are extracted and compared to ground truth) is not documented.",
    287           "source": "haiku"
    288         }
    289       },
    290       "contamination": {
    291         "training_cutoff_stated": {
    292           "applies": true,
    293           "answer": false,
    294           "justification": "Neither the Mistral 7B nor Llama-2 13B training data cutoff dates are stated, despite the evaluation relying on benchmark accuracy.",
    295           "source": "haiku"
    296         },
    297         "train_test_overlap_discussed": {
    298           "applies": true,
    299           "answer": false,
    300           "justification": "GSM8K was publicly released in 2021, well before the training cutoffs of both evaluated models; no discussion of whether these benchmarks appear in training data is provided.",
    301           "source": "haiku"
    302         },
    303         "benchmark_contamination_addressed": {
    304           "applies": true,
    305           "answer": false,
    306           "justification": "Both GSM8K and CoNLL2003 were available before the training cutoffs of Mistral 7B and Llama-2 13B; no contamination analysis or acknowledgment is provided.",
    307           "source": "haiku"
    308         }
    309       },
    310       "human_studies": {
    311         "pre_registered": {
    312           "applies": false,
    313           "answer": false,
    314           "justification": "No human participants involved.",
    315           "source": "haiku"
    316         },
    317         "irb_or_ethics_approval": {
    318           "applies": false,
    319           "answer": false,
    320           "justification": "No human participants involved.",
    321           "source": "haiku"
    322         },
    323         "demographics_reported": {
    324           "applies": false,
    325           "answer": false,
    326           "justification": "No human participants involved.",
    327           "source": "haiku"
    328         },
    329         "inclusion_exclusion_criteria": {
    330           "applies": false,
    331           "answer": false,
    332           "justification": "No human participants involved.",
    333           "source": "haiku"
    334         },
    335         "randomization_described": {
    336           "applies": false,
    337           "answer": false,
    338           "justification": "No human participants involved.",
    339           "source": "haiku"
    340         },
    341         "blinding_described": {
    342           "applies": false,
    343           "answer": false,
    344           "justification": "No human participants involved.",
    345           "source": "haiku"
    346         },
    347         "attrition_reported": {
    348           "applies": false,
    349           "answer": false,
    350           "justification": "No human participants involved.",
    351           "source": "haiku"
    352         }
    353       },
    354       "cost_and_practicality": {
    355         "inference_cost_reported": {
    356           "applies": true,
    357           "answer": true,
    358           "justification": "Throughput in tokens/second is reported for all configurations in Table 3 and Figure 5; precomputation times are reported (1-5s for most grammars, ~20s for C).",
    359           "source": "haiku"
    360         },
    361         "compute_budget_stated": {
    362           "applies": true,
    363           "answer": false,
    364           "justification": "GPU types are mentioned (A100/H100) but total GPU-hours or compute budget for the evaluation is not stated.",
    365           "source": "haiku"
    366         }
    367       }
    368     }
    369   },
    370   "claims": [
    371     {
    372       "claim": "Naive constrained decoding (e.g., GUIDANCE templates) can reduce task accuracy by up to 11 percentage points compared to unconstrained generation due to token misalignment.",
    373       "evidence": "Table 2 shows GUIDANCE reducing GSM8K accuracy from 41.5% to 34.5% for Mistral 7B and from 26.2% to 15.2% for Llama-2 13B.",
    374       "supported": "strong"
    375     },
    376     {
    377       "claim": "DOMINO achieves minimal invasiveness, maintaining or slightly improving task accuracy relative to unconstrained generation.",
    378       "evidence": "Table 2 shows DOMINO matching unconstrained accuracy exactly (e.g., 0.21 vs 0.21 on CoNLL2003) or slightly improving it (41.8% vs 41.5% on GSM8K Mistral 7B).",
    379       "supported": "strong"
    380     },
    381     {
    382       "claim": "DOMINO achieves up to 2.71× throughput improvement over unconstrained generation through speculative decoding.",
    383       "evidence": "Table 3 shows DOMINO with speculative decoding achieving 2.71× on CoNLL2003 with Llama-2 13B and 2.66× with Mistral 7B.",
    384       "supported": "strong"
    385     },
    386     {
    387       "claim": "Insufficient lookahead (k=0 or k=1) dramatically degrades DOMINO accuracy below unconstrained levels.",
    388       "evidence": "Table 4 shows DOMINO (k=0) drops GSM8K accuracy to 30.8% for Mistral and 0.0% for Llama-2, far below unconstrained baselines of 41.5% and 26.2%.",
    389       "supported": "strong"
    390     },
    391     {
    392       "claim": "Pre-computed vocabulary-aligned subterminal trees enable constrained decoding with lower overhead than fully online parsing approaches.",
    393       "evidence": "Table 3 shows DOMINO consistently outperforming llama.cpp's online parsing approach on throughput across all tested grammar types.",
    394       "supported": "strong"
    395     },
    396     {
    397       "claim": "Template-based constrained decoding (GUIDANCE) causes perplexity explosion and produces output that the model would be highly unlikely to generate unconstrained.",
    398       "evidence": "Figure 2 shows template-based output has perplexity 24.50–26.75 versus 4.17 for unconstrained; naturalized template output reaches 49.39.",
    399       "supported": "moderate"
    400     }
    401   ],
    402   "methodology_tags": [
    403     "benchmark-eval",
    404     "theoretical"
    405   ],
    406   "key_findings": "Token misalignment in constrained decoding—where externally imposed token sequences diverge from a model's natural generation distribution—causes accuracy drops of up to 11 percentage points compared to unconstrained generation, a problem exhibited by GUIDANCE templates and partially by llama.cpp. DOMINO addresses this via pre-computed vocabulary-aligned subterminal trees and speculative decoding, achieving formally minimal invasiveness (all unconstrained-valid outputs remain reachable) while simultaneously improving throughput by up to 2.71× over unconstrained generation for structured grammars. The lookahead parameter k is critical: k=0 or k=1 can catastrophically degrade accuracy (to 0% on some configurations), while k=∞ recovers and slightly exceeds unconstrained accuracy.",
    407   "red_flags": [
    408     {
    409       "flag": "No error bars on any result",
    410       "detail": "Tables 2 and 3 report single point estimates for accuracy and throughput across all conditions with no confidence intervals, standard deviations, or significance tests, making it impossible to assess whether observed differences are reliable."
    411     },
    412     {
    413       "flag": "Precomputation excluded from throughput",
    414       "detail": "Section 4.3 explicitly states 'We do not include DOMINO's precomputation time as part of the reported throughputs.' Excluding this cost favors DOMINO's reported numbers and would overstate its throughput in online or dynamic grammar settings, where precomputation cannot be amortized ahead of time."
    415     },
    416     {
    417       "flag": "No code released",
    418       "detail": "The algorithm is described in detail but no implementation is released, making independent reproduction dependent on re-implementing complex data structures (subterminal trees, speculative decoder) from scratch."
    419     },
    420     {
    421       "flag": "Two models, two tasks for accuracy claims",
    422       "detail": "Accuracy claims are based on only two models (Mistral 7B, Llama-2 13B) and two tasks (GSM8K, CoNLL2003), which limits generalizability to other model families and task types."
    423     },
    424     {
    425       "flag": "Benchmark contamination unaddressed",
    426       "detail": "Both GSM8K and CoNLL2003 predate the training cutoffs of Mistral 7B and Llama-2 13B; no discussion of whether contamination affects the absolute accuracy baseline values."
    427     },
    428     {
    429       "flag": "No limitations section",
    430       "detail": "The paper's 'Impact Statement' dismisses societal concerns in one sentence and the paper contains no discussion of algorithm limitations, failure modes, or scope boundaries."
    431     }
    432   ],
    433   "cited_papers": [
    434     {
    435       "title": "Efficient Guided Generation for Large Language Models (OUTLINES)",
    436       "relevance": "Pre-computation approach for regex-based constrained decoding that DOMINO builds upon for offline vocabulary-state mapping"
    437     },
    438     {
    439       "title": "Prompting is Programming: A Query Language for Large Language Models (LMQL)",
    440       "relevance": "Baseline constrained decoding system from the same authors; DOMINO is positioned as a more efficient and minimally invasive successor"
    441     },
    442     {
    443       "title": "Synchromesh: Reliable Code Generation from Pre-trained Language Models",
    444       "relevance": "Prior work demonstrating that bridge tokens are necessary for minimally invasive constraining in code generation"
    445     },
    446     {
    447       "title": "Grammar-Constrained Decoding for Structured NLP Tasks Without Finetuning (GCD)",
    448       "relevance": "Baseline online parser-guided approach that DOMINO outperforms on throughput while matching on invasiveness"
    449     },
    450     {
    451       "title": "PICARD: Parsing Incrementally for Constrained Auto-Regressive Decoding from Language Models",
    452       "relevance": "Baseline incremental parser-guided constrained decoding system for SQL generation"
    453     },
    454     {
    455       "title": "Accelerating Large Language Model Decoding with Speculative Sampling",
    456       "relevance": "Foundation for DOMINO's speculative decoding extension; DOMINO adapts speculative sampling to grammar-state-conditioned next-token prediction"
    457     },
    458     {
    459       "title": "Training Verifiers to Solve Math Word Problems (GSM8K)",
    460       "relevance": "Primary benchmark dataset used to evaluate task accuracy of constrained decoding methods"
    461     },
    462     {
    463       "title": "Mistral 7B",
    464       "relevance": "Primary model used in all experiments; the paper's main accuracy and throughput results are based on this model"
    465     }
    466   ],
    467   "engagement_factors": {
    468     "practical_relevance": {
    469       "score": 3,
    470       "justification": "Constrained JSON/structured output generation is a ubiquitous need in LLM applications; showing existing tools hurt accuracy while offering a faster alternative has immediate practitioner value."
    471     },
    472     "surprise_contrarian": {
    473       "score": 2,
    474       "justification": "The finding that constrained decoding—designed to help—can actually hurt task accuracy by up to 11pp, and that a constrained decoder can be faster than unconstrained generation, challenges common assumptions."
    475     },
    476     "fear_safety": {
    477       "score": 0,
    478       "justification": "The paper raises no AI safety or risk concerns; it is a systems efficiency paper."
    479     },
    480     "drama_conflict": {
    481       "score": 1,
    482       "justification": "Implicitly critiques widely-used tools (GUIDANCE, llama.cpp) by showing they degrade accuracy, but the tone is measured and not confrontational."
    483     },
    484     "demo_ability": {
    485       "score": 2,
    486       "justification": "The constrained JSON generation speedup is a concrete, demonstrable capability, though no code is released to enable immediate reproduction."
    487     },
    488     "brand_recognition": {
    489       "score": 1,
    490       "justification": "ETH Zurich has credibility in formal methods and programming languages, but is not a prominent lab in the LLM ecosystem; authors are not widely recognized public figures."
    491     }
    492   },
    493   "hn_data": {
    494     "threads": [
    495       {
    496         "hn_id": "22749961",
    497         "title": "HouseGAN: Generate Realistic Floor Plan Layouts from Relational Graphs",
    498         "points": 2,
    499         "comments": 1,
    500         "url": "https://news.ycombinator.com/item?id=22749961",
    501         "created_at": "2020-04-01T15:51:08Z"
    502       },
    503       {
    504         "hn_id": "46586960",
    505         "title": "Show HN: Two-line change, 30% RAG boost",
    506         "points": 1,
    507         "comments": 0,
    508         "url": "https://news.ycombinator.com/item?id=46586960",
    509         "created_at": "2026-01-12T11:20:16Z"
    510       },
    511       {
    512         "hn_id": "40775759",
    513         "title": "Assessing the Emergent Symbolic Reasoning Abilities of LLMs",
    514         "points": 1,
    515         "comments": 0,
    516         "url": "https://news.ycombinator.com/item?id=40775759",
    517         "created_at": "2024-06-24T13:24:57Z"
    518       },
    519       {
    520         "hn_id": "26385237",
    521         "title": "Seer: Self-Supervised Pretraining of Visual Features in the Wild",
    522         "points": 1,
    523         "comments": 0,
    524         "url": "https://news.ycombinator.com/item?id=26385237",
    525         "created_at": "2021-03-08T13:03:02Z"
    526       }
    527     ],
    528     "top_points": 2,
    529     "total_points": 5,
    530     "total_comments": 1
    531   }
    532 }
	ai-research-survey Systematic scan of agentic development research. What's signal, what's noise.
	git clone https://git.shiptheloop.com/ai-research-survey.git
	Log \| Files \| Refs